Lilac

LilacReader #

Bases: BaseReader

Lilac dataset reader.
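
A minimal usage sketch, assuming `llama-index` and `lilac` are installed and a Lilac project contains a dataset with `text` and `doc_id` fields (the dataset name `local/example` and the project path below are hypothetical):

from llama_index.readers.lilac import LilacReader

reader = LilacReader()
# "local/example" is a hypothetical {namespace}/{dataset_name}.
documents = reader.load_data(
    dataset="local/example",
    project_dir="~/lilac_project",  # or set the LILAC_PROJECT_DIR env var
)
print(len(documents), documents[0].doc_id)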

Source code in llama_index/readers/lilac/base.py
class LilacReader(BaseReader):
    """紫丁香数据集读取器。"""

    def load_data(
        self,
        dataset: str,
        text_path: "Path" = "text",
        doc_id_path: Optional["Path"] = "doc_id",
        columns: Optional[List["ColumnId"]] = None,
        filters: Optional[List["FilterLike"]] = None,
        project_dir: Optional[str] = None,
    ) -> List[Document]:
        """从相关帖子和顶层评论中加载文本,给定搜索关键词。

Args:
    project_dir (Optional[str]): 要读取的Lilac项目目录。如果未定义,则使用`LILAC_PROJECT_DIR`环境变量。
    text_path: 数据集中文本字段的路径。如果未定义,则使用'text'。
    columns (Optional[List[ColumnId]]): 从数据集中加载的列。如果未定义,则加载所有列。
    dataset (str): 要加载的数据集。应格式化为{namespace}/{dataset_name}。
    filters (Optional[Filter]): 加载到文档中之前应用于数据集的过滤器。用于过滤标记数据非常有用。
"""
        try:
            import lilac as ll
        except ImportError:
            raise ImportError(
                "`lilac` package not found, please run `pip install lilac`"
            )

        namespace, dataset_name = dataset.split("/")
        lilac_dataset = ll.get_dataset(namespace, dataset_name, project_dir=project_dir)

        # Check to make sure text path, and doc_id path are valid.
        manifest = lilac_dataset.manifest()

        text_path = ll.normalize_path(text_path)
        text_field = manifest.data_schema.get_field(text_path)
        if not text_field:
            raise ValueError(
                f"Could not find text field {text_path} in dataset {dataset}"
            )

        doc_id_path = ll.normalize_path(doc_id_path)
        doc_id_field = manifest.data_schema.get_field(doc_id_path)
        if not doc_id_field:
            raise ValueError(
                f"Could not find doc_id field {doc_id_path} in dataset {dataset}"
            )

        rows = lilac_dataset.select_rows(
            columns=([*columns, text_path, doc_id_path]) if columns else ["*"],
            filters=filters,
            combine_columns=True,
        )

        def _item_from_path(item: ll.Item, path: ll.PathTuple) -> ll.Item:
            if len(path) == 1:
                item = item[path[0]]
                if isinstance(item, dict):
                    return item[ll.VALUE_KEY]
                else:
                    return item
            else:
                return _item_from_path(item[path[0]], path[1:])

        def _remove_item_path(item: ll.Item, path: ll.PathTuple) -> None:
            if len(path) == 0:
                return
            if len(path) == 1:
                if item and path[0] in item:
                    leaf_item = item[path[0]]
                    if isinstance(leaf_item, dict):
                        del item[path[0]][ll.VALUE_KEY]
                    else:
                        del item[path[0]]
                return
            else:
                _remove_item_path(item[path[0]], path[1:])

        documents: List[Document] = []
        for row in rows:
            text = _item_from_path(row, text_path)
            doc_id = _item_from_path(row, doc_id_path)
            _remove_item_path(row, text_path)
            _remove_item_path(row, doc_id_path)
            documents.append(Document(text=text, doc_id=doc_id, extra_info=row or {}))

        return documents
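
The two helpers above walk nested rows: lilac can wrap enriched leaf values in a dict keyed by `ll.VALUE_KEY`, and `_item_from_path` unwraps that while `_remove_item_path` deletes the consumed value so the text and doc id are not duplicated in `extra_info`. A plain-dict sketch of the same traversal (the `"__value__"` key is an assumption standing in for `ll.VALUE_KEY`):

VALUE_KEY = "__value__"  # stand-in for ll.VALUE_KEY

row = {"meta": {"doc_id": "abc-123"}, "text": {VALUE_KEY: "hello world"}}

def item_from_path(item, path):
    # Mirrors _item_from_path: recurse to the leaf, unwrap VALUE_KEY dicts.
    if len(path) == 1:
        leaf = item[path[0]]
        return leaf[VALUE_KEY] if isinstance(leaf, dict) else leaf
    return item_from_path(item[path[0]], path[1:])

assert item_from_path(row, ("meta", "doc_id")) == "abc-123"
assert item_from_path(row, ("text",)) == "hello world"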

load_data #

load_data(
    dataset: str,
    text_path: Path = "text",
    doc_id_path: Optional[Path] = "doc_id",
    columns: Optional[List[ColumnId]] = None,
    filters: Optional[List[FilterLike]] = None,
    project_dir: Optional[str] = None,
) -> List[Document]

Load text and document ids from a Lilac dataset.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `dataset` | `str` | The dataset to load. Should be formatted as `{namespace}/{dataset_name}`. | *required* |
| `text_path` | `Path` | The path to the text field in the dataset. If not defined, uses `'text'`. | `'text'` |
| `doc_id_path` | `Optional[Path]` | The path to the document id field in the dataset. If not defined, uses `'doc_id'`. | `'doc_id'` |
| `columns` | `Optional[List[ColumnId]]` | The columns to load from the dataset. If not defined, loads all columns. | `None` |
| `filters` | `Optional[List[FilterLike]]` | Filters to apply to the dataset before loading into documents. Useful for filtering on labeled data. | `None` |
| `project_dir` | `Optional[str]` | The Lilac project directory to read from. If not defined, uses the `LILAC_PROJECT_DIR` environment variable. | `None` |
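
An example narrowing the load with `columns` and `filters` (the tuple filter form is an assumption about lilac's `FilterLike`, and the dataset and field names are hypothetical; check the lilac docs for the exact filter syntax your version accepts):

from llama_index.readers.lilac import LilacReader

reader = LilacReader()
documents = reader.load_data(
    dataset="local/example",  # hypothetical {namespace}/{dataset_name}
    text_path="text",
    doc_id_path="doc_id",
    columns=["text", "doc_id", "label"],  # load only these columns
    # Keep only rows where the hypothetical "label" field is populated.
    filters=[("label", "exists")],
)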