Mongodb

SimpleMongoReader #

基类: BaseReader

简易Mongo读取器。

将每个Mongo文档连接成LlamaIndex使用的Document。

参数:

名称	类型	描述	默认值
`host`	`str`	Mongo主机。	`None`
`port`	`int`	Mongo端口。	`None`
`uri`	`str`	Mongo连接URI；提供时优先于 `host` 和 `port`。	`None`

Source code in llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py

class SimpleMongoReader(BaseReader):
    """
    Simple mongo reader.

    Concatenates the selected fields of each Mongo document into a Document
    used by LlamaIndex.

    Args:
        host (Optional[str]): Mongo host. Used together with `port` when
            `uri` is not given.
        port (Optional[int]): Mongo port. Used together with `host` when
            `uri` is not given.
        uri (Optional[str]): Mongo connection URI. Takes precedence over
            `host` and `port` when provided.

    Raises:
        ImportError: If the `pymongo` package is not installed.
        ValueError: If neither `uri` nor both `host` and `port` are provided.

    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        uri: Optional[str] = None,
    ) -> None:
        """Initialize with parameters."""
        # Imported lazily so that pymongo is only required when this reader
        # is actually instantiated.
        try:
            from pymongo import MongoClient
        except ImportError as err:
            raise ImportError(
                "`pymongo` package not found, please run `pip install pymongo`"
            ) from err

        client: MongoClient
        if uri:
            # A URI wins over host/port when both are supplied.
            client = MongoClient(uri)
        elif host and port:
            client = MongoClient(host, port)
        else:
            raise ValueError("Either `host` and `port` or `uri` must be provided.")

        self.client = client

    def lazy_load_data(
        self,
        db_name: str,
        collection_name: str,
        # NOTE: the list default is only read, never mutated, so sharing it
        # across calls is safe; it is kept as-is to preserve the signature.
        field_names: List[str] = ["text"],
        separator: str = "",
        query_dict: Optional[Dict] = None,
        max_docs: int = 0,
        metadata_names: Optional[List[str]] = None,
        field_extractors: Optional[Dict[str, Callable[..., str]]] = None,
    ) -> Iterable[Document]:
        """
        Lazily load documents from a Mongo collection.

        Args:
            db_name (str): name of the database.
            collection_name (str): name of the collection.
            field_names (List[str]): names of the fields to be concatenated.
                Defaults to ["text"]
            separator (str): separator to be used between fields.
                Defaults to ""
            query_dict (Optional[Dict]): query to filter documents. Read more
                at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
                Defaults to None
            max_docs (int): maximum number of documents to load.
                Defaults to 0 (no limit)
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document. Defaults to None
            field_extractors (Optional[Dict[str, Callable[..., str]]]): dictionary
                containing field name and a function to extract text from the field.
                The default extractor function is `str`. Defaults to None.

        Yields:
            Document: one Document per Mongo document returned by the query,
                with the Mongo `_id` (as a string) used as the Document id.

        Raises:
            ValueError: If a name listed in `field_names` is missing from a
                returned Mongo document.

        """
        db = self.client[db_name]
        # Only the fields that are actually used (text + metadata) are
        # requested from the server.
        cursor = db[collection_name].find(
            filter=query_dict or {},
            limit=max_docs,
            projection=dict.fromkeys(field_names + (metadata_names or []), 1),
        )

        field_extractors = field_extractors or {}

        for item in cursor:
            try:
                # Fields without a custom extractor are converted with `str`.
                texts = [
                    field_extractors.get(name, str)(item[name]) for name in field_names
                ]
            except KeyError as err:
                raise ValueError(
                    f"{err.args[0]} field not found in Mongo document."
                ) from err

            text = separator.join(texts)

            if metadata_names is None:
                yield Document(text=text, id_=str(item["_id"]))
            else:
                # `.get` yields None for absent metadata fields, so no
                # KeyError can occur here and missing metadata is tolerated.
                metadata = {name: item.get(name) for name in metadata_names}
                metadata["collection"] = collection_name
                yield Document(text=text, id_=str(item["_id"]), metadata=metadata)

lazy_load_data #

lazy_load_data(db_name: str, collection_name: str, field_names: List[str] = ['text'], separator: str = '', query_dict: Optional[Dict] = None, max_docs: int = 0, metadata_names: Optional[List[str]] = None, field_extractors: Optional[Dict[str, Callable[..., str]]] = None) -> Iterable[Document]

从MongoDB集合中惰性加载数据。

参数:

名称	类型	描述	默认值
`db_name`	`str`	数据库名称。	required
`collection_name`	`str`	集合的名称。	required
`field_names`	`List[str]`	需要连接的字段名称。默认为 ["text"]	`['text']`
`separator`	`str`	字段之间使用的分隔符。默认为""	`''`
`query_dict`	`Optional[Dict]`	用于筛选文档的查询。详见[官方文档](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query)。默认为None	`None`
`max_docs`	`int`	最大加载文档数量。默认为0（无限制）	`0`
`metadata_names`	`Optional[List[str]]`	要添加到文档元数据属性中的字段名称。默认为None	`None`
`field_extractors`	`Optional[Dict[str, Callable[..., str]]]`	字典包含字段名称和从该字段提取文本的函数。默认的提取函数是 `str`。默认为 None。	`None`

返回：

类型	描述
`Iterable[Document]`	逐个生成的文档（可迭代对象），查询返回的每个Mongo文档对应一个Document。

Source code in llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py

def lazy_load_data(
    self,
    db_name: str,
    collection_name: str,
    # NOTE: the list default is only read, never mutated, so sharing it
    # across calls is safe; it is kept as-is to preserve the signature.
    field_names: List[str] = ["text"],
    separator: str = "",
    query_dict: Optional[Dict] = None,
    max_docs: int = 0,
    metadata_names: Optional[List[str]] = None,
    field_extractors: Optional[Dict[str, Callable[..., str]]] = None,
) -> Iterable[Document]:
    """
    Lazily load documents from a Mongo collection.

    Args:
        db_name (str): name of the database.
        collection_name (str): name of the collection.
        field_names (List[str]): names of the fields to be concatenated.
            Defaults to ["text"]
        separator (str): separator to be used between fields.
            Defaults to ""
        query_dict (Optional[Dict]): query to filter documents. Read more
            at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
            Defaults to None
        max_docs (int): maximum number of documents to load.
            Defaults to 0 (no limit)
        metadata_names (Optional[List[str]]): names of the fields to be added
            to the metadata attribute of the Document. Defaults to None
        field_extractors (Optional[Dict[str, Callable[..., str]]]): dictionary
            containing field name and a function to extract text from the field.
            The default extractor function is `str`. Defaults to None.

    Yields:
        Document: one Document per Mongo document returned by the query,
            with the Mongo `_id` (as a string) used as the Document id.

    Raises:
        ValueError: If a name listed in `field_names` is missing from a
            returned Mongo document.

    """
    db = self.client[db_name]
    # Only the fields that are actually used (text + metadata) are requested
    # from the server.
    cursor = db[collection_name].find(
        filter=query_dict or {},
        limit=max_docs,
        projection=dict.fromkeys(field_names + (metadata_names or []), 1),
    )

    field_extractors = field_extractors or {}

    for item in cursor:
        try:
            # Fields without a custom extractor are converted with `str`.
            texts = [
                field_extractors.get(name, str)(item[name]) for name in field_names
            ]
        except KeyError as err:
            raise ValueError(
                f"{err.args[0]} field not found in Mongo document."
            ) from err

        text = separator.join(texts)

        if metadata_names is None:
            yield Document(text=text, id_=str(item["_id"]))
        else:
            # `.get` yields None for absent metadata fields, so no KeyError
            # can occur here and missing metadata is tolerated.
            metadata = {name: item.get(name) for name in metadata_names}
            metadata["collection"] = collection_name
            yield Document(text=text, id_=str(item["_id"]), metadata=metadata)