Source code for langchain_community.document_loaders.apify_dataset

from typing import Any, Callable, Dict, List

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator

from langchain_community.document_loaders.base import BaseLoader


class ApifyDatasetLoader(BaseLoader, BaseModel):
    """Load a dataset from the `Apify` platform.

    `Apify` is a web scraping, crawling, and data extraction platform.
    For details, see https://docs.apify.com/platform/integrations/langchain

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import ApifyDatasetLoader
            from langchain_core.documents import Document

            loader = ApifyDatasetLoader(
                dataset_id="YOUR-DATASET-ID",
                dataset_mapping_function=lambda dataset_item: Document(
                    page_content=dataset_item["text"],
                    metadata={"source": dataset_item["url"]},
                ),
            )
            documents = loader.load()
    """

    apify_client: Any
    """An instance of the ApifyClient class from the apify-client Python package."""
    dataset_id: str
    """The ID of the dataset on the Apify platform."""
    dataset_mapping_function: Callable[[Dict], Document]
    """A custom function that takes a single dictionary (an Apify dataset item)
    and converts it to an instance of the Document class."""

    def __init__(
        self, dataset_id: str, dataset_mapping_function: Callable[[Dict], Document]
    ):
        """Initialize the loader with an Apify dataset ID and a mapping function.

        Args:
            dataset_id: The ID of the dataset on the Apify platform.
            dataset_mapping_function: A function that takes a single dictionary
                (an Apify dataset item) and converts it to an instance of the
                Document class.
        """
        super().__init__(
            dataset_id=dataset_id, dataset_mapping_function=dataset_mapping_function
        )

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the environment and attach an Apify client to the values.

        Imports ``apify_client``, creates an ``ApifyClient`` with no arguments,
        and stores it under ``values["apify_client"]``.

        Args:
            values: The field values to validate.

        Returns:
            The same ``values`` dictionary with ``"apify_client"`` set.

        Raises:
            ImportError: If the apify-client Python package is not installed.
        """
        # Keep the ``try`` limited to the import so that only a genuinely
        # missing package is reported as an installation problem.
        try:
            from apify_client import ApifyClient
        except ImportError as err:
            raise ImportError(
                "Could not import apify-client Python package. "
                "Please install it with `pip install apify-client`."
            ) from err

        client = ApifyClient()
        # Tagging the user agent is best-effort: default to ``None`` so that an
        # HTTP client without an ``httpx_client`` attribute is skipped instead
        # of raising ``AttributeError``.
        if httpx_client := getattr(client.http_client, "httpx_client", None):
            httpx_client.headers["user-agent"] += "; Origin/langchain"

        values["apify_client"] = client
        return values

    def load(self) -> List[Document]:
        """Load documents.

        Fetches the items of the configured dataset (requested with
        ``clean=True``) and passes each one through
        ``dataset_mapping_function``.

        Returns:
            One mapped result per dataset item, in the order the items are
            returned by the client.
        """
        dataset_items = (
            self.apify_client.dataset(self.dataset_id).list_items(clean=True).items
        )
        return [self.dataset_mapping_function(item) for item in dataset_items]