Source code for langchain_community.document_loaders.apify_dataset
from typing import Any, Callable, Dict, List
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
from langchain_community.document_loaders.base import BaseLoader
class ApifyDatasetLoader(BaseLoader, BaseModel):
    """Load datasets from the `Apify` web scraping, crawling, and data extraction platform.

    For details, see https://docs.apify.com/platform/integrations/langchain

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import ApifyDatasetLoader
            from langchain_core.documents import Document

            loader = ApifyDatasetLoader(
                dataset_id="YOUR-DATASET-ID",
                dataset_mapping_function=lambda dataset_item: Document(
                    page_content=dataset_item["text"], metadata={"source": dataset_item["url"]}
                ),
            )
            documents = loader.load()
    """  # noqa: E501

    apify_client: Any
    """An instance of the ApifyClient class from the apify-client Python package."""
    dataset_id: str
    """The ID of the dataset on the Apify platform."""
    dataset_mapping_function: Callable[[Dict], Document]
    """A custom function that takes a single dictionary (an Apify dataset item)
    and converts it to an instance of the Document class."""

    def __init__(
        self, dataset_id: str, dataset_mapping_function: Callable[[Dict], Document]
    ):
        """Initialize the loader with an Apify dataset ID and a mapping function.

        Args:
            dataset_id: The ID of the dataset on the Apify platform.
            dataset_mapping_function: A function that takes a single dictionary
                (an Apify dataset item) and converts it to an instance of the
                Document class.
        """
        super().__init__(
            dataset_id=dataset_id, dataset_mapping_function=dataset_mapping_function
        )

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate the environment and attach an Apify client to the values.

        Args:
            values: The field values to validate.

        Returns:
            The same ``values`` dict with ``"apify_client"`` set to a newly
            created ``ApifyClient`` instance.

        Raises:
            ImportError: If the ``apify-client`` package cannot be imported.
        """
        # Keep the try body limited to the import so that only a genuinely
        # missing package is reported with the installation hint.
        try:
            from apify_client import ApifyClient
        except ImportError as err:
            raise ImportError(
                "Could not import apify-client Python package. "
                "Please install it with `pip install apify-client`."
            ) from err

        client = ApifyClient()
        # Tagging the user agent is best-effort: supply a default so that an
        # HTTP client without an `httpx_client` attribute is skipped instead
        # of raising AttributeError.
        if httpx_client := getattr(client.http_client, "httpx_client", None):
            httpx_client.headers["user-agent"] += "; Origin/langchain"
        values["apify_client"] = client
        return values

    def load(self) -> List[Document]:
        """Load documents.

        Lists the items of the dataset identified by ``dataset_id`` (requested
        with ``clean=True``) and applies ``dataset_mapping_function`` to each.

        Returns:
            One mapped result per dataset item, in the order returned by the
            client.
        """
        dataset_items = (
            self.apify_client.dataset(self.dataset_id).list_items(clean=True).items
        )
        return [self.dataset_mapping_function(item) for item in dataset_items]