"""Source code for ``langchain_community.document_loaders.tensorflow_datasets``."""

from typing import Callable, Dict, Iterator, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.utilities.tensorflow_datasets import TensorflowDatasets


class TensorflowDatasetLoader(BaseLoader):
    """Load from a `TensorFlow Dataset`.

    Attributes:
        dataset_name: the name of the dataset to load
        split_name: the name of the split to load.
        load_max_docs: a limit to the number of loaded documents. Defaults to 100.
        sample_to_document_function: a function that converts a dataset sample
            into a Document

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import TensorflowDatasetLoader

            def mlqaen_example_to_document(example: dict) -> Document:
                return Document(
                    page_content=decode_to_str(example["context"]),
                    metadata={
                        "id": decode_to_str(example["id"]),
                        "title": decode_to_str(example["title"]),
                        "question": decode_to_str(example["question"]),
                        "answer": decode_to_str(example["answers"]["text"][0]),
                    },
                )

            tsds_client = TensorflowDatasetLoader(
                dataset_name="mlqa/en",
                split_name="test",
                load_max_docs=100,
                sample_to_document_function=mlqaen_example_to_document,
            )
    """

    def __init__(
        self,
        dataset_name: str,
        split_name: str,
        load_max_docs: Optional[int] = 100,
        sample_to_document_function: Optional[Callable[[Dict], Document]] = None,
    ):
        """Initialize the TensorflowDatasetLoader.

        Args:
            dataset_name: the name of the dataset to load
            split_name: the name of the split to load.
            load_max_docs: a limit to the number of loaded documents.
                Defaults to 100.
            sample_to_document_function: a function that converts a dataset
                sample into a Document.
        """
        self.dataset_name: str = dataset_name
        self.split_name: str = split_name
        # Maximum number of documents to load.
        self.load_max_docs = load_max_docs
        # Custom function that transforms a dataset sample into a Document.
        self.sample_to_document_function: Optional[
            Callable[[Dict], Document]
        ] = sample_to_document_function
        # Delegate actual dataset access to the utilities wrapper; it holds
        # the TFDS handle and performs the sample-to-Document conversion.
        self._tfds_client = TensorflowDatasets(  # type: ignore[call-arg]
            dataset_name=self.dataset_name,
            split_name=self.split_name,
            load_max_docs=self.load_max_docs,  # type: ignore[arg-type]
            sample_to_document_function=self.sample_to_document_function,
        )

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield Documents produced by the underlying TFDS client."""
        yield from self._tfds_client.lazy_load()