Source code for langchain_community.document_loaders.hugging_face_dataset

import json
from typing import Iterator, Mapping, Optional, Sequence, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class HuggingFaceDatasetLoader(BaseLoader):
    """Load a dataset from the `Hugging Face Hub` as documents."""

    def __init__(
        self,
        path: str,
        page_content_column: str = "text",
        name: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[
            Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
        ] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: Optional[bool] = None,
        save_infos: bool = False,
        use_auth_token: Optional[Union[bool, str]] = None,
        num_proc: Optional[int] = None,
    ):
        """Initialize the HuggingFaceDatasetLoader.

        The arguments are only stored here; the dataset itself is not
        loaded until ``lazy_load`` is iterated.

        Args:
            path: Path or name of the dataset.
            page_content_column: Name of the column used as page content.
                Defaults to "text".
            name: Name of the dataset configuration.
            data_dir: Data directory of the dataset configuration.
            data_files: Path(s) to the source data file(s).
            cache_dir: Directory to read/write data.
            keep_in_memory: Whether to keep the dataset in memory.
            save_infos: Whether to save the dataset information
                (checksums/size/splits/...). Defaults to False.
            use_auth_token: Bearer token for remote files on the
                Dataset Hub.
            num_proc: Number of processes.
        """
        self.path = path
        self.page_content_column = page_content_column
        self.name = name
        self.data_dir = data_dir
        self.data_files = data_files
        self.cache_dir = cache_dir
        self.keep_in_memory = keep_in_memory
        self.save_infos = save_infos
        self.use_auth_token = use_auth_token
        self.num_proc = num_proc

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents, yielding one ``Document`` per dataset row.

        Every key of the loaded dataset is visited in turn. For each row the
        ``page_content_column`` value becomes the page content (converted by
        ``parse_obj``) and the remaining columns become the metadata.

        Raises:
            ImportError: If the ``datasets`` package is not installed.
            KeyError: If a row has no ``page_content_column`` entry.
        """
        # Imported lazily so the module can be imported without `datasets`.
        try:
            from datasets import load_dataset
        except ImportError as err:
            raise ImportError(
                "Could not import datasets python package. "
                "Please install it with `pip install datasets`."
            ) from err

        dataset = load_dataset(
            path=self.path,
            name=self.name,
            data_dir=self.data_dir,
            data_files=self.data_files,
            cache_dir=self.cache_dir,
            keep_in_memory=self.keep_in_memory,
            save_infos=self.save_infos,
            use_auth_token=self.use_auth_token,
            num_proc=self.num_proc,
        )

        for key in dataset.keys():
            for row in dataset[key]:
                # Pop the content column so it is not repeated in the metadata.
                content = self.parse_obj(row.pop(self.page_content_column))
                yield Document(page_content=content, metadata=row)

    def parse_obj(self, page_content: Union[str, object]) -> str:
        """Convert a raw column value into page-content text.

        Args:
            page_content: The value read from the page content column.

        Returns:
            The value itself when it is already a string; otherwise its
            JSON serialization.

        Raises:
            TypeError: If a non-string value is not JSON serializable.
        """
        # Every Python value is an instance of `object`, so the check must be
        # against `str`; otherwise plain strings would be JSON-quoted as well.
        if isinstance(page_content, str):
            return page_content
        return json.dumps(page_content)