"""Source code for ``langchain_community.document_loaders.hugging_face_dataset``."""
import json
from typing import Iterator, Mapping, Optional, Sequence, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
class HuggingFaceDatasetLoader(BaseLoader):
    """Load documents from a `Hugging Face Hub` dataset.

    Each row of every split becomes one ``Document``: the configured content
    column supplies ``page_content`` and the remaining columns become the
    document metadata.
    """

    def __init__(
        self,
        path: str,
        page_content_column: str = "text",
        name: Optional[str] = None,
        data_dir: Optional[str] = None,
        data_files: Optional[
            Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]]
        ] = None,
        cache_dir: Optional[str] = None,
        keep_in_memory: Optional[bool] = None,
        save_infos: bool = False,
        use_auth_token: Optional[Union[bool, str]] = None,
        num_proc: Optional[int] = None,
    ):
        """Initialize the HuggingFaceDatasetLoader.

        Args:
            path: Path or name of the dataset.
            page_content_column: Name of the column used as page content.
                Defaults to "text".
            name: Name of the dataset configuration.
            data_dir: Data directory of the dataset configuration.
            data_files: Path(s) to the source data file(s).
            cache_dir: Directory in which to read/write cached data.
            keep_in_memory: Whether to keep the dataset in memory.
            save_infos: Whether to save dataset information
                (checksums/size/splits/...). Defaults to False.
            use_auth_token: Bearer token for remote files on the Dataset Hub.
            num_proc: Number of processes to use when loading.
        """
        self.path = path
        self.page_content_column = page_content_column
        self.name = name
        self.data_dir = data_dir
        self.data_files = data_files
        self.cache_dir = cache_dir
        self.keep_in_memory = keep_in_memory
        self.save_infos = save_infos
        self.use_auth_token = use_auth_token
        self.num_proc = num_proc

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents, one per row across all dataset splits.

        Raises:
            ImportError: If the ``datasets`` package is not installed.
        """
        try:
            from datasets import load_dataset
        except ImportError as e:
            raise ImportError(
                "Could not import datasets python package. "
                "Please install it with `pip install datasets`."
            ) from e
        dataset = load_dataset(
            path=self.path,
            name=self.name,
            data_dir=self.data_dir,
            data_files=self.data_files,
            cache_dir=self.cache_dir,
            keep_in_memory=self.keep_in_memory,
            save_infos=self.save_infos,
            use_auth_token=self.use_auth_token,
            num_proc=self.num_proc,
        )
        # ``pop`` removes the content column from the row mapping, so the
        # remaining fields become the document metadata.
        yield from (
            Document(
                page_content=self.parse_obj(row.pop(self.page_content_column)),
                metadata=row,
            )
            for key in dataset.keys()
            for row in dataset[key]
        )

    def parse_obj(self, page_content: Union[str, object]) -> str:
        """Return ``page_content`` as a string.

        Strings are passed through unchanged; any other value (dict, list,
        number, ...) is JSON-encoded.

        Note: the original check was ``isinstance(page_content, object)``,
        which is always True, so plain strings were also run through
        ``json.dumps`` and came back wrapped in extra quotes.
        """
        if isinstance(page_content, str):
            return page_content
        return json.dumps(page_content)