import json
from pathlib import Path
from typing import Any, Callable, Dict, Iterator, Optional, Union
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
class JSONLoader(BaseLoader):
    """Load a `JSON` (or JSON Lines) file using a `jq` schema.

    Examples:
        [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
        {"key": [{"text": ...}, {"text": ...}, {"text": ...}]} -> schema = .key[].text
        ["", "", ""] -> schema = .[]
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        jq_schema: str,
        content_key: Optional[str] = None,
        is_content_key_jq_parsable: Optional[bool] = False,
        metadata_func: Optional[Callable[[Dict, Dict], Dict]] = None,
        text_content: bool = True,
        json_lines: bool = False,
    ):
        """Initialize the JSONLoader.

        Args:
            file_path: Path to the JSON or JSON Lines file.
            jq_schema: The jq schema used to extract the data or text from
                the JSON.
            content_key: The key used to extract the content from each
                extracted sample when the jq_schema results in a list of
                objects (dict). If is_content_key_jq_parsable is True, this
                must be a jq-compatible schema; otherwise it is used as a
                plain dictionary key.
            is_content_key_jq_parsable: Whether content_key should be
                compiled and evaluated as a jq schema. Defaults to False.
            metadata_func: A function that takes the JSON object extracted
                by the jq_schema and the default metadata, and returns the
                metadata dict to attach to the document.
            text_content: Whether the extracted content is required to be a
                string. Defaults to True.
            json_lines: Whether the input is in JSON Lines format (one JSON
                value per line). Defaults to False.

        Raises:
            ImportError: If the `jq` package is not installed.
        """
        try:
            import jq

            self.jq = jq
        except ImportError as err:
            raise ImportError(
                "jq package not found, please install it with `pip install jq`"
            ) from err

        self.file_path = Path(file_path).resolve()
        self._jq_schema = jq.compile(jq_schema)
        self._is_content_key_jq_parsable = is_content_key_jq_parsable
        self._content_key = content_key
        self._metadata_func = metadata_func
        self._text_content = text_content
        self._json_lines = json_lines
        # Compiled lazily by _content_key_program() so that an invalid
        # content_key still surfaces at load time rather than here.
        self._compiled_content_key: Any = None

    def lazy_load(self) -> Iterator[Document]:
        """Load the file and yield one Document per extracted sample.

        In JSON Lines mode every non-blank line is parsed on its own, and the
        running document count is carried over so that `seq_num` keeps
        increasing across lines.
        """
        index = 0
        if self._json_lines:
            with self.file_path.open(encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if line:
                        for doc in self._parse(line, index):
                            yield doc
                            index += 1
        else:
            content = self.file_path.read_text(encoding="utf-8")
            for doc in self._parse(content, index):
                yield doc
                index += 1

    def _parse(self, content: str, index: int) -> Iterator[Document]:
        """Convert the given JSON text into documents.

        Args:
            content: A JSON document serialized as text.
            index: Number of documents already produced; the documents
                yielded here get `seq_num` values starting at `index + 1`.
        """
        data = self._jq_schema.input(json.loads(content))

        # Perform some validation
        # This is not a perfect validation, but it should catch most cases
        # and prevent the user from getting a cryptic error later on.
        if self._content_key is not None:
            self._validate_content_key(data)
        if self._metadata_func is not None:
            self._validate_metadata_func(data)

        for i, sample in enumerate(data, index + 1):
            text = self._get_text(sample=sample)
            metadata = self._get_metadata(
                sample=sample, source=str(self.file_path), seq_num=i
            )
            yield Document(page_content=text, metadata=metadata)

    def _content_key_program(self) -> Any:
        """Return the compiled jq program for content_key, compiling it once."""
        if self._compiled_content_key is None:
            self._compiled_content_key = self.jq.compile(self._content_key)
        return self._compiled_content_key

    def _get_text(self, sample: Any) -> str:
        """Convert one extracted sample into the page content string.

        Raises:
            ValueError: If `text_content` is True and the extracted content
                is not a string.
        """
        if self._content_key is None:
            content = sample
        elif self._is_content_key_jq_parsable:
            content = self._content_key_program().input(sample).first()
        else:
            content = sample[self._content_key]

        if isinstance(content, str):
            return content
        if self._text_content:
            raise ValueError(
                f"Expected page_content is string, got {type(content)} instead. "
                "Set `text_content=False` if the desired input for "
                "`page_content` is not a string"
            )
        if isinstance(content, dict):
            # An empty dict becomes an empty string rather than "{}".
            return json.dumps(content) if content else ""
        # In case the text is None, set it to an empty string
        return str(content) if content is not None else ""

    def _get_metadata(
        self, sample: Dict[str, Any], **additional_fields: Any
    ) -> Dict[str, Any]:
        """Return the metadata dict for one sample.

        Args:
            sample: A single extracted data payload.
            **additional_fields: Default metadata entries, passed as keyword
                arguments.

        Returns:
            The result of `metadata_func(sample, additional_fields)` when a
            metadata_func was supplied, otherwise `additional_fields` as is.
        """
        if self._metadata_func is not None:
            return self._metadata_func(sample, additional_fields)
        return additional_fields

    def _validate_content_key(self, data: Any) -> None:
        """Check that content_key is usable on the first extracted sample.

        Raises:
            ValueError: If the first sample is not a dict, or the content key
                cannot be found in it.
        """
        try:
            sample = data.first()
        except StopIteration:
            # The jq schema produced no output, so there is nothing to
            # validate. Letting StopIteration escape into the calling
            # generator would turn it into a RuntimeError (PEP 479).
            return

        if not isinstance(sample, dict):
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict), "
                f"so sample must be a dict but got `{type(sample)}`"
            )

        if (
            not self._is_content_key_jq_parsable
            and sample.get(self._content_key) is None
        ):
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict) "
                f"with the key `{self._content_key}`"
            )

        # NOTE(review): `.text()` appears to return serialized JSON text
        # (e.g. "null" for a missing key), so this check may never trigger.
        # Confirm against the jq package before tightening it.
        if (
            self._is_content_key_jq_parsable
            and self._content_key_program().input(sample).text() is None
        ):
            raise ValueError(
                "Expected the jq schema to result in a list of objects (dict) "
                f"with the key `{self._content_key}` which should be parsable by jq"
            )

    def _validate_metadata_func(self, data: Any) -> None:
        """Check that metadata_func returns a dict for the first sample.

        Raises:
            ValueError: If metadata_func returns anything other than a dict.
        """
        if self._metadata_func is None:
            return

        try:
            sample = data.first()
        except StopIteration:
            # No output from the jq schema: nothing to validate (see
            # _validate_content_key for why this must not propagate).
            return

        sample_metadata = self._metadata_func(sample, {})
        if not isinstance(sample_metadata, dict):
            raise ValueError(
                "Expected the metadata_func to return a dict but got "
                f"`{type(sample_metadata)}`"
            )