class JSONReader(BaseReader):
    """JSON reader.

    Reads JSON documents with options that help expose the relationships
    between nodes.

    Args:
        levels_back (int): The number of levels to go back in the JSON tree,
            0 if you want all levels. If levels_back is None, the JSON is
            simply formatted and each line becomes an embedding.
        collapse_length (int): The maximum number of characters a JSON
            fragment may have to be collapsed in the output (levels_back must
            not be None).
            Example: if collapse_length = 10 and the input is
            {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}}
            then a is collapsed into one line, while b is not.
            Starting around 100 and adjusting from there is recommended.
        ensure_ascii (bool): Forwarded to ``json.dumps`` (and to the
            depth-first traversal helper) when serialising. Defaults to False.
        is_jsonl (Optional[bool]): If True, the file is in JSONL format.
            Defaults to False.
        clean_json (Optional[bool]): If True, lines containing only JSON
            structure are removed, which drops some less useful lines. If
            False, no lines are removed and the document keeps a valid JSON
            object structure. If levels_back is set, the JSON is not cleaned
            and this option is ignored. Defaults to True.
    """

    def __init__(
        self,
        levels_back: Optional[int] = None,
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
        clean_json: Optional[bool] = True,
    ) -> None:
        """Initialize the reader with the given options."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl
        self.clean_json = clean_json

    def load_data(
        self, input_file: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Load data from the input file.

        Args:
            input_file: Path of the JSON (or JSONL) file, read as UTF-8.
            extra_info: Metadata attached to every produced document. When
                omitted or None, an empty dict is used.

        Returns:
            One document per parsed JSON value: a single document for a
            regular JSON file, or one per non-blank line for a JSONL file.

        Raises:
            OSError: If the file cannot be opened.
            json.JSONDecodeError: If the file (or a JSONL line) is not valid
                JSON.
        """
        # Parse everything up front so the file handle is released before
        # the documents are built.
        with open(input_file, encoding="utf-8") as f:
            if self.is_jsonl:
                # Blank lines (e.g. trailing newlines) carry no record and
                # would otherwise make json.loads fail, so skip them.
                records = [
                    json.loads(stripped)
                    for stripped in (line.strip() for line in f)
                    if stripped
                ]
            else:
                records = [json.load(f)]

        # A None default avoids the shared-mutable-default pitfall.
        base_metadata = {} if extra_info is None else extra_info
        # Matches lines made up solely of JSON structural characters.
        structure_only = re.compile(r"^[{}\[\],]*$")

        documents = []
        for data in records:
            if self.levels_back is not None:
                # If levels_back is set, we make the embeddings contain the
                # labels from further up the JSON tree.
                lines = list(
                    _depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                )
                text = "\n".join(lines)
            elif self.clean_json is True:
                # levels_back isn't set and cleaning is requested: format one
                # item per line and drop lines containing only formatting.
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                useful_lines = [
                    line
                    for line in json_output.split("\n")
                    if not structure_only.match(line)
                ]
                text = "\n".join(useful_lines)
            elif self.clean_json is False:
                # levels_back isn't set and cleaning is disabled: keep the
                # serialised JSON untouched.
                text = json.dumps(data, ensure_ascii=self.ensure_ascii)
            else:
                # NOTE: clean_json is compared by identity against True and
                # False; any other value (e.g. None) produces no document.
                continue

            # Each document receives its own shallow copy so that editing one
            # document's metadata cannot affect the others or the caller.
            documents.append(Document(text=text, metadata=dict(base_metadata)))
        return documents