Skip to content

Json

JSONReader #

Bases: BaseReader

JSON阅读器。

读取带有选项的JSON文档,以帮助我们了解节点之间的关系。

Source code in llama_index/readers/json/base.py
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
class JSONReader(BaseReader):
    """JSON reader.

    Reads JSON documents with options that help capture the relationships
    between nodes.

    Args:
        levels_back (int): The number of levels to go back up the JSON tree;
            0 to keep all levels. If levels_back is None, the JSON is simply
            formatted and each line becomes one embedding.
        collapse_length (int): The maximum number of characters a JSON
            fragment may occupy in the output before it is collapsed onto a
            single line (levels_back must not be None for this to apply).
            Example: with collapse_length=10 and input
            {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}},
            a is collapsed onto one line while b is not.
            Starting around 100 and adjusting as needed is recommended.
        ensure_ascii (bool): Passed through to json.dumps; when False
            (default), non-ASCII characters are emitted as-is.
        is_jsonl (Optional[bool]): If True, the file is in JSONL format
            (one JSON object per line). Defaults to False.
        clean_json (Optional[bool]): If True, removes lines containing only
            JSON structure characters, dropping lines that are not very
            useful. If False, no lines are removed and each document keeps a
            valid JSON object structure. Ignored when levels_back is set.
            Defaults to True.
    """

    def __init__(
        self,
        levels_back: Optional[int] = None,
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
        clean_json: Optional[bool] = True,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl
        self.clean_json = clean_json

    def load_data(
        self, input_file: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Load data from the input file.

        Args:
            input_file (str): Path to the JSON or JSONL file.
            extra_info (Optional[Dict]): Metadata attached to every produced
                Document. Defaults to None (treated as an empty dict).

        Returns:
            List[Document]: One Document per top-level JSON object (JSONL
            input yields one per line; plain JSON yields a single Document).
        """
        # BUGFIX: the previous default `extra_info={}` was a mutable default
        # argument shared across calls AND attached as the metadata object of
        # every Document — mutating one document's metadata leaked into all
        # others. Use a None sentinel and substitute per call.
        metadata = extra_info if extra_info is not None else {}
        with open(input_file, encoding="utf-8") as f:
            if self.is_jsonl:
                load_data = [json.loads(line.strip()) for line in f]
            else:
                load_data = [json.load(f)]

        documents = []
        for data in load_data:
            if self.levels_back is None and self.clean_json is True:
                # levels_back unset + clean_json: pretty-print, then drop
                # lines made up only of structural characters ({}[],) so
                # each remaining line is a useful embedding unit.
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                lines = json_output.split("\n")
                useful_lines = [
                    line for line in lines if not re.match(r"^[{}\[\],]*$", line)
                ]
                documents.append(
                    Document(text="\n".join(useful_lines), metadata=metadata)
                )

            elif self.levels_back is None and self.clean_json is False:
                # levels_back unset, no cleaning: emit compact valid JSON.
                json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
                documents.append(Document(text=json_output, metadata=metadata))

            else:
                # levels_back set: make each embedding line carry the labels
                # from further up the JSON tree.
                lines = list(
                    _depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                )
                documents.append(
                    Document(text="\n".join(lines), metadata=metadata)
                )
        return documents

load_data #

load_data(
    input_file: str, extra_info: Optional[Dict] = {}
) -> List[Document]

从输入文件中加载数据。

Source code in llama_index/readers/json/base.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def load_data(
    self, input_file: str, extra_info: Optional[Dict] = None
) -> List[Document]:
    """Load data from the input file.

    Args:
        input_file (str): Path to the JSON or JSONL file.
        extra_info (Optional[Dict]): Metadata attached to every produced
            Document. Defaults to None (treated as an empty dict).

    Returns:
        List[Document]: One Document per top-level JSON object (JSONL
        input yields one per line; plain JSON yields a single Document).
    """
    # BUGFIX: the previous default `extra_info={}` was a mutable default
    # argument shared across calls AND attached as the metadata object of
    # every Document — mutating one document's metadata leaked into all
    # others. Use a None sentinel and substitute per call.
    metadata = extra_info if extra_info is not None else {}
    with open(input_file, encoding="utf-8") as f:
        if self.is_jsonl:
            load_data = [json.loads(line.strip()) for line in f]
        else:
            load_data = [json.load(f)]

    documents = []
    for data in load_data:
        if self.levels_back is None and self.clean_json is True:
            # levels_back unset + clean_json: pretty-print, then drop
            # lines made up only of structural characters ({}[],) so
            # each remaining line is a useful embedding unit.
            json_output = json.dumps(
                data, indent=0, ensure_ascii=self.ensure_ascii
            )
            lines = json_output.split("\n")
            useful_lines = [
                line for line in lines if not re.match(r"^[{}\[\],]*$", line)
            ]
            documents.append(
                Document(text="\n".join(useful_lines), metadata=metadata)
            )

        elif self.levels_back is None and self.clean_json is False:
            # levels_back unset, no cleaning: emit compact valid JSON.
            json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
            documents.append(Document(text=json_output, metadata=metadata))

        else:
            # levels_back set: make each embedding line carry the labels
            # from further up the JSON tree.
            lines = list(
                _depth_first_yield(
                    data,
                    self.levels_back,
                    self.collapse_length,
                    [],
                    self.ensure_ascii,
                )
            )
            documents.append(
                Document(text="\n".join(lines), metadata=metadata)
            )
    return documents