Skip to content

Hugging Face FS

初始化参数。

HuggingFaceFSReader #

Bases: BaseReader

Hugging Face文件系统读取器。

使用Hugging Face Hub客户端库中的新文件系统API。

Source code in llama_index/readers/huggingface_fs/base.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class HuggingFaceFSReader(BaseReader):
    """Reader for JSON Lines files stored on the Hugging Face Hub.

    Uses the file system API (``HfFileSystem``) from the Hugging Face Hub
    client library.
    """

    def __init__(self) -> None:
        """Create the ``HfFileSystem`` client used by every load method."""
        # Imported at call time, so huggingface_hub is only required when a
        # reader is actually constructed.
        from huggingface_hub import HfFileSystem

        self.fs = HfFileSystem()

    def load_dicts(self, path: str) -> List[Dict]:
        """Read a JSON Lines file and parse every line that is valid JSON.

        Args:
            path: Path handed to ``self.fs.read_bytes``. If any suffix of the
                file name is ``.gz`` the content is gunzipped before parsing.

        Returns:
            The parsed JSON values in file order. Lines that are not valid
            JSON (including blank lines) are skipped silently.
        """
        raw_bytes = self.fs.read_bytes(path)

        if ".gz" in Path(path).suffixes:
            import gzip

            # Decompress in memory. The earlier implementation wrote the
            # payload to a temporary file and re-opened it with gzip.open()
            # without ever closing the handle, which leaked the file object
            # and could prevent the temporary directory from being removed.
            raw_bytes = gzip.decompress(raw_bytes)

        # bytes.decode() defaults to UTF-8.
        text = raw_bytes.decode()

        json_dicts = []
        for line in text.split("\n"):
            try:
                parsed = json.loads(line)
            except json.decoder.JSONDecodeError:
                # Best-effort parsing: ignore blank or malformed lines.
                continue
            json_dicts.append(parsed)
        return json_dicts

    def load_df(self, path: str) -> pd.DataFrame:
        """Load the file at ``path`` into a pandas DataFrame.

        The frame is built from the list returned by ``load_dicts``.
        """
        return pd.DataFrame(self.load_dicts(path))

    def load_data(self, path: str) -> List[Document]:
        """Load the file at ``path`` as a list of ``Document`` objects.

        Each parsed JSON value becomes one ``Document`` whose text is the
        ``str()`` representation of that value.
        """
        return [Document(text=str(d)) for d in self.load_dicts(path)]

load_dicts #

load_dicts(path: str) -> List[Dict]

解析文件。

Source code in llama_index/readers/huggingface_fs/base.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
def load_dicts(self, path: str) -> List[Dict]:
    """Read a JSON Lines file and parse every line that is valid JSON.

    Args:
        path: Path handed to ``self.fs.read_bytes``. If any suffix of the
            file name is ``.gz`` the content is gunzipped before parsing.

    Returns:
        The parsed JSON values in file order. Lines that are not valid JSON
        (including blank lines) are skipped silently.
    """
    raw_bytes = self.fs.read_bytes(path)

    if ".gz" in Path(path).suffixes:
        import gzip

        # Decompress in memory. The earlier implementation wrote the payload
        # to a temporary file and re-opened it with gzip.open() without ever
        # closing the handle, which leaked the file object and could prevent
        # the temporary directory from being removed.
        raw_bytes = gzip.decompress(raw_bytes)

    # bytes.decode() defaults to UTF-8.
    text = raw_bytes.decode()

    json_dicts = []
    for line in text.split("\n"):
        try:
            parsed = json.loads(line)
        except json.decoder.JSONDecodeError:
            # Best-effort parsing: ignore blank or malformed lines.
            continue
        json_dicts.append(parsed)
    return json_dicts

load_df #

load_df(path: str) -> DataFrame

加载pandas数据框。

Source code in llama_index/readers/huggingface_fs/base.py
56
57
58
def load_df(self, path: str) -> pd.DataFrame:
    """Load the file at ``path`` into a pandas DataFrame.

    The frame is built from the list returned by ``self.load_dicts(path)``.
    """
    records = self.load_dicts(path)
    return pd.DataFrame(records)

load_data #

load_data(path: str) -> List[Document]

加载数据。

Source code in llama_index/readers/huggingface_fs/base.py
60
61
62
63
64
65
66
def load_data(self, path: str) -> List[Document]:
    """Load the file at ``path`` as a list of ``Document`` objects.

    Each value returned by ``self.load_dicts(path)`` becomes one ``Document``
    whose text is the ``str()`` representation of that value.
    """
    return [Document(text=str(record)) for record in self.load_dicts(path)]