Simple directory reader

简单的读取器，可以从目录中读取不同格式的文件。

SimpleDirectoryReader #

Bases: BaseReader, ResourcesReaderMixin, FileSystemReaderMixin

简单的目录读取器。

从文件目录加载文件。根据文件扩展名自动选择最佳的文件读取器。

Source code in llama_index/core/readers/file/base.py

class SimpleDirectoryReader(BaseReader, ResourcesReaderMixin, FileSystemReaderMixin):
    """Simple directory reader.

    Load files from a directory, or from an explicit list of files.
    A reader is selected per file from its (lower-cased) file extension.

    Args:
        input_dir (str): Path to the directory.
        input_files (List): List of file paths to read
            (Optional; when given, input_dir and exclude are not used).
        exclude (List): Glob patterns, resolved relative to input_dir, of
            paths to exclude (Optional).
        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
            Defaults to True.
        encoding (str): Encoding of the files.
            Defaults to utf-8.
        errors (str): How encoding and decoding errors are to be handled,
            see https://docs.python.org/3/library/functions.html#open
            Defaults to "ignore".
        recursive (bool): Whether to recursively search in subdirectories.
            Defaults to False.
        filename_as_id (bool): Whether to use the file path as the document id.
            Defaults to False.
        required_exts (Optional[List[str]]): List of required extensions,
            compared against the path suffix as-is.
            Defaults to None.
        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
            extension to a BaseReader instance that specifies how to convert
            that file to text. Extensions without an entry fall back to the
            readers returned by ``supported_suffix_fn``.
        num_files_limit (Optional[int]): Maximum number of files to read
            when scanning input_dir; only applied when positive.
            Defaults to None.
        file_metadata (Optional[Callable[str, Dict]]): A function that takes
            in a filename and returns a Dict of metadata for the Document.
            Defaults to None.
        raise_on_error (bool): Whether to raise an error if a file cannot be read.
            Defaults to False.
        fs (Optional[fsspec.AbstractFileSystem]): File system to use. Defaults
            to the local file system. Can be changed to use any remote file
            system exposed via the fsspec interface.
    """

    # Called with no arguments by load_file/aload_file; the result is used as a
    # mapping from file suffix to the reader class instantiated for that suffix.
    supported_suffix_fn: Callable = _try_loading_included_file_formats

    def __init__(
        self,
        input_dir: Optional[str] = None,
        input_files: Optional[List] = None,
        exclude: Optional[List] = None,
        exclude_hidden: bool = True,
        errors: str = "ignore",
        recursive: bool = False,
        encoding: str = "utf-8",
        filename_as_id: bool = False,
        required_exts: Optional[List[str]] = None,
        file_extractor: Optional[Dict[str, BaseReader]] = None,
        num_files_limit: Optional[int] = None,
        file_metadata: Optional[Callable[[str], Dict]] = None,
        raise_on_error: bool = False,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """Store the configuration and resolve the list of files to read.

        When ``input_files`` is given it is used as-is (each entry must be an
        existing file); otherwise ``input_dir`` is scanned with ``_add_files``.

        Raises:
            ValueError: If neither ``input_dir`` nor ``input_files`` is given,
                if a listed file does not exist, or if ``input_dir`` is not a
                directory.
        """
        super().__init__()

        if not (input_dir or input_files):
            raise ValueError("Must provide either `input_dir` or `input_files`.")

        # Plain configuration, stored before any file discovery so that
        # _add_files can read it.
        self.fs = fs or get_default_fs()
        self.errors = errors
        self.encoding = encoding
        self.exclude = exclude
        self.recursive = recursive
        self.exclude_hidden = exclude_hidden
        self.required_exts = required_exts
        self.num_files_limit = num_files_limit
        self.raise_on_error = raise_on_error

        # Non-default file systems get POSIX-style pure paths.
        path_cls = Path if is_default_fs(self.fs) else PurePosixPath

        if input_files:
            self.input_files = []
            for candidate in input_files:
                if not self.fs.isfile(candidate):
                    raise ValueError(f"File {candidate} does not exist.")
                self.input_files.append(path_cls(candidate))
        elif input_dir:
            if not self.fs.isdir(input_dir):
                raise ValueError(f"Directory {input_dir} does not exist.")
            self.input_dir = path_cls(input_dir)
            self.input_files = self._add_files(self.input_dir)

        self.file_extractor = {} if file_extractor is None else file_extractor
        self.file_metadata = file_metadata or _DefaultFileMetadataFunc(self.fs)
        self.filename_as_id = filename_as_id

    def is_hidden(self, path: Path) -> bool:
        """Return True if any component of ``path`` starts with a dot.

        The special components "." and ".." are not counted as hidden.
        """
        for component in path.parts:
            if component in (".", ".."):
                continue
            if component.startswith("."):
                return True
        return False

    def _add_files(self, input_dir: Path) -> List[Path]:
        """Collect the files under ``input_dir`` that pass every configured filter.

        Entries are listed with ``self.fs.glob`` (recursively when
        ``self.recursive`` is set) and dropped when they are directories,
        hidden (if ``self.exclude_hidden``), have a suffix outside
        ``self.required_exts``, or match / live under a path matched by one of
        the ``self.exclude`` glob patterns.

        Args:
            input_dir (Path): Directory to scan.

        Returns:
            List[Path]: Sorted list of accepted files, truncated to
            ``self.num_files_limit`` entries when that limit is positive.

        Raises:
            ValueError: If no file is left after filtering.
        """
        all_files = set()
        rejected_files = set()
        rejected_dirs = set()
        # Default to POSIX paths for non-default file systems (e.g. S3)
        _Path = Path if is_default_fs(self.fs) else PurePosixPath

        if self.exclude is not None:
            for excluded_pattern in self.exclude:
                if self.recursive:
                    # Recursive glob
                    excluded_glob = _Path(input_dir) / _Path("**") / excluded_pattern
                else:
                    # Non-recursive glob
                    excluded_glob = _Path(input_dir) / excluded_pattern
                for file in self.fs.glob(str(excluded_glob)):
                    if self.fs.isdir(file):
                        rejected_dirs.add(_Path(file))
                    else:
                        rejected_files.add(_Path(file))

        glob_suffix = "/**/*" if self.recursive else "/*"
        file_refs: List[str] = self.fs.glob(str(input_dir) + glob_suffix)

        for ref in file_refs:
            # Manually check if file is hidden or directory instead of
            # in glob for backwards compatibility.
            ref = _Path(ref)
            is_dir = self.fs.isdir(ref)
            skip_because_hidden = self.exclude_hidden and self.is_hidden(ref)
            skip_because_bad_ext = (
                self.required_exts is not None and ref.suffix not in self.required_exts
            )
            skip_because_excluded = ref in rejected_files
            if not skip_because_excluded:
                ref_parent_dir = ref if is_dir else ref.parent
                for rejected_dir in rejected_dirs:
                    # Compare whole path components rather than string
                    # prefixes, so excluding "/data/foo" does not also drop
                    # entries under a sibling such as "/data/foobar".
                    if (
                        ref_parent_dir == rejected_dir
                        or rejected_dir in ref_parent_dir.parents
                    ):
                        skip_because_excluded = True
                        logger.debug(
                            "Skipping %s because it in parent dir %s which is in %s",
                            ref,
                            ref_parent_dir,
                            rejected_dir,
                        )
                        break

            if (
                is_dir
                or skip_because_hidden
                or skip_because_bad_ext
                or skip_because_excluded
            ):
                continue
            all_files.add(ref)

        new_input_files = sorted(all_files)

        if not new_input_files:
            raise ValueError(f"No files found in {input_dir}.")

        if self.num_files_limit is not None and self.num_files_limit > 0:
            new_input_files = new_input_files[0 : self.num_files_limit]

        # print total number of files added
        logger.debug(
            f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}"
        )

        return new_input_files

    def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
        """Mark file-level metadata keys as excluded on every document.

        The same keys are appended to both ``excluded_embed_metadata_keys`` and
        ``excluded_llm_metadata_keys`` of each document, in place.

        Args:
            documents (List[Document]): Documents to update.

        Returns:
            List[Document]: The same list that was passed in.
        """
        # 'file_path' is deliberately absent from this list, so this method
        # never excludes it. The date keys stay in the metadata itself (the
        # original comment mentions postprocessors such as
        # TimeWeightedPostprocessor) but are excluded from embedding and
        # LLM text.
        hidden_keys = (
            "file_name",
            "file_type",
            "file_size",
            "creation_date",
            "last_modified_date",
            "last_accessed_date",
        )
        for document in documents:
            document.excluded_embed_metadata_keys.extend(hidden_keys)
            document.excluded_llm_metadata_keys.extend(hidden_keys)

        return documents

    def list_resources(self, *args: Any, **kwargs: Any) -> List[Path]:
        """在给定的文件系统中列出文件。"""
        return self.input_files

    def get_resource_info(self, resource_id: str, *args: Any, **kwargs: Any) -> Dict:
        """Return size and timestamp information for one resource.

        The data comes from ``self.fs.info(resource_id)``; entries whose value
        is None are left out of the result.

        Args:
            resource_id (str): Path of the resource on ``self.fs``.

        Returns:
            Dict: Subset of ``file_path``, ``file_size``, ``creation_date`` and
            ``last_modified_date``.
        """
        stat = self.fs.info(resource_id)

        created = _format_file_timestamp(stat.get("created"), include_time=True)
        modified = _format_file_timestamp(stat.get("mtime"), include_time=True)

        candidates = {
            "file_path": resource_id,
            "file_size": stat.get("size"),
            "creation_date": created,
            "last_modified_date": modified,
        }

        # Ignore None values
        result = {}
        for key, value in candidates.items():
            if value is not None:
                result[key] = value
        return result

    def load_resource(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> List[Document]:
        """Load a single resource with ``load_file``.

        Each of ``file_metadata``, ``file_extractor``, ``filename_as_id``,
        ``encoding``, ``errors``, ``raise_on_error`` and ``fs`` may be given as
        a keyword argument to override the value stored on the reader for this
        call only.

        Args:
            resource_id (str): Path of the file to load.
            **kwargs: Per-call overrides (see above). Any other keyword is
                forwarded unchanged to ``load_file``.

        Returns:
            List[Document]: Documents produced for the file.
        """
        overridable = (
            "file_metadata",
            "file_extractor",
            "filename_as_id",
            "encoding",
            "errors",
            "raise_on_error",
            "fs",
        )
        # Pop (rather than get) each override: leaving it in kwargs would pass
        # the same keyword twice below and raise TypeError.
        settings = {name: kwargs.pop(name, getattr(self, name)) for name in overridable}

        return SimpleDirectoryReader.load_file(
            input_file=Path(resource_id),
            **settings,
            **kwargs,
        )

    async def aload_resource(
        self, resource_id: str, *args: Any, **kwargs: Any
    ) -> List[Document]:
        """Asynchronously load a single resource with ``aload_file``.

        ``aload_file`` is an instance method that takes only the file path and
        reads its settings from the reader, so per-call overrides are applied
        to a shallow copy of this reader instead of being passed as arguments.

        Args:
            resource_id (str): Path of the file to load.
            **kwargs: Optional overrides for ``file_metadata``,
                ``file_extractor``, ``filename_as_id``, ``encoding``,
                ``errors``, ``raise_on_error`` and ``fs``. Other keywords are
                ignored because ``aload_file`` accepts none.

        Returns:
            List[Document]: Documents produced for the file.
        """
        import copy

        overridable = (
            "file_metadata",
            "file_extractor",
            "filename_as_id",
            "encoding",
            "errors",
            "raise_on_error",
            "fs",
        )
        overrides = {name: kwargs[name] for name in overridable if name in kwargs}

        reader = self
        if overrides:
            # Shallow copy so the overrides never leak into this reader.
            reader = copy.copy(self)
            for name, value in overrides.items():
                setattr(reader, name, value)

        return await reader.aload_file(Path(resource_id))

    def read_file_content(self, input_file: Path, **kwargs) -> bytes:
        """读取文件内容。"""
        fs: fsspec.AbstractFileSystem = kwargs.get("fs", self.fs)
        with fs.open(input_file, errors=self.errors, encoding=self.encoding) as f:
            return f.read()

    @staticmethod
    def load_file(
        input_file: Path,
        file_metadata: Callable[[str], Dict],
        file_extractor: Dict[str, BaseReader],
        filename_as_id: bool = False,
        encoding: str = "utf-8",
        errors: str = "ignore",
        raise_on_error: bool = False,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load one file into a list of documents.

        NOTE: kept as a static method so that it can be used for parallel
        processing.

        Args:
            input_file (Path): Path of the file to read.
            file_metadata (Callable[[str], Dict]): Function that takes the file
                path as a string and returns the metadata dict for the
                documents.
            file_extractor (Dict[str, BaseReader]): Mapping of lower-cased file
                suffix to reader instance. A default reader instantiated for a
                missing suffix is stored back into this dict.
            filename_as_id (bool): Whether to derive document ids from the
                file path.
            encoding (str): Encoding of the file. Defaults to utf-8.
            errors (str): How encoding and decoding errors are to be handled,
                see https://docs.python.org/3/library/functions.html#open
            raise_on_error (bool): Whether to raise when the reader fails
                instead of skipping the file.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use.
                Falls back to the default file system for the plain read.

        Returns:
            List[Document]: Loaded documents; empty when the reader failed and
            ``raise_on_error`` is False.

        Raises:
            ImportError: Re-raised when the reader reports a missing
                dependency.
            Exception: When the reader fails and ``raise_on_error`` is True.
        """
        # TODO: make this less redundant
        builtin_readers = SimpleDirectoryReader.supported_suffix_fn()
        builtin_suffixes = list(builtin_readers.keys())

        metadata: Optional[dict] = None
        if file_metadata is not None:
            metadata = file_metadata(str(input_file))

        suffix = input_file.suffix.lower()
        has_reader = suffix in builtin_suffixes or suffix in file_extractor

        if not has_reader:
            # No dedicated reader: read the raw content and decode it.
            fs = fs or get_default_fs()
            with fs.open(input_file, errors=errors, encoding=encoding) as f:
                text = f.read().decode(encoding, errors=errors)

            plain_doc = Document(text=text, metadata=metadata or {})
            if filename_as_id:
                plain_doc.id_ = str(input_file)
            return [plain_doc]

        if suffix not in file_extractor:
            # Instantiate the default reader once and cache it for later files.
            file_extractor[suffix] = builtin_readers[suffix]()
        reader = file_extractor[suffix]

        # load data -- catch all errors except for ImportError
        try:
            load_kwargs = {"extra_info": metadata}
            if fs and not is_default_fs(fs):
                load_kwargs["fs"] = fs
            docs = reader.load_data(input_file, **load_kwargs)
        except ImportError as e:
            # ensure that ImportError is raised so user knows
            # about missing dependencies
            raise ImportError(str(e))
        except Exception as e:
            if raise_on_error:
                raise Exception("Error loading file") from e
            # otherwise, just skip the file and report the error
            print(
                f"Failed to load file {input_file} with error: {e}. Skipping...",
                flush=True,
            )
            return []

        if filename_as_id:
            # One file may produce several documents; number them.
            for index, doc in enumerate(docs):
                doc.id_ = f"{input_file!s}_part_{index}"

        documents: List[Document] = []
        documents.extend(docs)
        return documents

    async def aload_file(self, input_file: Path) -> List[Document]:
        """Load one file into a list of documents, awaiting the reader.

        Settings (metadata function, extractors, encoding, error handling,
        file system) are taken from this reader instance. A default reader
        instantiated for a missing suffix is stored in ``self.file_extractor``.

        Args:
            input_file (Path): Path of the file to read.

        Returns:
            List[Document]: Loaded documents; empty when the reader failed and
            ``self.raise_on_error`` is False.

        Raises:
            ImportError: Re-raised when the reader reports a missing
                dependency.
            Exception: The reader's own exception, when ``self.raise_on_error``
                is True.
        """
        # TODO: make this less redundant
        builtin_readers = SimpleDirectoryReader.supported_suffix_fn()
        builtin_suffixes = list(builtin_readers.keys())

        metadata: Optional[dict] = None
        if self.file_metadata is not None:
            metadata = self.file_metadata(str(input_file))

        suffix = input_file.suffix.lower()
        has_reader = suffix in builtin_suffixes or suffix in self.file_extractor

        if not has_reader:
            # No dedicated reader: read the raw content and decode it.
            fs = self.fs or get_default_fs()
            with fs.open(input_file, errors=self.errors, encoding=self.encoding) as f:
                text = f.read().decode(self.encoding, errors=self.errors)

            plain_doc = Document(text=text, metadata=metadata or {})
            if self.filename_as_id:
                plain_doc.id_ = str(input_file)
            return [plain_doc]

        if suffix not in self.file_extractor:
            # Instantiate the default reader once and cache it for later files.
            self.file_extractor[suffix] = builtin_readers[suffix]()
        reader = self.file_extractor[suffix]

        # load data -- catch all errors except for ImportError
        try:
            load_kwargs = {"extra_info": metadata}
            if self.fs and not is_default_fs(self.fs):
                load_kwargs["fs"] = self.fs
            docs = await reader.aload_data(input_file, **load_kwargs)
        except ImportError as e:
            # ensure that ImportError is raised so user knows
            # about missing dependencies
            raise ImportError(str(e))
        except Exception as e:
            if self.raise_on_error:
                raise
            # otherwise, just skip the file and report the error
            print(
                f"Failed to load file {input_file} with error: {e}. Skipping...",
                flush=True,
            )
            return []

        if self.filename_as_id:
            # One file may produce several documents; number them.
            for index, doc in enumerate(docs):
                doc.id_ = f"{input_file!s}_part_{index}"

        documents: List[Document] = []
        documents.extend(docs)
        return documents

    def load_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load every input file and return the resulting documents.

        Args:
            show_progress (bool): Whether to show a tqdm progress bar. Only
                used on the sequential path. Defaults to False.
            num_workers (Optional[int]): Number of worker processes for
                parallel loading. Values above the CPU count are capped to it;
                values of 1 or less load sequentially.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use. When
                given it takes precedence over the one set in the constructor;
                otherwise the constructor's file system is used.

        Returns:
            List[Document]: Documents from all files, after
            ``_exclude_metadata`` has been applied.
        """
        documents = []

        files_to_process = self.input_files
        fs = fs or self.fs

        if num_workers and num_workers > 1:
            cpu_total = multiprocessing.cpu_count()
            if num_workers > cpu_total:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Actually apply the cap announced by the warning.
                num_workers = cpu_total
            with multiprocessing.get_context("spawn").Pool(num_workers) as p:
                results = p.starmap(
                    SimpleDirectoryReader.load_file,
                    zip(
                        files_to_process,
                        repeat(self.file_metadata),
                        repeat(self.file_extractor),
                        repeat(self.filename_as_id),
                        repeat(self.encoding),
                        repeat(self.errors),
                        repeat(self.raise_on_error),
                        repeat(fs),
                    ),
                )
                # Flatten in one pass; this also yields [] when there are no
                # results instead of failing on an empty reduction.
                documents = [doc for docs in results for doc in docs]

        else:
            if show_progress:
                files_to_process = tqdm(
                    self.input_files, desc="Loading files", unit="file"
                )
            for input_file in files_to_process:
                documents.extend(
                    SimpleDirectoryReader.load_file(
                        input_file=input_file,
                        file_metadata=self.file_metadata,
                        file_extractor=self.file_extractor,
                        filename_as_id=self.filename_as_id,
                        encoding=self.encoding,
                        errors=self.errors,
                        raise_on_error=self.raise_on_error,
                        fs=fs,
                    )
                )

        return self._exclude_metadata(documents)

    async def aload_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Asynchronously load every input file and return the documents.

        Args:
            show_progress (bool): Whether to show progress. Defaults to False.
            num_workers (Optional[int]): When truthy, the per-file coroutines
                are run through ``run_jobs`` with this many workers; otherwise
                they are gathered directly.
            fs (Optional[fsspec.AbstractFileSystem]): File system argument.

        Returns:
            List[Document]: Documents from all files, after
            ``_exclude_metadata`` has been applied.
        """
        files_to_process = self.input_files
        # NOTE(review): the resolved value is not passed on; aload_file reads
        # self.fs, so the `fs` argument currently has no effect here.
        fs = fs or self.fs

        pending = [self.aload_file(path) for path in files_to_process]

        if num_workers:
            per_file_docs = await run_jobs(
                pending, show_progress=show_progress, workers=num_workers
            )
        else:
            if show_progress:
                gather = get_asyncio_module(show_progress=show_progress).gather
            else:
                gather = asyncio.gather
            per_file_docs = await gather(*pending)

        documents = []
        for docs in per_file_docs:
            for doc in docs:
                documents.append(doc)

        return self._exclude_metadata(documents)

    def iter_data(
        self, show_progress: bool = False
    ) -> Generator[List[Document], Any, Any]:
        """Load the input files one at a time, yielding per-file documents.

        Args:
            show_progress (bool): Whether to show a tqdm progress bar.
                Defaults to False.

        Returns:
            Generator[List[Document]]: One non-empty list of documents per
            file; files that produce no documents are skipped.
        """
        pending = self.input_files
        if show_progress:
            pending = tqdm(self.input_files, desc="Loading files", unit="file")

        for path in pending:
            batch = self._exclude_metadata(
                SimpleDirectoryReader.load_file(
                    input_file=path,
                    file_metadata=self.file_metadata,
                    file_extractor=self.file_extractor,
                    filename_as_id=self.filename_as_id,
                    encoding=self.encoding,
                    errors=self.errors,
                    raise_on_error=self.raise_on_error,
                    fs=self.fs,
                )
            )
            if batch:
                yield batch

list_resources #

list_resources(*args: Any, **kwargs: Any) -> List[Path]

在给定的文件系统中列出文件。

Source code in llama_index/core/readers/file/base.py

def list_resources(self, *args: Any, **kwargs: Any) -> List[Path]:
    """Return the files this reader was configured to read.

    Positional and keyword arguments are accepted but not used. The
    returned list is the reader's own ``input_files`` list, not a copy.
    """
    resources = self.input_files
    return resources

read_file_content #

read_file_content(input_file: Path, **kwargs) -> bytes

读取文件内容。

Source code in llama_index/core/readers/file/base.py

def read_file_content(self, input_file: Path, **kwargs) -> bytes:
    """Open ``input_file`` and return everything read from it.

    Args:
        input_file (Path): File to read.
        **kwargs: ``fs`` may be given to use a file system other than
            ``self.fs``.

    Returns:
        bytes: Result of ``read()`` on the opened file.
    """
    filesystem = kwargs.get("fs", self.fs)
    handle = filesystem.open(
        input_file, errors=self.errors, encoding=self.encoding
    )
    with handle as stream:
        return stream.read()

load_file `staticmethod` #

load_file(
    input_file: Path,
    file_metadata: Callable[[str], Dict],
    file_extractor: Dict[str, BaseReader],
    filename_as_id: bool = False,
    encoding: str = "utf-8",
    errors: str = "ignore",
    raise_on_error: bool = False,
    fs: Optional[AbstractFileSystem] = None,
) -> List[Document]

静态方法用于加载文件。

注意：必须作为静态方法以进行并行处理。

input_file（Path）：要读取的文件路径 file_metadata（[Callable[str, Dict]]）：一个接受文件名并返回文档元数据字典的函数。 file_extractor（Dict[str, BaseReader]）：文件扩展名到BaseReader类的映射，指定如何将该文件转换为文本。 filename_as_id（bool）：是否使用文件名作为文档ID。 encoding（str）：文件的编码。默认为utf-8。 errors（str）：如何处理编码和解码错误，请参阅https://docs.python.org/3/library/functions.html#open raise_on_error（bool）：是否在无法读取文件时引发错误。 fs（Optional[fsspec.AbstractFileSystem]）：要使用的文件系统。默认为使用本地文件系统。可以更改为使用任何远程文件系统。

Returns:

Type	Description
`List[Document]`	List[Document]：加载的文档

Source code in llama_index/core/readers/file/base.py

    @staticmethod
    def load_file(
        input_file: Path,
        file_metadata: Callable[[str], Dict],
        file_extractor: Dict[str, BaseReader],
        filename_as_id: bool = False,
        encoding: str = "utf-8",
        errors: str = "ignore",
        raise_on_error: bool = False,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load one file into a list of documents.

        NOTE: kept as a static method so that it can be used for parallel
        processing.

        Args:
            input_file (Path): Path of the file to read.
            file_metadata (Callable[[str], Dict]): Function that takes the file
                path as a string and returns the metadata dict for the
                documents.
            file_extractor (Dict[str, BaseReader]): Mapping of lower-cased file
                suffix to reader instance. A default reader instantiated for a
                missing suffix is stored back into this dict.
            filename_as_id (bool): Whether to derive document ids from the
                file path.
            encoding (str): Encoding of the file. Defaults to utf-8.
            errors (str): How encoding and decoding errors are to be handled,
                see https://docs.python.org/3/library/functions.html#open
            raise_on_error (bool): Whether to raise when the reader fails
                instead of skipping the file.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use.
                Falls back to the default file system for the plain read.

        Returns:
            List[Document]: Loaded documents; empty when the reader failed and
            ``raise_on_error`` is False.

        Raises:
            ImportError: Re-raised when the reader reports a missing
                dependency.
            Exception: When the reader fails and ``raise_on_error`` is True.
        """
        # TODO: make this less redundant
        builtin_readers = SimpleDirectoryReader.supported_suffix_fn()
        builtin_suffixes = list(builtin_readers.keys())

        metadata: Optional[dict] = None
        if file_metadata is not None:
            metadata = file_metadata(str(input_file))

        suffix = input_file.suffix.lower()
        has_reader = suffix in builtin_suffixes or suffix in file_extractor

        if not has_reader:
            # No dedicated reader: read the raw content and decode it.
            fs = fs or get_default_fs()
            with fs.open(input_file, errors=errors, encoding=encoding) as f:
                text = f.read().decode(encoding, errors=errors)

            plain_doc = Document(text=text, metadata=metadata or {})
            if filename_as_id:
                plain_doc.id_ = str(input_file)
            return [plain_doc]

        if suffix not in file_extractor:
            # Instantiate the default reader once and cache it for later files.
            file_extractor[suffix] = builtin_readers[suffix]()
        reader = file_extractor[suffix]

        # load data -- catch all errors except for ImportError
        try:
            load_kwargs = {"extra_info": metadata}
            if fs and not is_default_fs(fs):
                load_kwargs["fs"] = fs
            docs = reader.load_data(input_file, **load_kwargs)
        except ImportError as e:
            # ensure that ImportError is raised so user knows
            # about missing dependencies
            raise ImportError(str(e))
        except Exception as e:
            if raise_on_error:
                raise Exception("Error loading file") from e
            # otherwise, just skip the file and report the error
            print(
                f"Failed to load file {input_file} with error: {e}. Skipping...",
                flush=True,
            )
            return []

        if filename_as_id:
            # One file may produce several documents; number them.
            for index, doc in enumerate(docs):
                doc.id_ = f"{input_file!s}_part_{index}"

        documents: List[Document] = []
        documents.extend(docs)
        return documents

aload_file `async` #

aload_file(input_file: Path) -> List[Document]

异步加载文件。

Source code in llama_index/core/readers/file/base.py

async def aload_file(self, input_file: Path) -> List[Document]:
    """Load one file into a list of documents, awaiting the reader.

    Settings (metadata function, extractors, encoding, error handling,
    file system) are taken from this reader instance. A default reader
    instantiated for a missing suffix is stored in ``self.file_extractor``.

    Args:
        input_file (Path): Path of the file to read.

    Returns:
        List[Document]: Loaded documents; empty when the reader failed and
        ``self.raise_on_error`` is False.

    Raises:
        ImportError: Re-raised when the reader reports a missing dependency.
        Exception: The reader's own exception, when ``self.raise_on_error``
            is True.
    """
    # TODO: make this less redundant
    builtin_readers = SimpleDirectoryReader.supported_suffix_fn()
    builtin_suffixes = list(builtin_readers.keys())

    metadata: Optional[dict] = None
    if self.file_metadata is not None:
        metadata = self.file_metadata(str(input_file))

    suffix = input_file.suffix.lower()
    has_reader = suffix in builtin_suffixes or suffix in self.file_extractor

    if not has_reader:
        # No dedicated reader: read the raw content and decode it.
        fs = self.fs or get_default_fs()
        with fs.open(input_file, errors=self.errors, encoding=self.encoding) as f:
            text = f.read().decode(self.encoding, errors=self.errors)

        plain_doc = Document(text=text, metadata=metadata or {})
        if self.filename_as_id:
            plain_doc.id_ = str(input_file)
        return [plain_doc]

    if suffix not in self.file_extractor:
        # Instantiate the default reader once and cache it for later files.
        self.file_extractor[suffix] = builtin_readers[suffix]()
    reader = self.file_extractor[suffix]

    # load data -- catch all errors except for ImportError
    try:
        load_kwargs = {"extra_info": metadata}
        if self.fs and not is_default_fs(self.fs):
            load_kwargs["fs"] = self.fs
        docs = await reader.aload_data(input_file, **load_kwargs)
    except ImportError as e:
        # ensure that ImportError is raised so user knows
        # about missing dependencies
        raise ImportError(str(e))
    except Exception as e:
        if self.raise_on_error:
            raise
        # otherwise, just skip the file and report the error
        print(
            f"Failed to load file {input_file} with error: {e}. Skipping...",
            flush=True,
        )
        return []

    if self.filename_as_id:
        # One file may produce several documents; number them.
        for index, doc in enumerate(docs):
            doc.id_ = f"{input_file!s}_part_{index}"

    documents: List[Document] = []
    documents.extend(docs)
    return documents

load_data #

load_data(
    show_progress: bool = False,
    num_workers: Optional[int] = None,
    fs: Optional[AbstractFileSystem] = None,
) -> List[Document]

从输入目录加载数据。

Parameters:

Name	Type	Description	Default
`show_progress`	`bool`	是否显示tqdm进度条。默认为False。	`False`
`num_workers`	`Optional[int]`	用于并行加载数据的工作进程数。	`None`
`fs`	`Optional[AbstractFileSystem]`	要使用的文件系统。如果在此处指定了fs，则优先于构造函数中指定的fs；未指定时使用构造函数中的fs。	`None`

Returns:

Type	Description
`List[Document]`	List[Document]: 文档列表。

Source code in llama_index/core/readers/file/base.py

    def load_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load every input file and return the resulting documents.

        Args:
            show_progress (bool): Whether to show a tqdm progress bar. Only
                used on the sequential path. Defaults to False.
            num_workers (Optional[int]): Number of worker processes for
                parallel loading. Values above the CPU count are capped to it;
                values of 1 or less load sequentially.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use. When
                given it takes precedence over the one set in the constructor;
                otherwise the constructor's file system is used.

        Returns:
            List[Document]: Documents from all files, after
            ``_exclude_metadata`` has been applied.
        """
        documents = []

        files_to_process = self.input_files
        fs = fs or self.fs

        if num_workers and num_workers > 1:
            cpu_total = multiprocessing.cpu_count()
            if num_workers > cpu_total:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Actually apply the cap announced by the warning.
                num_workers = cpu_total
            with multiprocessing.get_context("spawn").Pool(num_workers) as p:
                results = p.starmap(
                    SimpleDirectoryReader.load_file,
                    zip(
                        files_to_process,
                        repeat(self.file_metadata),
                        repeat(self.file_extractor),
                        repeat(self.filename_as_id),
                        repeat(self.encoding),
                        repeat(self.errors),
                        repeat(self.raise_on_error),
                        repeat(fs),
                    ),
                )
                # Flatten in one pass; this also yields [] when there are no
                # results instead of failing on an empty reduction.
                documents = [doc for docs in results for doc in docs]

        else:
            if show_progress:
                files_to_process = tqdm(
                    self.input_files, desc="Loading files", unit="file"
                )
            for input_file in files_to_process:
                documents.extend(
                    SimpleDirectoryReader.load_file(
                        input_file=input_file,
                        file_metadata=self.file_metadata,
                        file_extractor=self.file_extractor,
                        filename_as_id=self.filename_as_id,
                        encoding=self.encoding,
                        errors=self.errors,
                        raise_on_error=self.raise_on_error,
                        fs=fs,
                    )
                )

        return self._exclude_metadata(documents)

aload_data `async` #

aload_data(
    show_progress: bool = False,
    num_workers: Optional[int] = None,
    fs: Optional[AbstractFileSystem] = None,
) -> List[Document]

从输入目录加载数据。

Parameters:

Name	Type	Description	Default
`show_progress`	`bool`	是否显示tqdm进度条。默认为False。	`False`
`num_workers`	`Optional[int]`	用于并行加载数据的工作进程数。	`None`
`fs`	`Optional[AbstractFileSystem]`	要使用的文件系统。注意：在当前实现中，该参数不会传递给aload_file，实际读取始终使用构造函数中指定的fs。	`None`

Returns:

Type	Description
`List[Document]`	List[Document]: 文档列表。

Source code in llama_index/core/readers/file/base.py

    async def aload_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Asynchronously load every input file and return the documents.

        Args:
            show_progress (bool): Whether to show progress. Defaults to False.
            num_workers (Optional[int]): When truthy, the per-file coroutines
                are run through ``run_jobs`` with this many workers; otherwise
                they are gathered directly.
            fs (Optional[fsspec.AbstractFileSystem]): File system argument.

        Returns:
            List[Document]: Documents from all files, after
            ``_exclude_metadata`` has been applied.
        """
        files_to_process = self.input_files
        # NOTE(review): the resolved value is not passed on; aload_file reads
        # self.fs, so the `fs` argument currently has no effect here.
        fs = fs or self.fs

        pending = [self.aload_file(path) for path in files_to_process]

        if num_workers:
            per_file_docs = await run_jobs(
                pending, show_progress=show_progress, workers=num_workers
            )
        else:
            if show_progress:
                gather = get_asyncio_module(show_progress=show_progress).gather
            else:
                gather = asyncio.gather
            per_file_docs = await gather(*pending)

        documents = []
        for docs in per_file_docs:
            for doc in docs:
                documents.append(doc)

        return self._exclude_metadata(documents)

iter_data #

iter_data(
    show_progress: bool = False,
) -> Generator[List[Document], Any, Any]

从输入目录迭代加载数据。

Returns:

Type	Description
`Generator[List[Document], Any, Any]`	生成器[List[Document]]：文档列表。

Source code in llama_index/core/readers/file/base.py

    def iter_data(
        self, show_progress: bool = False
    ) -> Generator[List[Document], Any, Any]:
        """Load the input files one at a time, yielding per-file documents.

        Args:
            show_progress (bool): Whether to show a tqdm progress bar.
                Defaults to False.

        Returns:
            Generator[List[Document]]: One non-empty list of documents per
            file; files that produce no documents are skipped.
        """
        pending = self.input_files
        if show_progress:
            pending = tqdm(self.input_files, desc="Loading files", unit="file")

        for path in pending:
            batch = self._exclude_metadata(
                SimpleDirectoryReader.load_file(
                    input_file=path,
                    file_metadata=self.file_metadata,
                    file_extractor=self.file_extractor,
                    filename_as_id=self.filename_as_id,
                    encoding=self.encoding,
                    errors=self.errors,
                    raise_on_error=self.raise_on_error,
                    fs=self.fs,
                )
            )
            if batch:
                yield batch

Simple directory reader