# Source code for langchain_community.document_loaders.blob_loaders.file_system

"""Use to load blobs from the local file system."""

from pathlib import Path
from typing import Callable, Iterable, Iterator, Optional, Sequence, TypeVar, Union

from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader

T = TypeVar("T")


def _make_iterator(
    length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
    """Create a function that optionally wraps an iterable in tqdm.

    Args:
        length_func: Zero-argument callable returning the total number of
            items. It is only called when the returned wrapper is applied
            and ``show_progress`` is True, to supply tqdm's ``total``.
        show_progress: If True, the returned function wraps its argument in
            a tqdm progress bar; otherwise the builtin ``iter`` is returned.

    Returns:
        A callable that turns an iterable into an iterator.

    Raises:
        ImportError: If ``show_progress`` is True and tqdm cannot be imported.
    """
    if not show_progress:
        return iter

    try:
        from tqdm.auto import tqdm
    except ImportError as e:
        # Adjacent literals are concatenated verbatim, so the separating
        # space between the two sentences has to be written explicitly.
        raise ImportError(
            "You must install tqdm to use show_progress=True. "
            "You can install tqdm with `pip install tqdm`."
        ) from e

    # Make sure to provide `total` here so that tqdm can show
    # a progress bar that takes into account the total number of files.
    def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
        """Wrap an iterable in a tqdm progress bar."""
        return tqdm(iterable, total=length_func())

    return _with_tqdm


# PUBLIC API


class FileSystemBlobLoader(BlobLoader):
    """Load blobs in the local file system.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)  # noqa: T201
    """  # noqa: E501

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        exclude: Sequence[str] = (),
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
    ) -> None:
        """Initialize with a path to a directory and how to glob over it.

        Args:
            path: Path to the directory to load from, or path to a single
                file to load. If a path to a file is provided,
                glob/exclude/suffixes are ignored.
            glob: Glob pattern relative to the specified path. By default
                set to pick up all non-hidden files.
            exclude: Patterns to exclude from the results, using glob syntax.
                A single string is treated as one pattern.
            suffixes: Provide to keep only files with these suffixes. Useful
                when wanting to keep files with different suffixes. Suffixes
                must include the dot, e.g. ".txt". A single string is
                treated as one suffix.
            show_progress: If True, show a progress bar as the files are
                loaded. This forces an iteration through all matching files
                to count them prior to loading them.

        Raises:
            TypeError: If ``path`` is neither a ``str`` nor a ``Path``.

        Examples:

            .. code-block:: python

                from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader

                # Load a single file.
                loader = FileSystemBlobLoader("/path/to/file.txt")

                # Recursively load all text files in a directory.
                loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

                # Recursively load all non-hidden files in a directory.
                loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

                # Load all files in a directory without recursion.
                loader = FileSystemBlobLoader("/path/to/directory", glob="*")

                # Recursively load all files in a directory, except for py or pyc files.
                loader = FileSystemBlobLoader(
                    "/path/to/directory",
                    glob="**/*",
                    exclude=["**/*.py", "**/*.pyc"]
                )
        """  # noqa: E501
        if not isinstance(path, (str, Path)):
            raise TypeError(f"Expected str or Path, got {type(path)}")

        # A bare string is itself a sequence of one-character strings, so
        # iterating it would treat every character as a separate pattern or
        # suffix (e.g. "*" from "*.py" would exclude every file). Interpret
        # it as a single entry instead.
        if isinstance(exclude, str):
            exclude = (exclude,)
        if isinstance(suffixes, str):
            suffixes = (suffixes,)

        self.path = Path(path).expanduser()  # Expand user to handle ~
        self.glob = glob
        self.suffixes = set(suffixes or [])
        self.show_progress = show_progress
        self.exclude = exclude

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern."""
        iterator = _make_iterator(
            length_func=self.count_matching_files, show_progress=self.show_progress
        )

        for path in iterator(self._yield_paths()):
            yield Blob.from_path(path)

    def _yield_paths(self) -> Iterable[Path]:
        """Yield paths that match the requested pattern.

        If ``self.path`` is a file it is yielded as-is. Otherwise the
        directory is globbed with ``self.glob``; entries matching any
        ``self.exclude`` pattern (checked with ``Path.match``), entries that
        are not files, and files whose suffix is not in a non-empty
        ``self.suffixes`` are skipped.
        """
        if self.path.is_file():
            # A direct file path bypasses glob, exclude and suffix filtering.
            yield self.path
            return

        for path in self.path.glob(self.glob):
            if self.exclude and any(
                path.match(pattern) for pattern in self.exclude
            ):
                continue
            if not path.is_file():
                continue
            if self.suffixes and path.suffix not in self.suffixes:
                continue
            yield path

    def count_matching_files(self) -> int:
        """Count files that match the pattern without loading them."""
        # Carry out a full iteration to count the files without
        # materializing anything expensive in memory.
        return sum(1 for _ in self._yield_paths())