Source code for langchain_community.document_loaders.blob_loaders.file_system

"""用于从本地文件系统加载blob。"""

from pathlib import Path
from typing import Callable, Iterable, Iterator, Optional, Sequence, TypeVar, Union

from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader

T = TypeVar("T")


def _make_iterator(
    length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
    """Create a function that optionally wraps an iterable in a tqdm progress bar.

    Args:
        length_func: Zero-argument callable returning the total number of
            items. It is only invoked when ``show_progress`` is True, once
            per call of the returned wrapper.
        show_progress: If True, the returned callable wraps its argument in
            ``tqdm``; otherwise the built-in ``iter`` is returned.

    Returns:
        A callable that converts an iterable into an iterator.

    Raises:
        ImportError: If ``show_progress`` is True and ``tqdm`` cannot be
            imported.
    """
    if not show_progress:
        return iter

    try:
        from tqdm.auto import tqdm
    except ImportError as e:
        # The two literals are concatenated, so the first one needs a
        # trailing space to keep the sentences apart in the final message.
        raise ImportError(
            "You must install tqdm to use show_progress=True. "
            "You can install tqdm with `pip install tqdm`."
        ) from e

    # Make sure to provide `total` here so that tqdm can show
    # a progress bar that takes into account the total number of files.
    def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
        """Wrap an iterable in a tqdm progress bar."""
        return tqdm(iterable, total=length_func())

    return _with_tqdm


# PUBLIC API


class FileSystemBlobLoader(BlobLoader):
    """Load blobs from the local file system.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)  # noqa: T201
    """  # noqa: E501

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        exclude: Sequence[str] = (),
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
    ) -> None:
        """Initialize with a path to a directory (or file) and how to glob over it.

        Args:
            path: Path to a directory to load from, or path to a single file.
                If a file path is given, glob/exclude/suffixes are ignored.
            glob: Glob pattern relative to the given path. The default
                selects all non-hidden files.
            exclude: Patterns to exclude from the results, in glob syntax.
            suffixes: If provided, only files with one of these suffixes are
                kept. Useful for keeping files with several different
                suffixes. Suffixes must include the dot, e.g. ".txt".
            show_progress: If True, show a progress bar while loading files.
                This forces an iteration over all matching files to count
                them before any file is loaded.

        Raises:
            TypeError: If ``path`` is neither a ``str`` nor a ``Path``.

        Examples:

        .. code-block:: python

            from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader

            # Load a single file.
            loader = FileSystemBlobLoader("/path/to/file.txt")

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")

            # Recursively load all files in a directory, except for py or pyc files.
            loader = FileSystemBlobLoader(
                "/path/to/directory",
                glob="**/*.txt",
                exclude=["**/*.py", "**/*.pyc"]
            )
        """  # noqa: E501
        # Reject unsupported types up front; only str and Path are accepted.
        if not isinstance(path, (Path, str)):
            raise TypeError(f"Expected str or Path, got {type(path)}")

        # Keep an existing Path object as-is; only strings are converted.
        root = path if isinstance(path, Path) else Path(path)

        self.path = root.expanduser()  # Expand user to handle ~
        self.glob = glob
        self.exclude = exclude
        self.suffixes = set(suffixes) if suffixes else set()
        self.show_progress = show_progress

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern."""
        wrap = _make_iterator(
            length_func=self.count_matching_files, show_progress=self.show_progress
        )
        for matched in wrap(self._yield_paths()):
            yield Blob.from_path(matched)

    def _yield_paths(self) -> Iterable[Path]:
        """Yield paths that match the requested pattern."""
        # A single file short-circuits all glob/exclude/suffix filtering.
        if self.path.is_file():
            yield self.path
            return

        for candidate in self.path.glob(self.glob):
            if self.exclude and any(
                candidate.match(pattern) for pattern in self.exclude
            ):
                continue
            if not candidate.is_file():
                continue
            # An empty suffix set means "no suffix filtering".
            if self.suffixes and candidate.suffix not in self.suffixes:
                continue
            yield candidate

    def count_matching_files(self) -> int:
        """Count files that match the pattern without loading them."""
        # Carry out a full iteration to count the files without
        # materializing anything expensive in memory.
        return sum(1 for _ in self._yield_paths())