Source code for langchain_community.document_loaders.blob_loaders.file_system
"""Load blobs from the local file system."""
from pathlib import Path
from typing import Callable, Iterable, Iterator, Optional, Sequence, TypeVar, Union
from langchain_community.document_loaders.blob_loaders.schema import Blob, BlobLoader
# Generic item type carried through the iterator wrapper unchanged.
T = TypeVar("T")


def _make_iterator(
    length_func: Callable[[], int], show_progress: bool = False
) -> Callable[[Iterable[T]], Iterator[T]]:
    """Create a function that optionally wraps an iterable in a tqdm progress bar.

    Args:
        length_func: Zero-argument callable returning the total number of
            items. It is only called when ``show_progress`` is True, at the
            moment the returned wrapper is applied to an iterable.
        show_progress: If True, the returned function wraps its argument in
            ``tqdm``; otherwise the built-in ``iter`` is returned.

    Returns:
        A callable that turns an iterable into an iterator over the same items.

    Raises:
        ImportError: If ``show_progress`` is True and ``tqdm`` cannot be
            imported.
    """
    if not show_progress:
        return iter

    try:
        from tqdm.auto import tqdm
    except ImportError as e:
        # Keep the original failure attached as the cause so the real import
        # problem (missing package vs. broken install) stays visible.
        raise ImportError(
            "You must install tqdm to use show_progress=True. "
            "You can install tqdm with `pip install tqdm`."
        ) from e

    def _with_tqdm(iterable: Iterable[T]) -> Iterator[T]:
        """Wrap an iterable in a tqdm progress bar."""
        # Provide `total` so that tqdm can show a progress bar that takes
        # into account the total number of items.
        return tqdm(iterable, total=length_func())

    return _with_tqdm
# PUBLIC API
class FileSystemBlobLoader(BlobLoader):
    """Load blobs in the local file system.

    Example:

    .. code-block:: python

        from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader
        loader = FileSystemBlobLoader("/path/to/directory")
        for blob in loader.yield_blobs():
            print(blob)  # noqa: T201
    """  # noqa: E501

    def __init__(
        self,
        path: Union[str, Path],
        *,
        glob: str = "**/[!.]*",
        exclude: Sequence[str] = (),
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
    ) -> None:
        """Initialize with a path to a directory and how to glob over it.

        Args:
            path: Path to a directory to load from, or path to a single file.
                If a path to a file is provided, glob/exclude/suffixes are
                ignored.
            glob: Glob pattern relative to the specified path. The default
                recursively picks up entries whose name does not start with
                a dot.
            exclude: Patterns to exclude from results, in glob syntax. A
                single string is treated as one pattern.
            suffixes: If provided, keep only files with these suffixes. Useful
                when wanting to keep files with different suffixes. Suffixes
                must include the dot, e.g. ".txt". A single string is treated
                as one suffix.
            show_progress: If True, show a progress bar while loading. This
                forces an iteration through all matching files to count them
                before any are loaded.

        Raises:
            TypeError: If ``path`` is neither a ``str`` nor a ``Path``.

        Examples:

        .. code-block:: python

            from langchain_community.document_loaders.blob_loaders import FileSystemBlobLoader

            # Load a single file.
            loader = FileSystemBlobLoader("/path/to/file.txt")

            # Recursively load all text files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/*.txt")

            # Recursively load all non-hidden files in a directory.
            loader = FileSystemBlobLoader("/path/to/directory", glob="**/[!.]*")

            # Load all files in a directory without recursion.
            loader = FileSystemBlobLoader("/path/to/directory", glob="*")

            # Recursively load all files in a directory, except for py or pyc files.
            loader = FileSystemBlobLoader(
                "/path/to/directory",
                glob="**/*",
                exclude=["**/*.py", "**/*.pyc"]
            )
        """  # noqa: E501
        if isinstance(path, Path):
            _path = path
        elif isinstance(path, str):
            _path = Path(path)
        else:
            raise TypeError(f"Expected str or Path, got {type(path)}")

        # A bare string satisfies Sequence[str] but would be consumed one
        # character at a time: set(".txt") -> {".", "t", "x"}, and an exclude
        # of "*.py" would match every path through its "*" character. Treat
        # a lone string as a single pattern / suffix instead.
        if isinstance(suffixes, str):
            suffixes = (suffixes,)
        if isinstance(exclude, str):
            exclude = (exclude,)

        self.path = _path.expanduser()  # Expand user to handle ~
        self.glob = glob
        self.suffixes = set(suffixes or [])
        self.show_progress = show_progress
        self.exclude = exclude

    def yield_blobs(
        self,
    ) -> Iterable[Blob]:
        """Yield blobs that match the requested pattern."""
        iterator = _make_iterator(
            length_func=self.count_matching_files, show_progress=self.show_progress
        )

        for path in iterator(self._yield_paths()):
            yield Blob.from_path(path)

    def _yield_paths(self) -> Iterable[Path]:
        """Yield paths that match the requested pattern.

        When ``self.path`` is a file it is yielded as-is and the glob,
        exclude and suffix filters are not consulted.
        """
        if self.path.is_file():
            yield self.path
            return

        for candidate in self.path.glob(self.glob):
            # Exclusion is checked first, before touching the file system
            # again via is_file().
            if self.exclude and any(
                candidate.match(pattern) for pattern in self.exclude
            ):
                continue
            # The glob may also return directories; only files are yielded.
            if not candidate.is_file():
                continue
            if self.suffixes and candidate.suffix not in self.suffixes:
                continue
            yield candidate

    def count_matching_files(self) -> int:
        """Count files that match the pattern without loading them."""
        # Carry out a full iteration to count the files without
        # materializing anything expensive in memory.
        return sum(1 for _ in self._yield_paths())