# langchain_community.document_loaders.text

import logging
from pathlib import Path
from typing import Iterator, Optional, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.helpers import detect_file_encodings

logger = logging.getLogger(__name__)


class TextLoader(BaseLoader):
    """Load a text file into a single ``Document``.

    Args:
        file_path: Path to the file to load.
        encoding: File encoding to use. If ``None``, the default system
            encoding is used.
        autodetect_encoding: Whether to try to detect the file encoding
            automatically if decoding with the specified encoding fails.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        encoding: Optional[str] = None,
        autodetect_encoding: bool = False,
    ):
        """Initialize with the file path and the encoding options."""
        self.file_path = file_path
        self.encoding = encoding
        self.autodetect_encoding = autodetect_encoding

    def lazy_load(self) -> Iterator[Document]:
        """Load the file at ``self.file_path``.

        The file is first read with ``self.encoding``. If that raises a
        ``UnicodeDecodeError`` and ``autodetect_encoding`` is enabled, each
        candidate returned by ``detect_file_encodings`` is tried in order and
        the first one that decodes the file is used.

        Yields:
            A single ``Document`` whose ``page_content`` is the whole file
            text and whose metadata stores ``str(self.file_path)`` under the
            ``"source"`` key.

        Raises:
            RuntimeError: If the initial read fails for any reason other than
                a decoding error, if decoding fails and autodetection is
                disabled, or if autodetection is enabled but none of the
                detected encodings can decode the file.
        """
        text = ""
        try:
            with open(self.file_path, encoding=self.encoding) as f:
                text = f.read()
        except UnicodeDecodeError as e:
            if not self.autodetect_encoding:
                raise RuntimeError(f"Error loading {self.file_path}") from e
            for candidate in detect_file_encodings(self.file_path):
                logger.debug("Trying encoding: %s", candidate.encoding)
                try:
                    with open(self.file_path, encoding=candidate.encoding) as f:
                        text = f.read()
                    break
                except UnicodeDecodeError:
                    continue
            else:
                # No candidate decoded the file: fail loudly instead of
                # silently yielding a Document with empty content.
                raise RuntimeError(f"Error loading {self.file_path}") from e
        except Exception as e:
            raise RuntimeError(f"Error loading {self.file_path}") from e

        metadata = {"source": str(self.file_path)}
        yield Document(page_content=text, metadata=metadata)
# Source code for langchain_community.document_loaders.text