Source code for langchain_community.document_loaders.helpers
"""文档加载器助手。"""
import concurrent.futures
from pathlib import Path
from typing import List, NamedTuple, Optional, Union, cast
class FileEncoding(NamedTuple):
    """A file encoding, represented as a NamedTuple.

    Attributes:
        encoding: The encoding of the file, or ``None``.
        confidence: The confidence of the encoding.
        language: The language of the file, or ``None``.
    """

    # The encoding of the file.
    encoding: Optional[str]
    # The confidence of the encoding.
    confidence: float
    # The language of the file.
    language: Optional[str]
def detect_file_encodings(
    file_path: Union[str, Path], timeout: int = 5
) -> List[FileEncoding]:
    """Try to detect the encoding of a file.

    The file is read in binary mode and passed to ``chardet.detect_all`` on a
    worker thread, so the caller waits at most ``timeout`` seconds.

    Args:
        file_path: The path of the file whose encoding should be detected.
        timeout: The timeout, in seconds, for the encoding detection.

    Returns:
        A list of ``FileEncoding`` tuples, one per candidate whose
        ``encoding`` is not ``None``, in the order chardet returned them
        (expected to be by descending confidence).

    Raises:
        TimeoutError: If detection does not finish within ``timeout`` seconds.
        RuntimeError: If no candidate has a non-``None`` encoding.
    """
    # Imported here rather than at module level, so chardet is only required
    # when this function is actually called.
    import chardet

    file_path = str(file_path)

    def read_and_detect(file_path: str) -> List[dict]:
        """Read the whole file as bytes and run chardet on its contents."""
        with open(file_path, "rb") as f:
            rawdata = f.read()
        return cast(List[dict], chardet.detect_all(rawdata))

    # The executor is deliberately NOT used as a context manager: leaving a
    # ``with`` block calls ``shutdown(wait=True)``, which would block until the
    # worker finishes and so defeat the timeout below.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(read_and_detect, file_path)
        try:
            encodings = future.result(timeout=timeout)
        except concurrent.futures.TimeoutError as err:
            raise TimeoutError(
                f"Timeout reached while detecting encoding for {file_path}"
            ) from err
    finally:
        # Do not wait for a still-running detection; a running task cannot be
        # cancelled, so it is left to finish in the background.
        executor.shutdown(wait=False)

    if all(encoding["encoding"] is None for encoding in encodings):
        raise RuntimeError(f"Could not detect encoding for {file_path}")
    return [FileEncoding(**enc) for enc in encodings if enc["encoding"] is not None]