Source code for langchain_text_splitters.spacy
from __future__ import annotations

from typing import Any, List

from langchain_text_splitters.base import TextSplitter


class SpacyTextSplitter(TextSplitter):
    """Splitting text using the Spacy package.

    By default, Spacy's `en_core_web_sm` model is used, with a default
    max_length of 1000000 (the maximum number of characters the model can
    process, which can be increased for large files). For faster but
    potentially less accurate splitting, you can use `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        **kwargs: Any,
    ) -> None:
        """Initialize the spacy text splitter."""
        super().__init__(**kwargs)
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split the incoming text and return chunks."""
        splits = (s.text for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)


def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy
    try:
        import spacy
    except ImportError:
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        )
    if pipeline == "sentencizer":
        # Lightweight rule-based segmentation: no trained model is loaded,
        # only the English tokenizer plus a sentencizer component.
        from spacy.lang.en import English

        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        # Load the named trained pipeline, excluding components that are not
        # needed for sentence boundary detection.
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
        sentencizer.max_length = max_length
    return sentencizer
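

# Usage sketch (illustrative; not part of the module source). Assumes spacy is
# installed and the `en_core_web_sm` model has been downloaded, e.g. via
# `python -m spacy download en_core_web_sm`. The `chunk_size` and
# `chunk_overlap` parameters are inherited from the TextSplitter base class.
if __name__ == "__main__":
    sample = (
        "Spacy finds sentence boundaries with a trained model. "
        "Each sentence becomes a candidate split.\n\n"
        "Chunks are then merged back up to `chunk_size` characters."
    )

    # Default: the full `en_core_web_sm` pipeline, more accurate boundaries.
    splitter = SpacyTextSplitter(chunk_size=100, chunk_overlap=0)
    print(splitter.split_text(sample))

    # Faster, rule-based alternative that avoids loading a trained model.
    fast_splitter = SpacyTextSplitter(pipeline="sentencizer")
    print(fast_splitter.split_text(sample))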