Source code for langchain_text_splitters.spacy

from __future__ import annotations

from typing import Any, List

from langchain_text_splitters.base import TextSplitter


class SpacyTextSplitter(TextSplitter):
    """Splitting text using the spaCy package.

    By default, spaCy's `en_core_web_sm` model is used, with a default
    `max_length` of 1_000_000 (the maximum number of characters the model
    can process; increase it for large files). For faster but potentially
    less accurate splitting, use `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        **kwargs: Any,
    ) -> None:
        """Initialize the spaCy text splitter."""
        super().__init__(**kwargs)
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split incoming text and return chunks."""
        splits = (s.text for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)
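The splitter itself is a thin wrapper around a spaCy pipeline: each detected sentence becomes a candidate split, and `_merge_splits` (inherited from `TextSplitter`) packs those sentences into chunks joined by the separator. A minimal usage sketch, assuming `langchain-text-splitters` and `spacy` are installed; the `sentencizer` pipeline avoids a model download, and the chunk sizes below are purely illustrative:

from langchain_text_splitters import SpacyTextSplitter

# Rule-based sentencizer: fast, no statistical model download required.
splitter = SpacyTextSplitter(pipeline="sentencizer", chunk_size=100, chunk_overlap=0)

text = (
    "SpaCy detects the sentence boundaries. Each sentence becomes a candidate "
    "split. The splitter then merges sentences into chunks of roughly "
    "chunk_size characters, joined by the separator."
)
for chunk in splitter.split_text(text):
    print(repr(chunk))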
def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy at module load time
    try:
        import spacy
    except ImportError:
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        )
    if pipeline == "sentencizer":
        # Rule-based sentence boundary detection; no statistical model needed.
        from spacy.lang.en import English

        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        # Load the requested model, excluding components not needed for
        # sentence segmentation.
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
        sentencizer.max_length = max_length
    return sentencizer
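For reference, a sketch of what the `sentencizer` branch above builds, assuming only `spacy` itself is installed (no model download): a blank English pipeline with the rule-based `sentencizer` component, which is what supplies the `.sents` iterator consumed in `split_text`.

from spacy.lang.en import English

nlp = English()              # blank pipeline, no statistical model
nlp.add_pipe("sentencizer")  # rule-based sentence boundary detection
doc = nlp("First sentence. Second one follows.")
print([sent.text for sent in doc.sents])
# ['First sentence.', 'Second one follows.']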