Source code for langchain_text_splitters.spacy
from __future__ import annotations

from typing import Any, List

from langchain_text_splitters.base import TextSplitter


class SpacyTextSplitter(TextSplitter):
    """Splitting text using the Spacy package.

    By default, Spacy's `en_core_web_sm` model is used, with a default
    max_length of 1000000 (the maximum number of characters the model can
    process, which can be increased for large files). For faster but
    potentially less accurate splitting, you can use `pipeline='sentencizer'`.
    """

    def __init__(
        self,
        separator: str = "\n\n",
        pipeline: str = "en_core_web_sm",
        max_length: int = 1_000_000,
        **kwargs: Any,
    ) -> None:
        """Initialize the spacy text splitter."""
        super().__init__(**kwargs)
        self._tokenizer = _make_spacy_pipeline_for_splitting(
            pipeline, max_length=max_length
        )
        self._separator = separator

    def split_text(self, text: str) -> List[str]:
        """Split the incoming text and return chunks."""
        splits = (s.text for s in self._tokenizer(text).sents)
        return self._merge_splits(splits, self._separator)


def _make_spacy_pipeline_for_splitting(
    pipeline: str, *, max_length: int = 1_000_000
) -> Any:  # avoid importing spacy
    try:
        import spacy
    except ImportError:
        raise ImportError(
            "Spacy is not installed, please install it with `pip install spacy`."
        )
    if pipeline == "sentencizer":
        # Lightweight rule-based segmentation: no trained model is loaded,
        # only the English tokenizer plus a sentencizer component.
        from spacy.lang.en import English

        sentencizer: Any = English()
        sentencizer.add_pipe("sentencizer")
    else:
        # Load the named trained pipeline, excluding components that are not
        # needed for sentence boundary detection.
        sentencizer = spacy.load(pipeline, exclude=["ner", "tagger"])
        sentencizer.max_length = max_length
    return sentencizer
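

# Usage sketch (illustrative; not part of the module source). Assumes spacy is
# installed and the `en_core_web_sm` model has been downloaded, e.g. via
# `python -m spacy download en_core_web_sm`. The `chunk_size` and
# `chunk_overlap` parameters are inherited from the TextSplitter base class.
if __name__ == "__main__":
    sample = (
        "Spacy finds sentence boundaries with a trained model. "
        "Each sentence becomes a candidate split.\n\n"
        "Chunks are then merged back up to `chunk_size` characters."
    )

    # Default: the full `en_core_web_sm` pipeline, more accurate boundaries.
    splitter = SpacyTextSplitter(chunk_size=100, chunk_overlap=0)
    print(splitter.split_text(sample))

    # Faster, rule-based alternative that avoids loading a trained model.
    fast_splitter = SpacyTextSplitter(pipeline="sentencizer")
    print(fast_splitter.split_text(sample))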