Source code for langchain_core.documents.transformers
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Sequence
from langchain_core.runnables.config import run_in_executor
if TYPE_CHECKING:
from langchain_core.documents import Document
class BaseDocumentTransformer(ABC):
    """Abstract base class for document transformation systems.

    A document transformation system takes a sequence of Documents and
    returns a sequence of transformed Documents.

    Example:
        .. code-block:: python

            class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
                embeddings: Embeddings
                similarity_fn: Callable = cosine_similarity
                similarity_threshold: float = 0.95

                class Config:
                    arbitrary_types_allowed = True

                def transform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    stateful_documents = get_stateful_documents(documents)
                    embedded_documents = _get_embeddings_from_stateful_docs(
                        self.embeddings, stateful_documents
                    )
                    included_idxs = _filter_similar_embeddings(
                        embedded_documents, self.similarity_fn, self.similarity_threshold
                    )
                    return [stateful_documents[i] for i in sorted(included_idxs)]

                async def atransform_documents(
                    self, documents: Sequence[Document], **kwargs: Any
                ) -> Sequence[Document]:
                    raise NotImplementedError

    """  # noqa: E501

    @abstractmethod
    def transform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Transform a list of documents.

        Subclasses must implement this method; the base class cannot be
        instantiated without it.

        Args:
            documents: A sequence of Documents to be transformed.
            **kwargs: Extra keyword arguments, interpreted by the concrete
                implementation.

        Returns:
            A list of transformed Documents.
        """

    async def atransform_documents(
        self, documents: Sequence[Document], **kwargs: Any
    ) -> Sequence[Document]:
        """Asynchronously transform a list of documents.

        The default implementation delegates to the synchronous
        ``transform_documents`` through ``run_in_executor``.

        Args:
            documents: A sequence of Documents to be transformed.
            **kwargs: Extra keyword arguments, forwarded unchanged to
                ``transform_documents``.

        Returns:
            A list of transformed Documents.
        """
        # NOTE(review): ``None`` is passed as the first argument of
        # run_in_executor; presumably this selects the default executor --
        # confirm against langchain_core.runnables.config.
        return await run_in_executor(
            None, self.transform_documents, documents, **kwargs
        )