Source code for langchain_community.document_transformers.markdownify

import re
from typing import Any, List, Optional, Sequence, Union

from langchain_core.documents import BaseDocumentTransformer, Document


class MarkdownifyTransformer(BaseDocumentTransformer):
    """Convert HTML documents to Markdown using the markdownify library.

    Offers customizable options for handling links, images, other tags and
    heading styles.

    Arguments:
        strip: A list of tags to strip. This option can't be used with the
            convert option.
        convert: A list of tags to convert. This option can't be used with the
            strip option.
        autolinks: A boolean indicating whether the "automatic link" style
            should be used when an ``a`` tag's contents match its href.
            Defaults to True.
        heading_style: Defines how headings should be converted. Accepted
            values are ATX, ATX_CLOSED, SETEXT, and UNDERLINED (which is an
            alias for SETEXT). Defaults to ATX.
        **kwargs: Additional options to pass to markdownify.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import MarkdownifyTransformer

            markdownify = MarkdownifyTransformer()
            docs_transform = markdownify.transform_documents(docs)

    More configuration options can be found at the markdownify GitHub page:
    https://github.com/matthewwithanm/python-markdownify
    """  # noqa: E501

    def __init__(
        self,
        strip: Optional[Union[str, List[str]]] = None,
        convert: Optional[Union[str, List[str]]] = None,
        autolinks: bool = True,
        heading_style: str = "ATX",
        **kwargs: Any,
    ) -> None:
        """Store the conversion options used by ``transform_documents``.

        A single tag name given as a string for ``strip`` or ``convert`` is
        wrapped into a one-element list; lists and ``None`` are kept as-is.
        Extra keyword arguments are kept and forwarded to markdownify.
        """
        self.strip = [strip] if isinstance(strip, str) else strip
        self.convert = [convert] if isinstance(convert, str) else convert
        self.autolinks = autolinks
        self.heading_style = heading_style
        self.additional_options = kwargs

    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Convert the HTML ``page_content`` of each document to Markdown.

        Args:
            documents: Documents whose ``page_content`` holds HTML.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A list of new documents holding the cleaned Markdown text, each
            built with the ``metadata`` of its source document.

        Raises:
            ImportError: If the ``markdownify`` package is not installed.
        """
        try:
            from markdownify import markdownify
        except ImportError as e:
            # Chain the original error so the real import failure stays
            # visible in the traceback.
            raise ImportError(
                "markdownify package not found, please install it with "
                "`pip install markdownify`"
            ) from e

        # Compiled once, outside the loop: matches a blank-line gap that may
        # contain stray whitespace or several consecutive empty lines.
        blank_gap = re.compile(r"\n\s*\n")

        converted_documents = []
        for doc in documents:
            markdown_content = markdownify(
                html=doc.page_content,
                strip=self.strip,
                convert=self.convert,
                autolinks=self.autolinks,
                heading_style=self.heading_style,
                **self.additional_options,
            )
            # Turn non-breaking spaces into plain spaces and trim the ends.
            markdown_content = markdown_content.replace("\xa0", " ").strip()
            # Collapse every blank-line gap to exactly one empty line.
            cleaned_markdown = blank_gap.sub("\n\n", markdown_content)
            converted_documents.append(
                Document(page_content=cleaned_markdown, metadata=doc.metadata)
            )

        return converted_documents

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronous transformation is not implemented for this class.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError