Source code for langchain_community.document_transformers.markdownify
import re
from typing import Any, List, Optional, Sequence, Union
from langchain_core.documents import BaseDocumentTransformer, Document
class MarkdownifyTransformer(BaseDocumentTransformer):
    """Convert HTML documents to Markdown using the markdownify library.

    Offers customizable options for handling links, images, other tags and
    heading styles.

    Args:
        strip: A tag name or list of tag names to strip. This option cannot
            be used together with ``convert``.
        convert: A tag name or list of tag names to convert. This option
            cannot be used together with ``strip``.
        autolinks: Whether the "automatic link" style should be used when an
            ``a`` tag's contents match its ``href``. Defaults to True.
        heading_style: How headings should be converted. Accepted values are
            ATX, ATX_CLOSED, SETEXT and UNDERLINED (an alias for SETEXT).
            Defaults to ATX.
        **kwargs: Additional options forwarded to markdownify.

    Example:
        .. code-block:: python

            from langchain_community.document_transformers import MarkdownifyTransformer

            markdownify = MarkdownifyTransformer()
            docs_transform = markdownify.transform_documents(docs)

    More configuration options can be found on the markdownify GitHub page:
    https://github.com/matthewwithanm/python-markdownify
    """  # noqa: E501

    def __init__(
        self,
        strip: Optional[Union[str, List[str]]] = None,
        convert: Optional[Union[str, List[str]]] = None,
        autolinks: bool = True,
        heading_style: str = "ATX",
        **kwargs: Any,
    ) -> None:
        """Store the conversion options used by ``transform_documents``.

        A bare string given for ``strip`` or ``convert`` is wrapped in a
        one-element list; lists and ``None`` are stored unchanged.
        """
        self.strip = [strip] if isinstance(strip, str) else strip
        self.convert = [convert] if isinstance(convert, str) else convert
        self.autolinks = autolinks
        self.heading_style = heading_style
        self.additional_options = kwargs

    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Convert the HTML ``page_content`` of each document to Markdown.

        Args:
            documents: Documents whose ``page_content`` holds HTML.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A list of new documents, in input order, holding the cleaned
            Markdown text and a shallow copy of the source metadata.

        Raises:
            ImportError: If the ``markdownify`` package is not installed.
        """
        try:
            from markdownify import markdownify
        except ImportError as e:
            # Single-line message (no embedded newline/indentation), chained
            # to the original failure so the root cause stays visible.
            raise ImportError(
                "markdownify package not found, please "
                "install it with `pip install markdownify`"
            ) from e

        # Loop-invariant: matches a newline, optional whitespace, and another
        # newline, i.e. any run of blank or whitespace-only lines.
        blank_lines = re.compile(r"\n\s*\n")

        converted_documents = []
        for doc in documents:
            markdown_content = (
                markdownify(
                    html=doc.page_content,
                    strip=self.strip,
                    convert=self.convert,
                    autolinks=self.autolinks,
                    heading_style=self.heading_style,
                    **self.additional_options,
                )
                # Replace non-breaking spaces with regular spaces.
                .replace("\xa0", " ")
                .strip()
            )
            # Collapse each run of blank lines to a single blank line.
            cleaned_markdown = blank_lines.sub("\n\n", markdown_content)
            converted_documents.append(
                Document(
                    page_content=cleaned_markdown,
                    # Shallow copy so the output document does not share
                    # (and cannot accidentally mutate) the input's metadata.
                    metadata={**doc.metadata},
                )
            )
        return converted_documents

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """Asynchronous transformation is not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError