Source code for langchain_community.document_loaders.mediawikidump

import logging
from pathlib import Path
from typing import Iterator, Optional, Sequence, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class MWDumpLoader(BaseLoader):
    """Load a `MediaWiki` dump from an `XML` file.

    Example:
        .. code-block:: python

            from langchain_text_splitters import RecursiveCharacterTextSplitter
            from langchain_community.document_loaders import MWDumpLoader

            loader = MWDumpLoader(
                file_path="myWiki.xml",
                encoding="utf8"
            )
            docs = loader.load()
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000, chunk_overlap=0
            )
            texts = text_splitter.split_documents(docs)


    :param file_path: Path to the local XML dump file
    :type file_path: str
    :param encoding: Character encoding of the file, defaults to "utf8"
    :type encoding: str, optional
    :param namespaces: Namespaces of the pages to parse. See
        https://www.mediawiki.org/wiki/Help:Namespaces#Localisation
        for a list of all common namespaces. ``None`` or an empty sequence
        disables namespace filtering.
    :type namespaces: List[int], optional
    :param skip_redirects: True to skip pages that redirect to other pages,
        False to keep them. Defaults to False
    :type skip_redirects: bool, optional
    :param stop_on_error: False to skip pages that cause a parsing error,
        True to stop. Defaults to True
    :type stop_on_error: bool, optional
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        encoding: Optional[str] = "utf8",
        namespaces: Optional[Sequence[int]] = None,
        skip_redirects: Optional[bool] = False,
        stop_on_error: Optional[bool] = True,
    ):
        """Store the loader configuration; no file is opened here."""
        # Always keep the path as a plain string, whatever was passed in.
        self.file_path = file_path if isinstance(file_path, str) else str(file_path)
        self.encoding = encoding
        # Namespaces range from -2 to 15, inclusive.
        self.namespaces = namespaces
        self.skip_redirects = skip_redirects
        self.stop_on_error = stop_on_error

    @staticmethod
    def _import_mwxml():  # type: ignore[no-untyped-def]
        """Import and return the optional `mwxml` dependency.

        :raises ImportError: if `mwxml` is not installed.
        """
        try:
            import mwxml
        except ImportError as e:
            raise ImportError(
                "Unable to import 'mwxml'. Please install with"
                " `pip install mwxml`."
            ) from e
        return mwxml

    def _load_dump_file(self):  # type: ignore[no-untyped-def]
        """Open the dump file and return the parsed `mwxml` dump.

        Kept for backward compatibility. The file handle opened here is
        handed to `mwxml` and is never explicitly closed by this method,
        which is why `lazy_load` manages the file itself instead of
        calling this helper.

        :raises ImportError: if `mwxml` is not installed.
        """
        mwxml = self._import_mwxml()
        return mwxml.Dump.from_file(open(self.file_path, encoding=self.encoding))

    def _load_single_page_from_dump(self, page) -> Optional[Document]:  # type: ignore[no-untyped-def]
        """Parse a single page.

        Only the first revision produced by iterating ``page`` is used.

        :param page: Page object taken from the dump's ``pages``.
        :return: A document holding the stripped text of that revision with
            the page title as ``source`` metadata, or ``None`` when the page
            has no revisions.
        :raises ImportError: if `mwparserfromhell` is not installed.
        """
        try:
            import mwparserfromhell
        except ImportError as e:
            raise ImportError(
                "Unable to import 'mwparserfromhell'. Please install with"
                " `pip install mwparserfromhell`."
            ) from e
        for revision in page:
            code = mwparserfromhell.parse(revision.text)
            text = code.strip_code(
                normalize=True, collapse=True, keep_template_params=False
            )
            metadata = {"source": page.title}
            # Deliberately return inside the loop: one document per page.
            return Document(page_content=text, metadata=metadata)
        # The page yielded no revisions, so there is nothing to parse.
        return None

    def lazy_load(
        self,
    ) -> Iterator[Document]:
        """Lazily load documents from the file path.

        The dump file stays open only while this generator is being
        consumed and is closed when iteration finishes, is abandoned, or
        fails.

        :return: An iterator with one document per retained page. Pages are
            dropped when they are redirects (if ``skip_redirects``), fall
            outside ``namespaces``, have no revisions, or fail to parse
            (if ``stop_on_error`` is falsy).
        :raises ImportError: if `mwxml` or `mwparserfromhell` is missing.
        :raises Exception: any parsing error, when ``stop_on_error`` is
            truthy.
        """
        # Resolve the dependency before touching the file system.
        mwxml = self._import_mwxml()
        with open(self.file_path, encoding=self.encoding) as dump_file:
            dump = mwxml.Dump.from_file(dump_file)
            for page in dump.pages:
                if self.skip_redirects and page.redirect:
                    continue
                if self.namespaces and page.namespace not in self.namespaces:
                    continue
                try:
                    document = self._load_single_page_from_dump(page)
                except Exception as e:
                    logger.error("Parsing error: {}".format(e))
                    if self.stop_on_error:
                        raise
                    continue
                if document is None:
                    # Never hand ``None`` to callers expecting a Document.
                    continue
                yield document