Source code for langchain_community.document_loaders.spider

from typing import Iterator, Literal, Optional

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.utils import get_from_env


class SpiderLoader(BaseLoader):
    """Load web pages as Documents using Spider AI.

    Must have the Python package `spider-client` installed and a Spider API key.
    See https://spider.cloud for more.
    """
    def __init__(
        self,
        url: str,
        *,
        api_key: Optional[str] = None,
        mode: Literal["scrape", "crawl"] = "scrape",
        params: Optional[dict] = None,
    ):
        """Initialize with API key and URL.

        Args:
            url: The URL to be processed.
            api_key: The Spider API key. If not specified, will be read from
                the env var `SPIDER_API_KEY`.
            mode: The mode to run the loader in. Default is "scrape".
                Options include "scrape" (single page) and "crawl" (follow and
                crawl subpages afterwards).
            params: Additional parameters for the Spider API.
        """
        if params is None:
            params = {
                "return_format": "markdown",
                "metadata": True,
            }  # Using the metadata param slightly slows down the output

        try:
            from spider import Spider
        except ImportError:
            raise ImportError(
                "`spider` package not found, please run `pip install spider-client`"
            )
        if mode not in ("scrape", "crawl"):
            raise ValueError(
                f"Unrecognized mode '{mode}'. Expected one of 'scrape', 'crawl'."
            )
        # Use the environment variable if the API key isn't provided
        api_key = api_key or get_from_env("api_key", "SPIDER_API_KEY")
        self.spider = Spider(api_key=api_key)
        self.url = url
        self.mode = mode
        self.params = params
    def lazy_load(self) -> Iterator[Document]:
        """Load documents based on the specified mode."""
        spider_docs = []

        if self.mode == "scrape":
            # Scrape a single page
            response = self.spider.scrape_url(self.url, params=self.params)
            if response:
                spider_docs.append(response)
        elif self.mode == "crawl":
            # Crawl multiple pages
            response = self.spider.crawl_url(self.url, params=self.params)
            if response:
                spider_docs.extend(response)

        for doc in spider_docs:
            if self.mode == "scrape":
                # Ensure page_content is also not None
                page_content = doc[0].get("content", "")

                # Ensure metadata is also not None
                metadata = doc[0].get("metadata", {})

                if page_content is not None:
                    yield Document(page_content=page_content, metadata=metadata)
            if self.mode == "crawl":
                # Ensure page_content is also not None
                page_content = doc.get("content", "")

                # Ensure metadata is also not None
                metadata = doc.get("metadata", {})

                if page_content is not None:
                    yield Document(
                        page_content=page_content,
                        metadata=metadata,
                    )
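
A minimal usage sketch of this loader, assuming `spider-client` is installed and `SPIDER_API_KEY` is set in the environment; the target URL and parameter values are illustrative:

# Usage sketch (assumes a valid SPIDER_API_KEY in the environment).
from langchain_community.document_loaders import SpiderLoader

loader = SpiderLoader(
    url="https://spider.cloud",
    mode="scrape",  # or "crawl" to follow subpages
    params={"return_format": "markdown", "metadata": True},
)

# lazy_load yields Documents one at a time; load() would collect them eagerly.
for doc in loader.lazy_load():
    print(doc.metadata)
    print(doc.page_content[:200])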