Source code for langchain_community.document_loaders.chromium

import asyncio
import logging
from typing import AsyncIterator, Iterator, List

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML pages from URLs using a headless instance of Chromium."""

    def __init__(self, urls: List[str], *, headless: bool = True):
        """Initialize the loader with a list of URL paths.

        Args:
            urls: A list of URLs from which to extract content.
            headless: Whether to run the browser in headless mode.

        Raises:
            ImportError: If the required "playwright" package is not installed.
        """
        self.urls = urls
        self.headless = headless

        try:
            import playwright  # noqa: F401
        except ImportError:
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            )

    async def ascrape_playwright(self, url: str) -> str:
        """Asynchronously scrape the content of a given URL using Playwright's
        async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content, or an error message if an exception
            occurs.
        """
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                results = f"Error: {e}"
            await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they are scraped,
        instead of waiting for all URLs to be scraped before returning.

        Yields:
            Document: The scraped content encapsulated within a Document
            object.
        """
        for url in self.urls:
            # Run the async scraper synchronously, one URL at a time.
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Asynchronously load text content from the provided URLs.

        This method uses asyncio to launch the scraping of all provided URLs
        concurrently, improving performance over scraping them one at a time.
        Because the results are collected with ``asyncio.gather``, Documents
        are yielded once all scrapes have completed.

        Yields:
            Document: A Document object containing the scraped content, with
            its source URL as metadata.
        """
        tasks = [self.ascrape_playwright(url) for url in self.urls]
        results = await asyncio.gather(*tasks)
        for url, content in zip(self.urls, results):
            metadata = {"source": url}
            yield Document(page_content=content, metadata=metadata)
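

# Usage sketch (illustrative; not part of the library source). It assumes
# Playwright and a Chromium browser are installed (`pip install playwright`
# followed by `playwright install chromium`) and uses a placeholder URL.
if __name__ == "__main__":
    loader = AsyncChromiumLoader(["https://example.com"], headless=True)

    # Synchronous path: lazy_load() scrapes and yields one URL at a time.
    for doc in loader.lazy_load():
        print(doc.metadata["source"], len(doc.page_content))

    # Asynchronous path: alazy_load() scrapes all URLs concurrently.
    async def _demo() -> None:
        async for doc in loader.alazy_load():
            print(doc.metadata["source"], len(doc.page_content))

    asyncio.run(_demo())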