# Source code for langchain_community.document_loaders.chromium
import asyncio
import logging
from typing import AsyncIterator, Iterator, List
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
logger = logging.getLogger(__name__)
class AsyncChromiumLoader(BaseLoader):
    """Scrape HTML pages from URLs using a headless instance of Chromium."""

    def __init__(self, urls: List[str], *, headless: bool = True):
        """Initialize the loader with a list of URL paths.

        Args:
            urls: A list of URLs to scrape content from.
            headless: Whether to run the browser in headless mode.

        Raises:
            ImportError: If the required 'playwright' package is not installed.
        """
        self.urls = urls
        self.headless = headless
        # Fail fast at construction time rather than on the first scrape.
        try:
            import playwright  # noqa: F401
        except ImportError:
            raise ImportError(
                "playwright is required for AsyncChromiumLoader. "
                "Please install it with `pip install playwright`."
            )

    async def ascrape_playwright(self, url: str) -> str:
        """Asynchronously scrape the content of a given URL using Playwright's
        async API.

        Args:
            url (str): The URL to scrape.

        Returns:
            str: The scraped HTML content, or an error message if an
                exception occurred.
        """
        # Imported lazily so the module can be imported without playwright.
        from playwright.async_api import async_playwright

        logger.info("Starting scraping...")
        results = ""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.headless)
            try:
                page = await browser.new_page()
                await page.goto(url)
                results = await page.content()  # Simply get the HTML content
                logger.info("Content scraped")
            except Exception as e:
                # Best-effort: surface the failure in the page content rather
                # than aborting the whole load.
                results = f"Error: {e}"
            finally:
                # Close in a finally so the browser process is released even
                # if the task is cancelled or a non-Exception error escapes.
                await browser.close()
        return results

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load text content from the provided URLs.

        This method yields Documents one at a time as they're scraped,
        instead of waiting for all URLs to be scraped before returning.

        Yields:
            Document: The scraped content encapsulated within a Document
                object, with its source URL as metadata.
        """
        for url in self.urls:
            # Each URL gets its own short-lived event loop; this keeps the
            # sync API usable outside any running loop.
            html_content = asyncio.run(self.ascrape_playwright(url))
            metadata = {"source": url}
            yield Document(page_content=html_content, metadata=metadata)

    async def alazy_load(self) -> AsyncIterator[Document]:
        """Asynchronously load text content from the provided URLs.

        This method launches the scraping of all provided URLs concurrently
        via asyncio.gather, improving performance through concurrent
        asynchronous requests. Documents are yielded once all results are
        available, paired with their source URLs.

        Yields:
            Document: A Document object containing the scraped content,
                along with its source URL as metadata.
        """
        tasks = [self.ascrape_playwright(url) for url in self.urls]
        results = await asyncio.gather(*tasks)
        for url, content in zip(self.urls, results):
            metadata = {"source": url}
            yield Document(page_content=content, metadata=metadata)