Source code for langchain_community.document_loaders.browserless

from typing import Iterator, List, Union

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class BrowserlessLoader(BaseLoader):
    """Load web pages through the hosted `Browserless` HTTP API.

    With ``text_content=True`` (the default) every URL is posted to the
    ``/scrape`` endpoint with a ``body`` selector, and the text of the first
    result becomes the document content. With ``text_content=False`` the URL
    is posted to the ``/content`` endpoint and the response text is stored
    unmodified.
    """

    def __init__(
        self, api_token: str, urls: Union[str, List[str]], text_content: bool = True
    ):
        """Initialize with the API token and the URL(s) to scrape.

        Args:
            api_token: Browserless API token, sent as the ``token`` query
                parameter on every request.
            urls: A single URL or a list of URLs to scrape.
            text_content: If True, extract the body text via ``/scrape``;
                if False, keep the response text of ``/content`` as-is.
        """
        self.api_token = api_token
        self.urls = urls
        self.text_content = text_content

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load one Document per configured URL.

        Yields:
            A Document whose ``page_content`` is the scraped text (or the
            ``/content`` response text) and whose ``metadata["source"]`` is
            the URL that was requested.

        Raises:
            requests.HTTPError: If Browserless answers with a 4xx/5xx status.
        """
        # A bare string is itself iterable, so looping over it would issue one
        # request per character. Wrap a single URL in a list first.
        urls = [self.urls] if isinstance(self.urls, str) else self.urls

        for url in urls:
            if self.text_content:
                response = requests.post(
                    "https://chrome.browserless.io/scrape",
                    params={
                        "token": self.api_token,
                    },
                    json={
                        "url": url,
                        "elements": [
                            {
                                "selector": "body",
                            }
                        ],
                    },
                )
                # Fail with a clear HTTP error instead of an opaque KeyError
                # when the error payload lacks the expected "data" key.
                response.raise_for_status()
                yield Document(
                    page_content=response.json()["data"][0]["results"][0]["text"],
                    metadata={
                        "source": url,
                    },
                )
            else:
                response = requests.post(
                    "https://chrome.browserless.io/content",
                    params={
                        "token": self.api_token,
                    },
                    json={
                        "url": url,
                    },
                )
                # Without this check an error page would be silently yielded
                # as if it were the content of the requested URL.
                response.raise_for_status()
                yield Document(
                    page_content=response.text,
                    metadata={
                        "source": url,
                    },
                )