Source code for langchain_community.document_loaders.diffbot

import logging
from typing import Any, List

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

# Module-level logger; DiffbotLoader.load reports per-URL failures through it.
logger = logging.getLogger(__name__)


class DiffbotLoader(BaseLoader):
    """Load web pages as documents through the `Diffbot` article API.

    Each configured URL is sent to the Diffbot REST API, and the text of the
    first returned object becomes the ``page_content`` of one ``Document``.
    """

    def __init__(
        self, api_token: str, urls: List[str], continue_on_failure: bool = True
    ):
        """Initialize with an API token and the URLs to load.

        Args:
            api_token: Diffbot API token.
            urls: List of URLs to load.
            continue_on_failure: Whether to keep loading the remaining URLs
                when one URL fails to load. Defaults to True.
        """
        self.api_token = api_token
        self.urls = urls
        self.continue_on_failure = continue_on_failure

    def _diffbot_api_url(self, diffbot_api: str) -> str:
        """Return the v3 endpoint URL for the Diffbot API named ``diffbot_api``."""
        return f"https://api.diffbot.com/v3/{diffbot_api}"

    def _get_diffbot_data(self, url: str) -> Any:
        """Fetch the Diffbot payload for ``url`` from the Diffbot REST API.

        Args:
            url: The page to hand to the Diffbot ``article`` endpoint.

        Returns:
            The decoded JSON body when the HTTP response is OK; otherwise an
            empty dict (a warning is logged with the HTTP status code).
        """
        # TODO: Add support for other Diffbot APIs
        diffbot_url = self._diffbot_api_url("article")
        params = {
            "token": self.api_token,
            "url": url,
        }
        response = requests.get(diffbot_url, params=params, timeout=10)
        if not response.ok:
            # Make the failure visible instead of silently returning {}.
            # Only the status code and the target URL are logged: the request
            # itself carries the API token as a query parameter.
            logger.warning(
                "Diffbot returned HTTP %s for %s; treating the response as empty",
                response.status_code,
                url,
            )
            return {}
        return response.json()

    def load(self) -> List[Document]:
        """Extract the Diffbot text for every configured URL.

        A payload without an ``objects`` key yields a document with empty
        text. Any exception raised while fetching or processing a URL
        (including an empty ``objects`` list or a first object without a
        ``text`` key) is logged and that URL is skipped when
        ``continue_on_failure`` is true; otherwise it is re-raised.

        Returns:
            One ``Document`` per successfully processed URL, with the URL
            stored under the ``source`` metadata key.
        """
        docs: List[Document] = []

        for url in self.urls:
            try:
                data = self._get_diffbot_data(url)
                text = data["objects"][0]["text"] if "objects" in data else ""
                metadata = {"source": url}
                docs.append(Document(page_content=text, metadata=metadata))
            except Exception as e:
                if not self.continue_on_failure:
                    # Bare raise re-raises the active exception unchanged.
                    raise
                logger.error(f"Error fetching or processing {url}, exception: {e}")

        return docs