Source code for langchain_community.document_loaders.blackboard

import contextlib
import re
from pathlib import Path
from typing import Any, List, Optional, Tuple
from urllib.parse import unquote

from langchain_core.documents import Document

from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.web_base import WebBaseLoader


[docs]class BlackboardLoader(WebBaseLoader):
    """Load a `Blackboard` course.

    This loader is not compatible with all Blackboard courses. It is only
    compatible with courses that use the new Blackboard interface.
    To use this loader, you must have the BbRouter cookie. You can get this
    cookie by logging into the course and copying the value of the BbRouter
    cookie from your browser's developer tools.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import BlackboardLoader

            loader = BlackboardLoader(
                blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
                bbrouter="expires:12345...",
            )
            documents = loader.load()
    """  # noqa: E501
[docs]    def __init__(
        self,
        blackboard_course_url: str,
        bbrouter: str,
        load_all_recursively: bool = True,
        basic_auth: Optional[Tuple[str, str]] = None,
        cookies: Optional[dict] = None,
        continue_on_failure: bool = False,
    ):
        """Initialize with blackboard course url.

        The BbRouter cookie is required for most blackboard courses.

        Args:
            blackboard_course_url: Blackboard course url.
            bbrouter: BbRouter cookie.
            load_all_recursively: If True, load all documents recursively.
            basic_auth: Basic auth credentials.
            cookies: Cookies.
            continue_on_failure: Whether to continue loading other pages if an
                error occurs while loading a url, emitting a warning instead of
                raising an exception. Setting this to True makes the loader
                more robust, but may also result in missing data.
                Default: False.

        Raises:
            ValueError: If blackboard course url is invalid.
        """
        super().__init__(
            # Pass a one-element tuple so the url is not iterated
            # character by character.
            web_paths=(blackboard_course_url,),
            continue_on_failure=continue_on_failure,
        )
        # Validate the url before deriving the base url from it
        if "/webapps/blackboard" not in blackboard_course_url:
            raise ValueError(
                "Invalid blackboard course url. "
                "Please provide a url that starts with "
                "https://<blackboard_url>/webapps/blackboard"
            )
        self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
        if basic_auth is not None:
            self.session.auth = basic_auth
        # Combine the required BbRouter cookie with any user-provided cookies
        if cookies is None:
            cookies = {}
        cookies.update({"BbRouter": bbrouter})
        self.session.cookies.update(cookies)
        self.load_all_recursively = load_all_recursively
        self.check_bs4()
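    # A minimal construction sketch for the optional parameters above. All
    # values here (host, cookie, credentials) are placeholders, not working
    # ones:
    #
    #     loader = BlackboardLoader(
    #         blackboard_course_url=(
    #             "https://blackboard.example.com/webapps/blackboard"
    #             "/content/listContent.jsp?course_id=_123456_1"
    #         ),
    #         bbrouter="expires:12345...",
    #         basic_auth=("username", "password"),
    #         cookies={"JSESSIONID": "abc123"},
    #         continue_on_failure=True,
    #     )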
[docs]    def check_bs4(self) -> None:
        """Check if BeautifulSoup4 is installed.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa: F401
        except ImportError:
            raise ImportError(
                "BeautifulSoup4 is required for BlackboardLoader. "
                "Please install it with `pip install beautifulsoup4`."
            )
[docs]    def load(self) -> List[Document]:
        """Load data into Document objects.

        Returns:
            List of Documents.
        """
        if self.load_all_recursively:
            soup_info = self.scrape()
            self.folder_path = self._get_folder_path(soup_info)
            relative_paths = self._get_paths(soup_info)
            documents = []
            for path in relative_paths:
                url = self.base_url + path
                print(f"Fetching documents from {url}")  # noqa: T201
                soup_info = self._scrape(url)
                with contextlib.suppress(ValueError):
                    documents.extend(self._get_documents(soup_info))
            return documents
        else:
            print(f"Fetching documents from {self.web_path}")  # noqa: T201
            soup_info = self.scrape()
            self.folder_path = self._get_folder_path(soup_info)
            return self._get_documents(soup_info)
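    # A rough sketch of consuming the result: each Document is one PDF page,
    # and PyPDFLoader records the originating file path under the "source"
    # metadata key (variable names here are illustrative):
    #
    #     documents = loader.load()
    #     for doc in documents:
    #         print(doc.metadata["source"], len(doc.page_content))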
    def _get_folder_path(self, soup: Any) -> str:
        """Get the folder path to save the Documents in.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            Folder path.
        """
        # Get the course name
        course_name = soup.find("span", {"id": "crumb_1"})
        if course_name is None:
            raise ValueError("No course name found.")
        course_name = course_name.text.strip()
        # Prepare the folder path
        course_name_clean = (
            unquote(course_name)
            .replace(" ", "_")
            .replace("/", "_")
            .replace(":", "_")
            .replace(",", "_")
            .replace("?", "_")
            .replace("'", "_")
            .replace("!", "_")
            .replace('"', "_")
        )
        # Get the folder path
        folder_path = Path(".") / course_name_clean
        return str(folder_path)

    def _get_documents(self, soup: Any) -> List[Document]:
        """Fetch content from the page and return Documents.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of documents.
        """
        attachments = self._get_attachments(soup)
        self._download_attachments(attachments)
        documents = self._load_documents()
        return documents

    def _get_attachments(self, soup: Any) -> List[str]:
        """Get all attachments from a page.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of attachments.
        """
        from bs4 import BeautifulSoup, Tag

        # Get content list
        content_list: BeautifulSoup
        content_list = soup.find("ul", {"class": "contentList"})
        if content_list is None:
            raise ValueError("No content list found.")
        # Get all attachments
        attachments = []
        attachment: Tag
        for attachment in content_list.find_all("ul", {"class": "attachments"}):
            link: Tag
            for link in attachment.find_all("a"):
                href = link.get("href")
                # Only add if href is not None and does not start with #
                if href is not None and not href.startswith("#"):
                    attachments.append(href)
        return attachments

    def _download_attachments(self, attachments: List[str]) -> None:
        """Download all attachments.

        Args:
            attachments: List of attachments.
        """
        # Make sure the folder exists
        Path(self.folder_path).mkdir(parents=True, exist_ok=True)
        # Download all attachments
        for attachment in attachments:
            self.download(attachment)

    def _load_documents(self) -> List[Document]:
        """Load all documents in the folder.

        Returns:
            List of documents.
        """
        # Create the document loader
        loader = DirectoryLoader(
            path=self.folder_path,
            glob="*.pdf",
            loader_cls=PyPDFLoader,  # type: ignore
        )
        # Load the documents
        documents = loader.load()
        # Return all documents
        return documents

    def _get_paths(self, soup: Any) -> List[str]:
        """Get all relative paths in the navbar."""
        relative_paths = []
        course_menu = soup.find("ul", {"class": "courseMenu"})
        if course_menu is None:
            raise ValueError("No course menu found.")
        for link in course_menu.find_all("a"):
            href = link.get("href")
            if href is not None and href.startswith("/"):
                relative_paths.append(href)
        return relative_paths
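    # For orientation, a simplified sketch of the markup the parsers above
    # expect; real Blackboard pages carry many more attributes and wrappers:
    #
    #     <ul class="courseMenu">
    #         <li><a href="/webapps/blackboard/content/listContent.jsp?...">Content</a></li>
    #     </ul>
    #     <ul class="contentList">
    #         <li>
    #             <ul class="attachments">
    #                 <li><a href="/bbcswebdav/pid-1-dt-content-rid-1/syllabus.pdf">Syllabus</a></li>
    #             </ul>
    #         </li>
    #     </ul>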
[docs]    def download(self, path: str) -> None:
        """Download a file from an url.

        Args:
            path: Path to the file.
        """
        # Get the file content
        response = self.session.get(self.base_url + path, allow_redirects=True)
        # Get the filename
        filename = self.parse_filename(response.url)
        # Write the file to disk
        with open(Path(self.folder_path) / filename, "wb") as f:
            f.write(response.content)
[docs]    def parse_filename(self, url: str) -> str:
        """Parse the filename from an url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The filename.
        """
        if (url_path := Path(url)) and url_path.suffix == ".pdf":
            return url_path.name
        else:
            return self._parse_filename_from_url(url)
    def _parse_filename_from_url(self, url: str) -> str:
        """Parse the filename from an url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The filename.

        Raises:
            ValueError: If the filename could not be parsed.
        """
        filename_matches = re.search(r"filename%2A%3DUTF-8%27%27(.+)", url)
        if filename_matches:
            filename = filename_matches.group(1)
        else:
            raise ValueError(f"Could not parse filename from {url}")
        if ".pdf" not in filename:
            raise ValueError(f"Incorrect file type: {filename}")
        filename = filename.split(".pdf")[0] + ".pdf"
        filename = unquote(filename)
        filename = filename.replace("%20", " ")
        return filename
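    # Worked example for the fallback parser above: Blackboard download
    # redirects embed a percent-encoded RFC 5987 filename parameter in the
    # url. For a made-up url ending in
    #
    #     ...?filename%2A%3DUTF-8%27%27Lecture%201.pdf
    #
    # the regex captures "Lecture%201.pdf", which decodes to "Lecture 1.pdf".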
if __name__ == "__main__":
    loader = BlackboardLoader(
        "https://<YOUR BLACKBOARD URL"
        " HERE>/webapps/blackboard/content/listContent.jsp?course_id=_<YOUR COURSE ID"
        " HERE>_1&content_id=_<YOUR CONTENT ID HERE>_1&mode=reset",
        "<YOUR BBROUTER COOKIE HERE>",
        load_all_recursively=True,
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} pages of PDFs from {loader.web_path}")  # noqa: T201