import contextlib
import re
from pathlib import Path
from typing import Any, List, Optional, Tuple
from urllib.parse import unquote
from langchain_core.documents import Document
from langchain_community.document_loaders.directory import DirectoryLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.web_base import WebBaseLoader
class BlackboardLoader(WebBaseLoader):
    """Load the PDF attachments of a `Blackboard` course.

    This loader is not compatible with every Blackboard course. It only works
    with courses that use the new Blackboard interface.
    To use this loader you must have the BbRouter cookie. You can get it by
    logging in to the course and copying the value of the BbRouter cookie from
    the browser's developer tools.

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import BlackboardLoader

            loader = BlackboardLoader(
                blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
                bbrouter="expires:12345...",
            )
            documents = loader.load()
    """  # noqa: E501

    # Path segment that separates the Blackboard host from the course pages.
    _WEBAPPS_MARKER = "/webapps/blackboard"
    # Characters of the course name that are replaced by "_" in the folder name.
    _UNSAFE_CHARS = str.maketrans(dict.fromkeys(" /:,?'!\"", "_"))
    # Percent-encoded form of ``filename*=UTF-8''<name>`` inside a URL.
    _FILENAME_PATTERN = re.compile(r"filename%2A%3DUTF-8%27%27(.+)")

    def __init__(
        self,
        blackboard_course_url: str,
        bbrouter: str,
        load_all_recursively: bool = True,
        basic_auth: Optional[Tuple[str, str]] = None,
        cookies: Optional[dict] = None,
        continue_on_failure: bool = False,
    ):
        """Initialize with a Blackboard course url.

        The BbRouter cookie is required for most Blackboard courses.

        Args:
            blackboard_course_url: Blackboard course url. It must contain
                ``/webapps/blackboard``; everything before that segment is
                used as the base url for relative links.
            bbrouter: BbRouter cookie.
            load_all_recursively: If True, load the documents of every page
                linked from the course menu; otherwise only the given page.
            basic_auth: Basic auth credentials, set on the HTTP session.
            cookies: Extra cookies, set on the HTTP session. The mapping is
                copied, so the caller's dict is not modified.
            continue_on_failure: Forwarded unchanged to ``WebBaseLoader``.
                NOTE(review): presumably makes url loading errors produce a
                warning instead of an exception; confirm against the base
                class. Default: False.

        Raises:
            ValueError: If the Blackboard course url is invalid.
            ImportError: If BeautifulSoup4 is not installed.
        """
        super().__init__(
            # Must be a one-element tuple: ``(url)`` without the trailing
            # comma is just the string itself.
            web_paths=(blackboard_course_url,),
            continue_on_failure=continue_on_failure,
        )

        # Get base url. ``partition`` reports whether the marker was found,
        # which ``split(...)[0]`` cannot do (it never raises).
        base_url, marker, _ = blackboard_course_url.partition(self._WEBAPPS_MARKER)
        if not marker or not base_url:
            raise ValueError(
                "Invalid blackboard course url. "
                "Please provide a url that starts with "
                "https://<blackboard_url>/webapps/blackboard"
            )
        self.base_url = base_url

        if basic_auth is not None:
            self.session.auth = basic_auth

        # Combine cookies on a copy so the caller's dict stays untouched.
        session_cookies = dict(cookies) if cookies is not None else {}
        session_cookies["BbRouter"] = bbrouter
        self.session.cookies.update(session_cookies)

        self.load_all_recursively = load_all_recursively
        self.check_bs4()

    def check_bs4(self) -> None:
        """Check that BeautifulSoup4 is installed.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "BeautifulSoup4 is required for BlackboardLoader. "
                "Please install it with `pip install beautifulsoup4`."
            ) from e

    def load(self) -> List[Document]:
        """Load data into Document objects.

        In recursive mode the attachments of every page linked from the course
        menu are downloaded first, and the download folder is read once at the
        end so that no PDF is returned more than once.

        Returns:
            List of Documents.
        """
        if not self.load_all_recursively:
            print(f"Fetching documents from {self.web_path}")  # noqa: T201
            soup_info = self.scrape()
            self.folder_path = self._get_folder_path(soup_info)
            return self._get_documents(soup_info)

        soup_info = self.scrape()
        self.folder_path = self._get_folder_path(soup_info)
        relative_paths = self._get_paths(soup_info)

        any_page_downloaded = False
        for path in relative_paths:
            url = self.base_url + path
            print(f"Fetching documents from {url}")  # noqa: T201
            page_soup = self._scrape(url)
            # Best effort: a page without a content list, or with an
            # attachment whose filename cannot be parsed, is skipped.
            with contextlib.suppress(ValueError):
                self._download_attachments(self._get_attachments(page_soup))
                any_page_downloaded = True

        # The folder is only guaranteed to exist after a successful download.
        if not any_page_downloaded:
            return []
        return self._load_documents()

    def _get_folder_path(self, soup: Any) -> str:
        """Get the folder path to save the Documents in.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            Folder path, relative to the current directory.

        Raises:
            ValueError: If the page has no ``span`` with id ``crumb_1``.
        """
        # Get the course name
        crumb = soup.find("span", {"id": "crumb_1"})
        if crumb is None:
            raise ValueError("No course name found.")
        course_name = crumb.text.strip()

        # Prepare the folder path: one pass replaces every unsafe character.
        course_name_clean = unquote(course_name).translate(self._UNSAFE_CHARS)
        return str(Path(".") / course_name_clean)

    def _get_documents(self, soup: Any) -> List[Document]:
        """Fetch content from the page and return Documents.

        Downloads the page's attachments, then loads every PDF found in
        ``self.folder_path``.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of documents.
        """
        attachments = self._get_attachments(soup)
        self._download_attachments(attachments)
        return self._load_documents()

    def _get_attachments(self, soup: Any) -> List[str]:
        """Get all attachments from a page.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of attachment hrefs.

        Raises:
            ValueError: If the page has no ``ul`` with class ``contentList``.
        """
        from bs4 import Tag

        # Get content list
        content_list = soup.find("ul", {"class": "contentList"})
        if content_list is None:
            raise ValueError("No content list found.")

        # Get all attachments
        attachments: List[str] = []
        attachment_list: Tag
        for attachment_list in content_list.find_all("ul", {"class": "attachments"}):
            link: Tag
            for link in attachment_list.find_all("a"):
                href = link.get("href")
                # Only add if href is not None and does not start with #
                if href is not None and not href.startswith("#"):
                    attachments.append(href)
        return attachments

    def _download_attachments(self, attachments: List[str]) -> None:
        """Download all attachments.

        Args:
            attachments: List of attachment hrefs.
        """
        # Make sure the folder exists
        Path(self.folder_path).mkdir(parents=True, exist_ok=True)
        # Download all attachments
        for attachment in attachments:
            self.download(attachment)

    def _load_documents(self) -> List[Document]:
        """Load all PDF documents in the download folder.

        Returns:
            List of documents.
        """
        loader = DirectoryLoader(
            path=self.folder_path,
            glob="*.pdf",
            loader_cls=PyPDFLoader,  # type: ignore
        )
        return loader.load()

    def _get_paths(self, soup: Any) -> List[str]:
        """Get all relative paths in the navbar.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            The hrefs of the course menu links that start with ``/``.

        Raises:
            ValueError: If the page has no ``ul`` with class ``courseMenu``.
        """
        course_menu = soup.find("ul", {"class": "courseMenu"})
        if course_menu is None:
            raise ValueError("No course menu found.")
        hrefs = (link.get("href") for link in course_menu.find_all("a"))
        return [href for href in hrefs if href is not None and href.startswith("/")]

    def download(self, path: str) -> None:
        """Download a file from an url into ``self.folder_path``.

        Args:
            path: Path of the file, relative to the base url.

        Raises:
            ValueError: If no PDF filename can be parsed from the final url.
        """
        # Get the file content
        response = self.session.get(self.base_url + path, allow_redirects=True)
        # Get the filename. The name comes from a server-controlled url, so
        # keep only its last component to stay inside the download folder.
        filename = Path(self.parse_filename(response.url)).name
        # Write the file to disk
        with open(Path(self.folder_path) / filename, "wb") as f:
            f.write(response.content)

    def parse_filename(self, url: str) -> str:
        """Parse the filename from an url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The last url component if it ends in ``.pdf``, otherwise the name
            extracted from the url's encoded ``filename*`` parameter.

        Raises:
            ValueError: If the filename could not be parsed.
        """
        url_path = Path(url)
        if url_path.suffix == ".pdf":
            return url_path.name
        return self._parse_filename_from_url(url)

    def _parse_filename_from_url(self, url: str) -> str:
        """Parse the filename from the encoded ``filename*`` part of an url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The filename.

        Raises:
            ValueError: If the filename could not be parsed, or if it does not
                contain ``.pdf``.
        """
        filename_match = self._FILENAME_PATTERN.search(url)
        if filename_match is None:
            raise ValueError(f"Could not parse filename from {url}")
        filename = filename_match.group(1)
        if ".pdf" not in filename:
            raise ValueError(f"Incorrect file type: {filename}")
        # Drop everything after the first ".pdf" (e.g. trailing url parameters).
        filename = filename.split(".pdf")[0] + ".pdf"
        filename = unquote(filename)
        # Replace any "%20" that is still present after unquoting.
        filename = filename.replace("%20", " ")
        return filename
if __name__ == "__main__":
    # Manual smoke run. The placeholders below must be replaced with a real
    # Blackboard host, course id, content id and BbRouter cookie first.
    course_url = (
        "https://<YOUR BLACKBOARD URL"
        " HERE>/webapps/blackboard/content/listContent.jsp?course_id=_<YOUR COURSE ID"
        " HERE>_1&content_id=_<YOUR CONTENT ID HERE>_1&mode=reset"
    )
    bbrouter_cookie = "<YOUR BBROUTER COOKIE HERE>"

    loader = BlackboardLoader(
        course_url,
        bbrouter_cookie,
        load_all_recursively=True,
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} pages of PDFs from {loader.web_path}")  # noqa: T201