Source code for langchain_community.document_loaders.ifixit

from typing import List, Optional

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.web_base import WebBaseLoader

IFIXIT_BASE_URL = "https://www.ifixit.com/api/2.0"


[docs]class IFixitLoader(BaseLoader): """加载`iFixit`修复指南、设备维基和答案。 iFixit是网络上最大的开放式维修社区。该网站包含近100,000份维修手册、42,000台设备的200,000个问题和答案,所有数据都在CC-BY许可下发布。 该加载器将允许您使用它们的开放API和网络抓取从iFixit上下载修复指南、问答和设备维基的文本。"""
[docs] def __init__(self, web_path: str): """使用Web路径进行初始化。""" if not web_path.startswith("https://www.ifixit.com"): raise ValueError("web path must start with 'https://www.ifixit.com'") path = web_path.replace("https://www.ifixit.com", "") allowed_paths = ["/Device", "/Guide", "/Answers", "/Teardown"] """ TODO: Add /Wiki """ if not any(path.startswith(allowed_path) for allowed_path in allowed_paths): raise ValueError( "web path must start with /Device, /Guide, /Teardown or /Answers" ) pieces = [x for x in path.split("/") if x] """Teardowns are just guides by a different name""" self.page_type = pieces[0] if pieces[0] != "Teardown" else "Guide" if self.page_type == "Guide" or self.page_type == "Answers": self.id = pieces[2] else: self.id = pieces[1] self.web_path = web_path
[docs] def load(self) -> List[Document]: if self.page_type == "Device": return self.load_device() elif self.page_type == "Guide" or self.page_type == "Teardown": return self.load_guide() elif self.page_type == "Answers": return self.load_questions_and_answers() else: raise ValueError("Unknown page type: " + self.page_type)
[docs] @staticmethod def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]: """加载建议。 参数: query:查询字符串 doc_type:要搜索的文档类型。可以是以下之一:"all"、"device"、"guide"、"teardown"、"answer"、"wiki"。 返回值: """ res = requests.get( IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type ) if res.status_code != 200: raise ValueError( 'Could not load suggestions for "' + query + '"\n' + res.json() ) data = res.json() results = data["results"] output = [] for result in results: try: loader = IFixitLoader(result["url"]) if loader.page_type == "Device": output += loader.load_device(include_guides=False) else: output += loader.load() except ValueError: continue return output
[docs] def load_questions_and_answers( self, url_override: Optional[str] = None ) -> List[Document]: """加载问题和答案列表。 参数: url_override:要覆盖默认URL的URL。 返回:List[Document] """ loader = WebBaseLoader(self.web_path if url_override is None else url_override) soup = loader.scrape() output = [] title = soup.find("h1", "post-title").text output.append("# " + title) output.append(soup.select_one(".post-content .post-text").text.strip()) answersHeader = soup.find("div", "post-answers-header") if answersHeader: output.append("\n## " + answersHeader.text.strip()) for answer in soup.select(".js-answers-list .post.post-answer"): if answer.has_attr("itemprop") and "acceptedAnswer" in answer["itemprop"]: output.append("\n### Accepted Answer") elif "post-helpful" in answer["class"]: output.append("\n### Most Helpful Answer") else: output.append("\n### Other Answer") output += [ a.text.strip() for a in answer.select(".post-content .post-text") ] output.append("\n") text = "\n".join(output).strip() metadata = {"source": self.web_path, "title": title} return [Document(page_content=text, metadata=metadata)]
[docs] def load_device( self, url_override: Optional[str] = None, include_guides: bool = True ) -> List[Document]: """加载设备 参数: url_override: 用于覆盖默认URL的URL。 include_guides: 是否包括与设备链接的指南。 默认为True。 返回值: """ documents = [] if url_override is None: url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id else: url = url_override res = requests.get(url) data = res.json() text = "\n".join( [ data[key] for key in ["title", "description", "contents_raw"] if key in data ] ).strip() metadata = {"source": self.web_path, "title": data["title"]} documents.append(Document(page_content=text, metadata=metadata)) if include_guides: """Load and return documents for each guide linked to from the device""" guide_urls = [guide["url"] for guide in data["guides"]] for guide_url in guide_urls: documents.append(IFixitLoader(guide_url).load()[0]) return documents
[docs] def load_guide(self, url_override: Optional[str] = None) -> List[Document]: """加载指南 参数: url_override: 用于覆盖默认URL的URL。 返回: 文档列表 """ if url_override is None: url = IFIXIT_BASE_URL + "/guides/" + self.id else: url = url_override res = requests.get(url) if res.status_code != 200: raise ValueError( "Could not load guide: " + self.web_path + "\n" + res.json() ) data = res.json() doc_parts = ["# " + data["title"], data["introduction_raw"]] doc_parts.append("\n\n###Tools Required:") if len(data["tools"]) == 0: doc_parts.append("\n - None") else: for tool in data["tools"]: doc_parts.append("\n - " + tool["text"]) doc_parts.append("\n\n###Parts Required:") if len(data["parts"]) == 0: doc_parts.append("\n - None") else: for part in data["parts"]: doc_parts.append("\n - " + part["text"]) for row in data["steps"]: doc_parts.append( "\n\n## " + ( row["title"] if row["title"] != "" else "Step {}".format(row["orderby"]) ) ) for line in row["lines"]: doc_parts.append(line["text_raw"]) doc_parts.append(data["conclusion_raw"]) text = "\n".join(doc_parts) metadata = {"source": self.web_path, "title": data["title"]} return [Document(page_content=text, metadata=metadata)]