Bases: BaseReader
MangoppsGuides reader. Reads data from a MangoppsGuides workspace.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `domain_url` | `str` | MangoppsGuides domain URL | *required* |
| `limit` | `int` | Depth of the crawl | *required* |
Source code in llama_index/readers/mangoapps_guides/base.py
```python
class MangoppsGuidesReader(BaseReader):
    """MangoppsGuides reader. Reads data from a MangoppsGuides workspace.

    Args:
        domain_url (str): MangoppsGuides domain URL.
        limit (int): Depth of the crawl.
    """

    def __init__(self) -> None:
        """Initialize MangoppsGuides reader."""

    def load_data(self, domain_url: str, limit: int) -> List[Document]:
        """Load data from the workspace.

        Returns:
            List[Document]: List of documents.
        """
        import requests
        from bs4 import BeautifulSoup

        self.domain_url = domain_url
        self.limit = limit
        self.start_url = f"{self.domain_url}/home/"

        fetched_urls = self.crawl_urls()[: self.limit]

        results = []
        guides_pages = {}
        for url in fetched_urls:
            try:
                response = requests.get(url)
                soup = BeautifulSoup(response.content, "html.parser")

                page_title = soup.find("title").text

                # Remove the div with aria-label="Table of contents"
                table_of_contents_div = soup.find(
                    "div", {"aria-label": "Table of contents"}
                )
                if table_of_contents_div:
                    table_of_contents_div.decompose()

                # Remove header and footer
                header = soup.find("header")
                if header:
                    header.decompose()
                footer = soup.find("footer")
                if footer:
                    footer.decompose()

                # Exclude links and their text content from the main content
                for link in soup.find_all("a"):
                    link.decompose()

                # Remove empty elements from the main content
                for element in soup.find_all():
                    if element.get_text(strip=True) == "":
                        element.decompose()

                # Find the main element containing the desired content
                main_element = soup.find(
                    "main"
                )  # Replace "main" with the appropriate element tag or CSS class

                # Extract the text content from the main element
                if main_element:
                    text_content = main_element.get_text("\n")
                    # Remove multiple consecutive newlines and keep only one newline
                    text_content = re.sub(r"\n+", "\n", text_content)
                else:
                    text_content = ""

                page_text = text_content

                guides_page = {}
                guides_page["title"] = page_title
                guides_page["text"] = page_text
                guides_pages[url] = guides_page
            except Exception as e:
                print(f"Failed for {url} => {e}")

        for k, v in guides_pages.items():
            extra_info = {"url": k, "title": v["title"]}
            results.append(
                Document(
                    text=v["text"],
                    extra_info=extra_info,
                )
            )

        return results

    def crawl_urls(self) -> List[str]:
        """Crawl all URLs from the given domain."""
        self.visited = []
        fetched_urls = self.fetch_url(self.start_url)
        return list(set(fetched_urls))

    def fetch_url(self, url):
        """Fetch URLs from the given domain."""
        import requests
        from bs4 import BeautifulSoup

        response = requests.get(url)
        soup = BeautifulSoup(response.content, "html.parser")

        self.visited.append(url)

        newurls = []
        for link in soup.find_all("a"):
            href: str = link.get("href")
            if href and urlparse(href).netloc == self.domain_url:
                newurls.append(href)
            elif href and href.startswith("/"):
                newurls.append(f"{self.domain_url}{href}")

        for newurl in newurls:
            if (
                newurl not in self.visited
                and not newurl.startswith("#")
                and f"https://{urlparse(newurl).netloc}" == self.domain_url
                and len(self.visited) <= self.limit
            ):
                newurls = newurls + self.fetch_url(newurl)

        return list(set(newurls))
```
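The excerpt above starts at line 11 of `base.py`, so the module-level imports (`re`, `urlparse` from `urllib.parse`, `List`, `Document`, and `BaseReader`) are not shown. Below is a minimal usage sketch; the import path follows the usual layout for llama-index reader packages and the domain URL is a placeholder, so adjust both to your installation:

```python
# Minimal usage sketch; the domain URL below is a placeholder.
from llama_index.readers.mangoapps_guides import MangoppsGuidesReader

reader = MangoppsGuidesReader()
documents = reader.load_data(
    domain_url="https://example.mangoapps.com",  # placeholder workspace domain
    limit=10,  # crawl depth / page cap
)
for doc in documents:
    print(doc.extra_info["url"], "-", doc.extra_info["title"])
```

Note that `domain_url` must include the scheme (e.g. `https://`), since `fetch_url` compares `f"https://{netloc}"` against it when deciding which links to follow.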
load_data
load_data(domain_url: str, limit: int) -> List[Document]
Load data from the workspace.

Returns: `List[Document]`: List of documents.
Source code in llama_index/readers/mangoapps_guides/base.py
```python
def load_data(self, domain_url: str, limit: int) -> List[Document]:
    """Load data from the workspace.

    Returns:
        List[Document]: List of documents.
    """
    import requests
    from bs4 import BeautifulSoup

    self.domain_url = domain_url
    self.limit = limit
    self.start_url = f"{self.domain_url}/home/"

    fetched_urls = self.crawl_urls()[: self.limit]

    results = []
    guides_pages = {}
    for url in fetched_urls:
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.content, "html.parser")

            page_title = soup.find("title").text

            # Remove the div with aria-label="Table of contents"
            table_of_contents_div = soup.find(
                "div", {"aria-label": "Table of contents"}
            )
            if table_of_contents_div:
                table_of_contents_div.decompose()

            # Remove header and footer
            header = soup.find("header")
            if header:
                header.decompose()
            footer = soup.find("footer")
            if footer:
                footer.decompose()

            # Exclude links and their text content from the main content
            for link in soup.find_all("a"):
                link.decompose()

            # Remove empty elements from the main content
            for element in soup.find_all():
                if element.get_text(strip=True) == "":
                    element.decompose()

            # Find the main element containing the desired content
            main_element = soup.find(
                "main"
            )  # Replace "main" with the appropriate element tag or CSS class

            # Extract the text content from the main element
            if main_element:
                text_content = main_element.get_text("\n")
                # Remove multiple consecutive newlines and keep only one newline
                text_content = re.sub(r"\n+", "\n", text_content)
            else:
                text_content = ""

            page_text = text_content

            guides_page = {}
            guides_page["title"] = page_title
            guides_page["text"] = page_text
            guides_pages[url] = guides_page
        except Exception as e:
            print(f"Failed for {url} => {e}")

    for k, v in guides_pages.items():
        extra_info = {"url": k, "title": v["title"]}
        results.append(
            Document(
                text=v["text"],
                extra_info=extra_info,
            )
        )

    return results
```
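Fetch failures are only reported via `print`, and pages without a `<main>` element yield empty text, so callers may want to filter the returned documents. A small sketch; the empty-text filter is one possible policy, not part of the reader:

```python
# Filter out documents whose page had no usable <main> element (empty text).
docs = MangoppsGuidesReader().load_data(
    domain_url="https://example.mangoapps.com",  # placeholder
    limit=5,
)
non_empty = [d for d in docs if d.text.strip()]
```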
crawl_urls
crawl_urls() -> List[str]
Crawl all URLs from the given domain.
Source code in llama_index/readers/mangoapps_guides/base.py
```python
def crawl_urls(self) -> List[str]:
    """Crawl all URLs from the given domain."""
    self.visited = []
    fetched_urls = self.fetch_url(self.start_url)
    return list(set(fetched_urls))
```
fetch_url
fetch_url(url)
Fetch URLs from the given domain.
Source code in llama_index/readers/mangoapps_guides/base.py
```python
def fetch_url(self, url):
    """Fetch URLs from the given domain."""
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    self.visited.append(url)

    newurls = []
    for link in soup.find_all("a"):
        href: str = link.get("href")
        if href and urlparse(href).netloc == self.domain_url:
            newurls.append(href)
        elif href and href.startswith("/"):
            # Resolve relative links against the domain root
            newurls.append(f"{self.domain_url}{href}")

    for newurl in newurls:
        if (
            newurl not in self.visited
            and not newurl.startswith("#")
            and f"https://{urlparse(newurl).netloc}" == self.domain_url
            and len(self.visited) <= self.limit
        ):
            newurls = newurls + self.fetch_url(newurl)

    return list(set(newurls))
```
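To see the link-filtering rules in isolation: relative hrefs are resolved against `domain_url`, fragment-only links are skipped, and a URL is followed only when its scheme-qualified netloc equals `domain_url` exactly. A standalone sketch with hypothetical URLs:

```python
from urllib.parse import urlparse

domain_url = "https://example.mangoapps.com"  # hypothetical

def normalize(href: str) -> str:
    # Relative links are resolved against the domain root, as in fetch_url.
    return f"{domain_url}{href}" if href.startswith("/") else href

def followable(url: str) -> bool:
    # Mirrors fetch_url's check: scheme-qualified netloc must equal domain_url.
    return not url.startswith("#") and f"https://{urlparse(url).netloc}" == domain_url

hrefs = [
    "/home/guide-1",
    "https://example.mangoapps.com/home/guide-2",
    "https://other.example.com/page",
    "#section",
]
print([u for u in map(normalize, hrefs) if followable(u)])
# ['https://example.mangoapps.com/home/guide-1',
#  'https://example.mangoapps.com/home/guide-2']
```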