Web

Init file.

AsyncWebPageReader #

Bases: BaseReader

Asynchronous web page reader.

Reads web pages asynchronously.

Parameters:

html_to_text (bool, default False): Whether to convert HTML to text. Requires the html2text package.
limit (int, default 10): Maximum number of concurrent requests.
dedupe (bool, default True): Deduplicate URLs when an exact match appears more than once in the given list.
fail_on_error (bool, default False): Raise a ValueError if a requested URL does not return status code 200.
Source code in llama_index/readers/web/async_web/base.py
class AsyncWebPageReader(BaseReader):
    """异步网页阅读器。

    异步读取网页。

    Args:
        html_to_text (bool): 是否将HTML转换为文本。
            需要`html2text`包。
        limit (int): 最大并发请求数。
        dedupe (bool): 如果给定列表中存在精确匹配的URL,则进行URL去重
        fail_on_error (bool): 如果请求的URL未返回状态码200,则程序将引发ValueError异常"""

    def __init__(
        self,
        html_to_text: bool = False,
        limit: int = 10,
        dedupe: bool = True,
        fail_on_error: bool = False,
    ) -> None:
        """使用参数进行初始化。"""
        try:
            import html2text  # noqa: F401
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        try:
            import aiohttp  # noqa: F401
        except ImportError:
            raise ImportError(
                "`aiohttp` package not found, please run `pip install aiohttp`"
            )
        self._limit = limit
        self._html_to_text = html_to_text
        self._dedupe = dedupe
        self._fail_on_error = fail_on_error

    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入的URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if self._dedupe:
            urls = list(dict.fromkeys(urls))

        import aiohttp

        def chunked_http_client(limit: int):
            semaphore = asyncio.Semaphore(limit)

            async def http_get(url: str, session: aiohttp.ClientSession):
                async with semaphore:
                    async with session.get(url) as response:
                        return response, await response.text()

            return http_get

        async def fetch_urls(urls: List[str]):
            http_client = chunked_http_client(self._limit)
            async with aiohttp.ClientSession() as session:
                tasks = [http_client(url, session) for url in urls]
                return await asyncio.gather(*tasks, return_exceptions=True)

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []
        responses = asyncio.run(fetch_urls(urls))

        for i, response_tuple in enumerate(responses):
            if not isinstance(response_tuple, tuple):
                raise ValueError(f"One of the inputs is not a valid url: {urls[i]}")

            response, raw_page = response_tuple

            if response.status != 200:
                logger.warning(f"error fetching page from {urls[i]}")
                logger.info(response)

                if self._fail_on_error:
                    raise ValueError(
                        f"error fetching page from {urls[i]}. server returned status:"
                        f" {response.status} and response {raw_page}"
                    )

                continue

            if self._html_to_text:
                import html2text

                response_text = html2text.html2text(raw_page)
            else:
                response_text = raw_page

            documents.append(
                Document(text=response_text, extra_info={"Source": str(response.url)})
            )

        return documents
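
Example usage (a minimal sketch, not part of the generated reference; it assumes the reader is importable from the llama-index-readers-web package and uses placeholder URLs):

from llama_index.readers.web import AsyncWebPageReader

# Convert HTML to text and allow at most 5 concurrent requests.
reader = AsyncWebPageReader(html_to_text=True, limit=5, dedupe=True)

# The duplicate URL is dropped before fetching because dedupe=True.
documents = reader.load_data(urls=["https://example.com", "https://example.com"])
print(len(documents))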

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from the input urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/async_web/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入的URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if self._dedupe:
            urls = list(dict.fromkeys(urls))

        import aiohttp

        def chunked_http_client(limit: int):
            semaphore = asyncio.Semaphore(limit)

            async def http_get(url: str, session: aiohttp.ClientSession):
                async with semaphore:
                    async with session.get(url) as response:
                        return response, await response.text()

            return http_get

        async def fetch_urls(urls: List[str]):
            http_client = chunked_http_client(self._limit)
            async with aiohttp.ClientSession() as session:
                tasks = [http_client(url, session) for url in urls]
                return await asyncio.gather(*tasks, return_exceptions=True)

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []
        responses = asyncio.run(fetch_urls(urls))

        for i, response_tuple in enumerate(responses):
            if not isinstance(response_tuple, tuple):
                raise ValueError(f"One of the inputs is not a valid url: {urls[i]}")

            response, raw_page = response_tuple

            if response.status != 200:
                logger.warning(f"error fetching page from {urls[i]}")
                logger.info(response)

                if self._fail_on_error:
                    raise ValueError(
                        f"error fetching page from {urls[i]}. server returned status:"
                        f" {response.status} and response {raw_page}"
                    )

                continue

            if self._html_to_text:
                import html2text

                response_text = html2text.html2text(raw_page)
            else:
                response_text = raw_page

            documents.append(
                Document(text=response_text, extra_info={"Source": str(response.url)})
            )

        return documents

BeautifulSoupWebReader #

Bases: BasePydanticReader

BeautifulSoup web page reader.

Reads pages from the web. Requires the bs4 and urllib packages.

Parameters:

website_extractor (Optional[Dict[str, Callable]], default None): A mapping of website hostname (e.g. google.com) to a function that specifies how to extract text from the BeautifulSoup object. See DEFAULT_WEBSITE_EXTRACTOR.
Source code in llama_index/readers/web/beautiful_soup_web/base.py
class BeautifulSoupWebReader(BasePydanticReader):
    """美丽汤网页阅读器。

    从网页读取页面。
    需要`bs4`和`urllib`包。

    Args:
        website_extractor (Optional[Dict[str, Callable]]): 网站主机名(例如google.com)到指定从BeautifulSoup对象中提取文本的函数的映射。参见DEFAULT_WEBSITE_EXTRACTOR。"""

    is_remote: bool = True
    _website_extractor: Dict[str, Callable] = PrivateAttr()

    def __init__(self, website_extractor: Optional[Dict[str, Callable]] = None) -> None:
        self._website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR
        super().__init__()

    @classmethod
    def class_name(cls) -> str:
        """获取类的名称标识符。"""
        return "BeautifulSoupWebReader"

    def load_data(
        self,
        urls: List[str],
        custom_hostname: Optional[str] = None,
        include_url_in_text: Optional[bool] = True,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    custom_hostname(可选[str]):在网站显示在自定义URL(例如Substack博客)下时,强制使用特定的主机名
    include_url_in_text(可选[bool]):在文档文本中包含参考URL

Returns:
    List[Document]:文档列表。
"""
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""

            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            extra_info = {"URL": url}
            if hostname in self._website_extractor:
                data, metadata = self._website_extractor[hostname](
                    soup=soup, url=url, include_url_in_text=include_url_in_text
                )
                extra_info.update(metadata)

            else:
                data = soup.getText()

            documents.append(Document(text=data, id_=url, extra_info=extra_info))

        return documents
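
Example usage (a minimal sketch; the import path and the custom extractor below are assumptions, not part of the source):

from llama_index.readers.web import BeautifulSoupWebReader

def example_extractor(soup, url, include_url_in_text=True):
    # Custom extractors receive the BeautifulSoup object and must return (text, metadata).
    text = soup.get_text()
    if include_url_in_text:
        text = f"{text}\n\nSource: {url}"
    return text, {"hostname": "example.com"}

reader = BeautifulSoupWebReader(website_extractor={"example.com": example_extractor})
documents = reader.load_data(urls=["https://example.com"], include_url_in_text=True)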

class_name classmethod #

class_name() -> str

Get the name identifier of the class.

Source code in llama_index/readers/web/beautiful_soup_web/base.py
@classmethod
def class_name(cls) -> str:
    """获取类的名称标识符。"""
    return "BeautifulSoupWebReader"

load_data #

load_data(
    urls: List[str],
    custom_hostname: Optional[str] = None,
    include_url_in_text: Optional[bool] = True,
) -> List[Document]

Load data from the urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/beautiful_soup_web/base.py
    def load_data(
        self,
        urls: List[str],
        custom_hostname: Optional[str] = None,
        include_url_in_text: Optional[bool] = True,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    custom_hostname(可选[str]):在网站显示在自定义URL(例如Substack博客)下时,强制使用特定的主机名
    include_url_in_text(可选[bool]):在文档文本中包含参考URL

Returns:
    List[Document]:文档列表。
"""
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""

            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            extra_info = {"URL": url}
            if hostname in self._website_extractor:
                data, metadata = self._website_extractor[hostname](
                    soup=soup, url=url, include_url_in_text=include_url_in_text
                )
                extra_info.update(metadata)

            else:
                data = soup.getText()

            documents.append(Document(text=data, id_=url, extra_info=extra_info))

        return documents

BrowserbaseWebReader #

Bases: BaseReader

Browserbase web reader.

Load pre-rendered web pages using a headless browser hosted on Browserbase. Depends on the browserbase package. Get your API key from https://browserbase.com.

Source code in llama_index/readers/web/browserbase_web/base.py
class BrowserbaseWebReader(BaseReader):
    """浏览器基础Web阅读器。

    使用托管在Browserbase上的无头浏览器加载预渲染的网页。
    依赖于`browserbase`包。
    从https://browserbase.com 获取您的API密钥。"""

    def __init__(
        self,
        api_key: Optional[str] = None,
        project_id: Optional[str] = None,
    ) -> None:
        try:
            from browserbase import Browserbase
        except ImportError:
            raise ImportError(
                "`browserbase` package not found, please run `pip install browserbase`"
            )

        self.browserbase = Browserbase(api_key, project_id)

    def lazy_load_data(
        self,
        urls: Sequence[str],
        text_content: bool = False,
        session_id: Optional[str] = None,
        proxy: Optional[bool] = None,
    ) -> Iterator[Document]:
        """从URL加载页面。"""
        pages = self.browserbase.load_urls(urls, text_content, session_id, proxy)

        for i, page in enumerate(pages):
            yield Document(
                text=page,
                metadata={
                    "url": urls[i],
                },
            )
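
Example usage (a sketch assuming the llama-index-readers-web import path; the API key and project id are placeholders obtained from browserbase.com):

from llama_index.readers.web import BrowserbaseWebReader

reader = BrowserbaseWebReader(api_key="YOUR_API_KEY", project_id="YOUR_PROJECT_ID")
# lazy_load_data yields Documents one page at a time.
documents = list(reader.lazy_load_data(urls=["https://example.com"], text_content=True))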

lazy_load_data #

lazy_load_data(
    urls: Sequence[str],
    text_content: bool = False,
    session_id: Optional[str] = None,
    proxy: Optional[bool] = None,
) -> Iterator[Document]

Load pages from urls.

Source code in llama_index/readers/web/browserbase_web/base.py
def lazy_load_data(
    self,
    urls: Sequence[str],
    text_content: bool = False,
    session_id: Optional[str] = None,
    proxy: Optional[bool] = None,
) -> Iterator[Document]:
    """从URL加载页面。"""
    pages = self.browserbase.load_urls(urls, text_content, session_id, proxy)

    for i, page in enumerate(pages):
        yield Document(
            text=page,
            metadata={
                "url": urls[i],
            },
        )

KnowledgeBaseWebReader #

Bases: BaseReader

Knowledge base reader.

Crawls and reads articles from a knowledge base / help center with Playwright. Tested on Zendesk and Intercom CMS; it may also work on other platforms. It can run in headless mode, but it may get blocked by Cloudflare; to be safe, run it with a headed browser. It occasionally times out; if that happens, simply increase the default timeout. Requires the playwright package.

Parameters:

root_url (str, required): Base URL of the knowledge base, without a trailing slash, e.g. 'https://support.intercom.com'.
link_selectors (List[str], required): List of CSS selectors used to find article links while crawling, e.g. ['.article-list a', '.article-list a'].
article_path (str, required): URL path of articles on this domain, so the crawler knows when to stop, e.g. '/articles'.
title_selector (Optional[str], default None): CSS selector used to find the article title, e.g. '.article-title'.
subtitle_selector (Optional[str], default None): CSS selector used to find the article subtitle/description, e.g. '.article-subtitle'.
body_selector (Optional[str], default None): CSS selector used to find the article body, e.g. '.article-body'.
Source code in llama_index/readers/web/knowledge_base/base.py
class KnowledgeBaseWebReader(BaseReader):
    """知识库阅读器。

    使用Playwright爬取和阅读知识库/帮助中心的文章。
    在Zendesk和Intercom CMS上进行了测试,可能也适用于其他平台。
    可以在无头模式下运行,但可能会被Cloudflare阻止。为了安全起见,建议以有头模式运行。
    偶尔会超时,如果出现超时情况,只需增加默认超时时间即可。
    需要安装`playwright`包。

    Args:
        root_url (str): 知识库的基本url,末尾不带斜杠
            例如 'https://support.intercom.com'
        link_selectors (List[str]): 用于在爬取过程中查找文章链接的css选择器列表
            例如 ['.article-list a', '.article-list a']
        article_path (str): 该域上文章的url路径,以便爬虫知道何时停止
            例如 '/articles'
        title_selector (Optional[str]): 用于查找文章标题的css选择器
            例如 '.article-title'
        subtitle_selector (Optional[str]): 用于查找文章副标题/描述的css选择器
            例如 '.article-subtitle'
        body_selector (Optional[str]): 用于查找文章正文的css选择器
            例如 '.article-body'"""

    def __init__(
        self,
        root_url: str,
        link_selectors: List[str],
        article_path: str,
        title_selector: Optional[str] = None,
        subtitle_selector: Optional[str] = None,
        body_selector: Optional[str] = None,
    ) -> None:
        """使用参数进行初始化。"""
        self.root_url = root_url
        self.link_selectors = link_selectors
        self.article_path = article_path
        self.title_selector = title_selector
        self.subtitle_selector = subtitle_selector
        self.body_selector = body_selector

    def load_data(self) -> List[Document]:
        """从知识库加载数据。"""
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=False)

            # Crawl
            article_urls = self.get_article_urls(
                browser,
                self.root_url,
                self.root_url,
            )

            # Scrape
            documents = []
            for url in article_urls:
                article = self.scrape_article(
                    browser,
                    url,
                )
                extra_info = {
                    "title": article["title"],
                    "subtitle": article["subtitle"],
                    "url": article["url"],
                }
                documents.append(Document(text=article["body"], extra_info=extra_info))

            browser.close()

            return documents

    def scrape_article(
        self,
        browser: Any,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Dict[str, str]:文章属性与其值的映射。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(url, wait_until="domcontentloaded")

        title = (
            (
                page.query_selector(self.title_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.title_selector
            else ""
        )
        subtitle = (
            (
                page.query_selector(self.subtitle_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.subtitle_selector
            else ""
        )
        body = (
            (page.query_selector(self.body_selector).evaluate("node => node.innerText"))
            if self.body_selector
            else ""
        )

        page.close()
        print("scraped:", url)
        return {"title": title, "subtitle": subtitle, "body": body, "url": url}

    def get_article_urls(
        self, browser: Any, root_url: str, current_url: str
    ) -> List[str]:
        """递归地遍历知识库,以找到文章列表。

Args:
    browser (Any): Playwright Chromium 浏览器。
    root_url (str): 知识库的根URL。
    current_url (str): 正在遍历的当前URL。

Returns:
    List[str]: 找到的文章的URL列表。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(current_url, wait_until="domcontentloaded")

        # If this is a leaf node aka article page, return itself
        if self.article_path in current_url:
            print("Found an article: ", current_url)
            page.close()
            return [current_url]

        # Otherwise crawl this page and find all the articles linked from it
        article_urls = []
        links = []

        for link_selector in self.link_selectors:
            ahrefs = page.query_selector_all(link_selector)
            links.extend(ahrefs)

        for link in links:
            url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
            article_urls.extend(self.get_article_urls(browser, root_url, url))

        page.close()

        return article_urls
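
Example usage (a sketch; the selectors are illustrative and must be adapted to the target help center, and the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import KnowledgeBaseWebReader

reader = KnowledgeBaseWebReader(
    root_url="https://support.intercom.com",
    link_selectors=[".article-list a", ".article-list a"],
    article_path="/articles",
    title_selector=".article-title",
    subtitle_selector=".article-subtitle",
    body_selector=".article-body",
)
# Launches a headed Chromium browser via Playwright and crawls for articles.
documents = reader.load_data()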

load_data #

load_data() -> List[Document]

Load data from the knowledge base.

Source code in llama_index/readers/web/knowledge_base/base.py
def load_data(self) -> List[Document]:
    """从知识库加载数据。"""
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)

        # Crawl
        article_urls = self.get_article_urls(
            browser,
            self.root_url,
            self.root_url,
        )

        # Scrape
        documents = []
        for url in article_urls:
            article = self.scrape_article(
                browser,
                url,
            )
            extra_info = {
                "title": article["title"],
                "subtitle": article["subtitle"],
                "url": article["url"],
            }
            documents.append(Document(text=article["body"], extra_info=extra_info))

        browser.close()

        return documents

scrape_article #

scrape_article(browser: Any, url: str) -> Dict[str, str]

Scrape a single article url.

Returns:

Dict[str, str]: Mapping of article attributes to their values.

Source code in llama_index/readers/web/knowledge_base/base.py
    def scrape_article(
        self,
        browser: Any,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Dict[str, str]:文章属性与其值的映射。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(url, wait_until="domcontentloaded")

        title = (
            (
                page.query_selector(self.title_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.title_selector
            else ""
        )
        subtitle = (
            (
                page.query_selector(self.subtitle_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.subtitle_selector
            else ""
        )
        body = (
            (page.query_selector(self.body_selector).evaluate("node => node.innerText"))
            if self.body_selector
            else ""
        )

        page.close()
        print("scraped:", url)
        return {"title": title, "subtitle": subtitle, "body": body, "url": url}

get_article_urls #

get_article_urls(
    browser: Any, root_url: str, current_url: str
) -> List[str]

Recursively crawl the knowledge base to find a list of articles.

Parameters:

browser (Any, required): A Playwright Chromium browser.
root_url (str, required): Root URL of the knowledge base.
current_url (str, required): Current URL being crawled.

Returns:

List[str]: List of URLs of the articles found.

Source code in llama_index/readers/web/knowledge_base/base.py
    def get_article_urls(
        self, browser: Any, root_url: str, current_url: str
    ) -> List[str]:
        """递归地遍历知识库,以找到文章列表。

Args:
    browser (Any): Playwright Chromium 浏览器。
    root_url (str): 知识库的根URL。
    current_url (str): 正在遍历的当前URL。

Returns:
    List[str]: 找到的文章的URL列表。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(current_url, wait_until="domcontentloaded")

        # If this is a leaf node aka article page, return itself
        if self.article_path in current_url:
            print("Found an article: ", current_url)
            page.close()
            return [current_url]

        # Otherwise crawl this page and find all the articles linked from it
        article_urls = []
        links = []

        for link_selector in self.link_selectors:
            ahrefs = page.query_selector_all(link_selector)
            links.extend(ahrefs)

        for link in links:
            url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
            article_urls.extend(self.get_article_urls(browser, root_url, url))

        page.close()

        return article_urls

MainContentExtractorReader #

Bases: BaseReader

MainContentExtractor web page reader.

Reads pages from the web.

Parameters:

text_format (str, optional, default 'markdown'): Format of the extracted text. Defaults to "markdown". Requires the MainContentExtractor package.
Source code in llama_index/readers/web/main_content_extractor/base.py
class MainContentExtractorReader(BaseReader):
    """主要内容提取器网页阅读器。

    从网页中读取页面。

    Args:
        text_format (str, 可选): 文本的格式。默认为 "markdown"。
            需要 `MainContentExtractor` 包。"""

    def __init__(self, text_format: str = "markdown") -> None:
        """使用参数进行初始化。"""
        self.text_format = text_format

    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        from main_content_extractor import MainContentExtractor

        documents = []
        for url in urls:
            response = requests.get(url).text
            response = MainContentExtractor.extract(
                response, output_format=self.text_format, include_links=False
            )

            documents.append(Document(text=response))

        return documents
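
Example usage (a minimal sketch with a placeholder URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import MainContentExtractorReader

reader = MainContentExtractorReader(text_format="markdown")
documents = reader.load_data(urls=["https://example.com"])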

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from the input urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/main_content_extractor/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        from main_content_extractor import MainContentExtractor

        documents = []
        for url in urls:
            response = requests.get(url).text
            response = MainContentExtractor.extract(
                response, output_format=self.text_format, include_links=False
            )

            documents.append(Document(text=response))

        return documents

NewsArticleReader #

Bases: BaseReader

Simple news article reader.

Reads news articles from the web and parses them using the newspaper library.

Parameters:

text_mode (bool, default True): Whether to load the text version or the HTML version of the content.
use_nlp (bool, default True): Whether to use NLP to extract an additional summary and keywords.
newspaper_kwargs (Any, default {}): Additional keyword arguments passed to newspaper.Article. See https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html#article
Source code in llama_index/readers/web/news/base.py
class NewsArticleReader(BaseReader):
    """简单的新闻文章阅读器。

    从网络上读取新闻文章,并使用`newspaper`库进行解析。

    Args:
        text_mode (bool): 是否加载内容的文本版本或HTML版本(默认为True)。
        use_nlp (bool): 是否使用自然语言处理来提取额外的摘要和关键词(默认为True)。
        newspaper_kwargs: 传递给newspaper.Article的额外关键字参数。参见
            https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html#article"""

    def __init__(
        self, text_mode: bool = True, use_nlp: bool = True, **newspaper_kwargs: Any
    ) -> None:
        """使用参数进行初始化。"""
        if find_spec("newspaper") is None:
            raise ImportError(
                "`newspaper` package not found, please run `pip install newspaper3k`"
            )
        self.load_text = text_mode
        self.use_nlp = use_nlp
        self.newspaper_kwargs = newspaper_kwargs

    def load_data(self, urls: List[str]) -> List[Document]:
        """从新闻文章网址列表中加载数据。

Args:
    urls(List[str]):要加载新闻文章的网址列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list) and not isinstance(urls, Generator):
            raise ValueError("urls must be a list or generator.")
        documents = []
        for url in urls:
            from newspaper import Article

            try:
                article = Article(url, **self.newspaper_kwargs)
                article.download()
                article.parse()

                if self.use_nlp:
                    article.nlp()

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

            metadata = {
                "title": getattr(article, "title", ""),
                "link": getattr(article, "url", getattr(article, "canonical_link", "")),
                "authors": getattr(article, "authors", []),
                "language": getattr(article, "meta_lang", ""),
                "description": getattr(article, "meta_description", ""),
                "publish_date": getattr(article, "publish_date", ""),
            }

            if self.load_text:
                content = article.text
            else:
                content = article.html

            if self.use_nlp:
                metadata["keywords"] = getattr(article, "keywords", [])
                metadata["summary"] = getattr(article, "summary", "")

            documents.append(Document(text=content, metadata=metadata))

        return documents
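
Example usage (a sketch with a placeholder article URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import NewsArticleReader

# use_nlp=False skips the extra summary/keyword extraction pass.
reader = NewsArticleReader(text_mode=True, use_nlp=False)
documents = reader.load_data(urls=["https://example.com/news/some-article"])
for doc in documents:
    print(doc.metadata["title"], doc.metadata["link"])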

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from a list of news article urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/news/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从新闻文章网址列表中加载数据。

Args:
    urls(List[str]):要加载新闻文章的网址列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list) and not isinstance(urls, Generator):
            raise ValueError("urls must be a list or generator.")
        documents = []
        for url in urls:
            from newspaper import Article

            try:
                article = Article(url, **self.newspaper_kwargs)
                article.download()
                article.parse()

                if self.use_nlp:
                    article.nlp()

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

            metadata = {
                "title": getattr(article, "title", ""),
                "link": getattr(article, "url", getattr(article, "canonical_link", "")),
                "authors": getattr(article, "authors", []),
                "language": getattr(article, "meta_lang", ""),
                "description": getattr(article, "meta_description", ""),
                "publish_date": getattr(article, "publish_date", ""),
            }

            if self.load_text:
                content = article.text
            else:
                content = article.html

            if self.use_nlp:
                metadata["keywords"] = getattr(article, "keywords", [])
                metadata["summary"] = getattr(article, "summary", "")

            documents.append(Document(text=content, metadata=metadata))

        return documents

ReadabilityWebPageReader #

Bases: BaseReader

Readability web page loader.

Extracts relevant information from a fully rendered web page. It is always assumed that the web page used as a data source contains text content.

  1. Load the page and wait for it to render (playwright).
  2. Inject Readability.js to extract the main content.
Source code in llama_index/readers/web/readability_web/base.py
class ReadabilityWebPageReader(BaseReader):
    """网页可读性加载器。

    从完全呈现的网页中提取相关信息。
    在处理过程中,始终假定用作数据源的网页包含文本内容。

    1. 加载页面并等待其呈现。(playwright)
    2. 注入Readability.js以提取主要内容。

    Args:
        proxy(可选[str],可选):代理服务器。默认为None。
        wait_until(可选[Literal["commit", "domcontentloaded", "load", "networkidle"]],可选):等待页面加载完成。默认为"domcontentloaded"。
        text_splitter(TextSplitter,可选):文本分割器。默认为None。
        normalizer(可选[Callable[[str], str]],可选):文本规范化器。默认为nfkc_normalize。"""

    def __init__(
        self,
        proxy: Optional[str] = None,
        wait_until: Optional[
            Literal["commit", "domcontentloaded", "load", "networkidle"]
        ] = "domcontentloaded",
        text_splitter: Optional[TextSplitter] = None,
        normalize: Optional[Callable[[str], str]] = nfkc_normalize,
    ) -> None:
        self._launch_options = {
            "headless": True,
        }
        self._wait_until = wait_until
        if proxy:
            self._launch_options["proxy"] = {
                "server": proxy,
            }
        self._text_splitter = text_splitter
        self._normalize = normalize
        self._readability_js = None

    async def async_load_data(self, url: str) -> List[Document]:
        """渲染并从URL加载数据内容。

Args:
    url(str):要抓取的URL。

Returns:
    List[Document]:文档列表。
"""
        from playwright.async_api import async_playwright

        async with async_playwright() as async_playwright:
            browser = await async_playwright.chromium.launch(**self._launch_options)

            article = await self.scrape_page(
                browser,
                url,
            )
            extra_info = {
                key: article[key]
                for key in [
                    "title",
                    "length",
                    "excerpt",
                    "byline",
                    "dir",
                    "lang",
                    "siteName",
                ]
            }

            if self._normalize is not None:
                article["textContent"] = self._normalize(article["textContent"])
            texts = []
            if self._text_splitter is not None:
                texts = self._text_splitter.split_text(article["textContent"])
            else:
                texts = [article["textContent"]]

            await browser.close()

            return [Document(text=x, extra_info=extra_info) for x in texts]

    def load_data(self, url: str) -> List[Document]:
        return async_to_sync(self.async_load_data(url))

    async def scrape_page(
        self,
        browser: Browser,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Ref:https://github.com/mozilla/readability
    title:文章标题;
    content:经过处理的文章内容的HTML字符串;
    textContent:去除所有HTML标记的文章文本内容;
    length:文章长度,以字符计算;
    excerpt:文章描述,或者从内容中摘录的简短摘要;
    byline:作者元数据;
    dir:内容方向;
    siteName:站点名称。
    lang:内容语言。
"""
        if self._readability_js is None:
            with open(path) as f:
                self._readability_js = f.read()

        inject_readability = f"""
            (function(){{
            {self._readability_js}
            function executor() {{
                return new Readability({{}}, document).parse();
            }}
            return executor();
            }}())
        """

        # browser = cast(Browser, browser)
        page = await browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        await page.goto(url, wait_until=self._wait_until)

        r = await page.evaluate(inject_readability)

        await page.close()
        print("scraped:", url)

        return r
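
Example usage (a minimal sketch with a placeholder URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import ReadabilityWebPageReader

reader = ReadabilityWebPageReader(wait_until="networkidle")
# load_data takes a single URL and runs the async scraper synchronously.
documents = reader.load_data(url="https://example.com/article")
print(documents[0].metadata["title"])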

async_load_data async #

async_load_data(url: str) -> List[Document]

Render and load data content from a url.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/readability_web/base.py
    async def async_load_data(self, url: str) -> List[Document]:
        """渲染并从URL加载数据内容。

Args:
    url(str):要抓取的URL。

Returns:
    List[Document]:文档列表。
"""
        from playwright.async_api import async_playwright

        async with async_playwright() as async_playwright:
            browser = await async_playwright.chromium.launch(**self._launch_options)

            article = await self.scrape_page(
                browser,
                url,
            )
            extra_info = {
                key: article[key]
                for key in [
                    "title",
                    "length",
                    "excerpt",
                    "byline",
                    "dir",
                    "lang",
                    "siteName",
                ]
            }

            if self._normalize is not None:
                article["textContent"] = self._normalize(article["textContent"])
            texts = []
            if self._text_splitter is not None:
                texts = self._text_splitter.split_text(article["textContent"])
            else:
                texts = [article["textContent"]]

            await browser.close()

            return [Document(text=x, extra_info=extra_info) for x in texts]

scrape_page async #

scrape_page(browser: Browser, url: str) -> Dict[str, str]

Scrape a single article url.

Returns:

Dict[str, str] with the following keys (ref: https://github.com/mozilla/readability):

title: article title;
content: HTML string of the processed article content;
textContent: text content of the article, with all the HTML tags removed;
length: length of the article, in characters;
excerpt: article description, or a short excerpt from the content;
byline: author metadata;
dir: content direction;
siteName: name of the site;
lang: content language.

Source code in llama_index/readers/web/readability_web/base.py
    async def scrape_page(
        self,
        browser: Browser,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Ref:https://github.com/mozilla/readability
    title:文章标题;
    content:经过处理的文章内容的HTML字符串;
    textContent:去除所有HTML标记的文章文本内容;
    length:文章长度,以字符计算;
    excerpt:文章描述,或者从内容中摘录的简短摘要;
    byline:作者元数据;
    dir:内容方向;
    siteName:站点名称。
    lang:内容语言。
"""
        if self._readability_js is None:
            with open(path) as f:
                self._readability_js = f.read()

        inject_readability = f"""
            (function(){{
            {self._readability_js}
            function executor() {{
                return new Readability({{}}, document).parse();
            }}
            return executor();
            }}())
        """

        # browser = cast(Browser, browser)
        page = await browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        await page.goto(url, wait_until=self._wait_until)

        r = await page.evaluate(inject_readability)

        await page.close()
        print("scraped:", url)

        return r

RssNewsReader #

Bases: BaseReader

RSS news reader.

Reads news content from RSS feeds and parses it with NewsArticleReader.

Source code in llama_index/readers/web/rss_news/base.py
class RssNewsReader(BaseReader):
    """RSS新闻阅读器。

从RSS订阅源中读取新闻内容,并使用NewsArticleReader进行解析。"""

    def __init__(self, **reader_kwargs: Any) -> None:
        """使用参数进行初始化。

Args:
    html_to_text (bool): 是否将HTML转换为文本。
        需要`html2text`包。
"""
        try:
            import feedparser  # noqa: F401
        except ImportError:
            raise ImportError(
                "`feedparser` package not found, please run `pip install feedparser`"
            )

        try:
            import listparser  # noqa: F401
        except ImportError:
            raise ImportError(
                "`listparser` package not found, please run `pip install listparser`"
            )

        self.reader_kwargs = reader_kwargs

    def load_data(self, urls: List[str] = None, opml: str = None) -> List[Document]:
        """从RSS订阅源或OPML加载数据。

Args:
    urls (List[str]): 要加载的RSS URL列表。
    opml (str): OPML文件的URL或字符串或字节OPML内容。

Returns:
    List[Document]: 文档列表。
"""
        if (urls is None) == (
            opml is None
        ):  # This is True if both are None or neither is None
            raise ValueError(
                "Provide either the urls or the opml argument, but not both."
            )

        import feedparser

        if urls and not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        if not urls and opml:
            try:
                import listparser
            except ImportError as e:
                raise ImportError(
                    "Package listparser must be installed if the opml arg is used. "
                    "Please install with 'pip install listparser' or use the "
                    "urls arg instead."
                ) from e
            rss = listparser.parse(opml)
            urls = [feed.url for feed in rss.feeds]

        for url in urls:
            try:
                feed = feedparser.parse(url)
                for i, entry in enumerate(feed.entries):
                    article = NewsArticleReader(**self.reader_kwargs).load_data(
                        urls=[entry.link],
                    )[0]
                    article.metadata["feed"] = url

                    documents.append(
                        Document(text=article.text, metadata=article.metadata)
                    )

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

        return documents
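
Example usage (a sketch with placeholder feed URLs; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import RssNewsReader

# Keyword arguments are forwarded to NewsArticleReader.
reader = RssNewsReader(use_nlp=False)
documents = reader.load_data(urls=["https://example.com/rss.xml"])

# Alternatively, pass an OPML file instead of urls (but not both):
# documents = reader.load_data(opml="https://example.com/feeds.opml")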

load_data #

load_data(
    urls: List[str] = None, opml: str = None
) -> List[Document]

Load data from either RSS feeds or OPML.

Parameters:

urls (List[str], default None): List of RSS URLs to load.
opml (str, default None): URL to an OPML file, or a string or bytes OPML content.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/rss_news/base.py
    def load_data(self, urls: List[str] = None, opml: str = None) -> List[Document]:
        """从RSS订阅源或OPML加载数据。

Args:
    urls (List[str]): 要加载的RSS URL列表。
    opml (str): OPML文件的URL或字符串或字节OPML内容。

Returns:
    List[Document]: 文档列表。
"""
        if (urls is None) == (
            opml is None
        ):  # This is True if both are None or neither is None
            raise ValueError(
                "Provide either the urls or the opml argument, but not both."
            )

        import feedparser

        if urls and not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        if not urls and opml:
            try:
                import listparser
            except ImportError as e:
                raise ImportError(
                    "Package listparser must be installed if the opml arg is used. "
                    "Please install with 'pip install listparser' or use the "
                    "urls arg instead."
                ) from e
            rss = listparser.parse(opml)
            urls = [feed.url for feed in rss.feeds]

        for url in urls:
            try:
                feed = feedparser.parse(url)
                for i, entry in enumerate(feed.entries):
                    article = NewsArticleReader(**self.reader_kwargs).load_data(
                        urls=[entry.link],
                    )[0]
                    article.metadata["feed"] = url

                    documents.append(
                        Document(text=article.text, metadata=article.metadata)
                    )

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

        return documents

RssReader #

Bases: BasePydanticReader

RSS reader.

Reads content from RSS feeds.

Source code in llama_index/readers/web/rss/base.py
class RssReader(BasePydanticReader):
    """RSS阅读器。

从RSS源中读取内容。"""

    is_remote: bool = True
    html_to_text: bool = False

    @classmethod
    def class_name(cls) -> str:
        return "RssReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """从RSS源加载数据。

Args:
    urls(List[str]):要加载的RSS URL列表。

Returns:
    List[Document]:文档列表。
"""
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                doc_id = entry.id or entry.link
                if "content" in entry:
                    data = entry.content[0].value
                else:
                    data = entry.description or entry.summary

                if self.html_to_text:
                    import html2text

                    data = html2text.html2text(data)

                extra_info = {"title": entry.title, "link": entry.link}
                documents.append(Document(text=data, id_=doc_id, extra_info=extra_info))

        return documents
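
Example usage (a minimal sketch with a placeholder feed URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import RssReader

reader = RssReader(html_to_text=True)
documents = reader.load_data(urls=["https://example.com/feed.xml"])
for doc in documents:
    print(doc.metadata["title"], doc.metadata["link"])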

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from RSS feeds.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/rss/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从RSS源加载数据。

Args:
    urls(List[str]):要加载的RSS URL列表。

Returns:
    List[Document]:文档列表。
"""
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                doc_id = entry.id or entry.link
                if "content" in entry:
                    data = entry.content[0].value
                else:
                    data = entry.description or entry.summary

                if self.html_to_text:
                    import html2text

                    data = html2text.html2text(data)

                extra_info = {"title": entry.title, "link": entry.link}
                documents.append(Document(text=data, id_=doc_id, extra_info=extra_info))

        return documents

ScrapflyReader #

Bases: BasePydanticReader

Turn a url into LLM-accessible markdown with Scrapfly.io.

Args:
    api_key: The Scrapfly API key.
    scrape_config: Scrapfly ScrapeConfig object.
    ignore_scrape_failures: Whether to continue on failures.
    urls: List of urls to scrape.
    scrape_format: Scrape result format ("markdown" or "text").
For more details, visit: https://scrapfly.io/docs/sdk/python

Source code in llama_index/readers/web/scrapfly_web/base.py
class ScrapflyReader(BasePydanticReader):
    """将url转换为llm可访问的markdown格式,使用`Scrapfly.io`。

    Args:
    api_key: Scrapfly API密钥。
    scrape_config: Scrapfly ScrapeConfig对象。
    ignore_scrape_failures: 是否在失败时继续。
    urls: 要抓取的url列表。
    scrape_format: 抓取结果格式(markdown或text)
    更多详情,请访问: https://scrapfly.io/docs/sdk/python"""

    api_key: str
    ignore_scrape_failures: bool = True
    scrapfly: Optional["ScrapflyClient"] = None  # Declare the scrapfly attribute

    def __init__(self, api_key: str, ignore_scrape_failures: bool = True) -> None:
        """初始化客户端。"""
        super().__init__(api_key=api_key, ignore_scrape_failures=ignore_scrape_failures)
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        self.scrapfly = ScrapflyClient(key=api_key)

    @classmethod
    def class_name(cls) -> str:
        return "Scrapfly_reader"

    def load_data(
        self,
        urls: List[str],
        scrape_format: Literal["markdown", "text"] = "markdown",
        scrape_config: Optional[dict] = None,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls: List[str]: 要抓取的URL列表。
    scrape_config: Optional[dict]: ScrapFly抓取配置对象的字典。

Returns:
    List[Document]: 文档列表。

引发:
    ValueError: 如果未提供URL。
"""
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        if urls is None:
            raise ValueError("URLs must be provided.")
        scrape_config = scrape_config if scrape_config is not None else {}

        documents = []
        for url in urls:
            try:
                response: ScrapeApiResponse = self.scrapfly.scrape(
                    ScrapeConfig(url, format=scrape_format, **scrape_config)
                )
                documents.append(
                    Document(
                        text=response.scrape_result["content"], extra_info={"url": url}
                    )
                )
            except Exception as e:
                if self.ignore_scrape_failures:
                    logger.error(f"Error fetching data from {url}, exception: {e}")
                else:
                    raise e  # noqa: TRY201

        return documents
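
Example usage (a sketch; the API key is a placeholder and the scrape_config keys are assumptions about the Scrapfly SDK that should be checked against its documentation):

from llama_index.readers.web import ScrapflyReader

reader = ScrapflyReader(api_key="YOUR_SCRAPFLY_API_KEY", ignore_scrape_failures=True)
documents = reader.load_data(
    urls=["https://example.com"],
    scrape_format="markdown",
    scrape_config={"render_js": True},  # forwarded to ScrapeConfig (assumed option)
)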

load_data #

load_data(
    urls: List[str],
    scrape_format: Literal["markdown", "text"] = "markdown",
    scrape_config: Optional[dict] = None,
) -> List[Document]

Load data from the urls.

Parameters:

urls (List[str], required): List of URLs to scrape.
scrape_config (Optional[dict], default None): Dictionary of ScrapFly scrape config options.

Returns:

List[Document]: List of documents.

Raises:

ValueError: If URLs aren't provided.

Source code in llama_index/readers/web/scrapfly_web/base.py
    def load_data(
        self,
        urls: List[str],
        scrape_format: Literal["markdown", "text"] = "markdown",
        scrape_config: Optional[dict] = None,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls: List[str]: 要抓取的URL列表。
    scrape_config: Optional[dict]: ScrapFly抓取配置对象的字典。

Returns:
    List[Document]: 文档列表。

引发:
    ValueError: 如果未提供URL。
"""
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        if urls is None:
            raise ValueError("URLs must be provided.")
        scrape_config = scrape_config if scrape_config is not None else {}

        documents = []
        for url in urls:
            try:
                response: ScrapeApiResponse = self.scrapfly.scrape(
                    ScrapeConfig(url, format=scrape_format, **scrape_config)
                )
                documents.append(
                    Document(
                        text=response.scrape_result["content"], extra_info={"url": url}
                    )
                )
            except Exception as e:
                if self.ignore_scrape_failures:
                    logger.error(f"Error fetching data from {url}, exception: {e}")
                else:
                    raise e  # noqa: TRY201

        return documents

SimpleWebPageReader #

Bases: BasePydanticReader

Simple web page reader.

Reads pages from the web.

Parameters:

html_to_text (bool, default False): Whether to convert HTML to text. Requires the html2text package.
metadata_fn (Optional[Callable[[str], Dict]], default None): A function that takes a URL and returns a dictionary of metadata. Defaults to None.
Source code in llama_index/readers/web/simple_web/base.py
class SimpleWebPageReader(BasePydanticReader):
    """简单的网页阅读器。

    从网页中读取页面。

    Args:
        html_to_text (bool): 是否将HTML转换为文本。
            需要`html2text`包。
        metadata_fn (Optional[Callable[[str], Dict]]): 一个接受URL并返回元数据字典的函数。
            默认值为None。"""

    is_remote: bool = True
    html_to_text: bool

    _metadata_fn: Optional[Callable[[str], Dict]] = PrivateAttr()

    def __init__(
        self,
        html_to_text: bool = False,
        metadata_fn: Optional[Callable[[str], Dict]] = None,
    ) -> None:
        """使用参数进行初始化。"""
        try:
            import html2text  # noqa
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        self._metadata_fn = metadata_fn
        super().__init__(html_to_text=html_to_text)

    @classmethod
    def class_name(cls) -> str:
        return "SimpleWebPageReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            response = requests.get(url, headers=None).text
            if self.html_to_text:
                import html2text

                response = html2text.html2text(response)

            metadata: Optional[Dict] = None
            if self._metadata_fn is not None:
                metadata = self._metadata_fn(url)

            documents.append(Document(text=response, id_=url, metadata=metadata or {}))

        return documents
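
Example usage (a minimal sketch; metadata_fn below is a hypothetical callable and the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import SimpleWebPageReader

reader = SimpleWebPageReader(
    html_to_text=True,
    metadata_fn=lambda url: {"source_url": url},  # tag each document with its URL
)
documents = reader.load_data(urls=["https://example.com"])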

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from the input urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/simple_web/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            response = requests.get(url, headers=None).text
            if self.html_to_text:
                import html2text

                response = html2text.html2text(response)

            metadata: Optional[Dict] = None
            if self._metadata_fn is not None:
                metadata = self._metadata_fn(url)

            documents.append(Document(text=response, id_=url, metadata=metadata or {}))

        return documents

SitemapReader #

Bases: BaseReader

Asynchronous sitemap reader for the web.

Reads pages from the web based on their sitemap.xml.

Parameters:

sitemap_url (string, required): Path to the sitemap.xml, e.g. https://gpt-index.readthedocs.io/sitemap.xml
html_to_text (bool, default False): Whether to convert HTML to text. Requires the html2text package.
limit (int, default 10): Maximum number of concurrent requests.

Source code in llama_index/readers/web/sitemap/base.py
class SitemapReader(BaseReader):
    """异步站点地图读取器,用于网络。

    根据其 sitemap.xml 从网络中读取页面。

    Args:
        sitemap_url (string): sitemap.xml 的路径。例如 https://gpt-index.readthedocs.io/sitemap.xml
    html_to_text (bool): 是否将 HTML 转换为文本。
        需要 `html2text` 包。
    limit (int): 最大并发请求数。"""

    xml_schema_sitemap = "http://www.sitemaps.org/schemas/sitemap/0.9"

    def __init__(self, html_to_text: bool = False, limit: int = 10) -> None:
        """使用参数进行初始化。"""
        self._async_loader = AsyncWebPageReader(html_to_text=html_to_text, limit=limit)
        self._html_to_text = html_to_text
        self._limit = limit

    def _load_sitemap(self, sitemap_url: str) -> str:
        sitemap_url_request = urllib.request.urlopen(sitemap_url)

        return sitemap_url_request.read()

    def _parse_sitemap(self, raw_sitemap: str, filter_locs: str = None) -> list:
        sitemap = ET.fromstring(raw_sitemap)
        sitemap_urls = []

        for url in sitemap.findall(f"{{{self.xml_schema_sitemap}}}url"):
            location = url.find(f"{{{self.xml_schema_sitemap}}}loc").text

            if filter_locs is None or filter_locs in location:
                sitemap_urls.append(location)

        return sitemap_urls

    def load_data(self, sitemap_url: str, filter: str = None) -> List[Document]:
        sitemap = self._load_sitemap(sitemap_url=sitemap_url)
        sitemap_urls = self._parse_sitemap(sitemap, filter)

        return self._async_loader.load_data(urls=sitemap_urls)
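
A minimal usage sketch for the reader above (the import path is an assumption based on the "Source code in" path; the filter value is illustrative):

from llama_index.readers.web import SitemapReader

reader = SitemapReader(html_to_text=True, limit=10)
# Only sitemap <loc> entries containing the filter substring are fetched.
documents = reader.load_data(
    sitemap_url="https://gpt-index.readthedocs.io/sitemap.xml",
    filter="/en/stable/",
)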

TrafilaturaWebReader #

Bases: BasePydanticReader

Trafilatura web page reader.

Reads pages from the web. Requires the trafilatura package.

Source code in llama_index/readers/web/trafilatura_web/base.py
class TrafilaturaWebReader(BasePydanticReader):
    """Trafilatura 网页读取器。

    从网页中读取页面。
    需要 `trafilatura` 包。"""

    is_remote: bool = True

    @classmethod
    def class_name(cls) -> str:
        """获取类的名称标识符。"""
        return "TrafilaturaWebReader"

    def load_data(
        self,
        urls: List[str],
        include_comments=True,
        output_format="txt",
        include_tables=True,
        include_images=False,
        include_formatting=False,
        include_links=False,
        show_progress=False,
        **kwargs,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    include_comments(bool,可选):是否在输出中包含注释。默认为True。
    output_format(str,可选):输出格式。默认为'txt'。
    include_tables(bool,可选):是否在输出中包含表格。默认为True。
    include_images(bool,可选):是否在输出中包含图像。默认为False。
    include_formatting(bool,可选):是否在输出中包含格式。默认为False。
    include_links(bool,可选):是否在输出中包含链接。默认为False。
    show_progress(bool,可选):是否显示进度条。默认为False。
    kwargs:`trafilatura.extract`函数的额外关键字参数。

Returns:
    List[Document]:文档列表。
"""
        import trafilatura

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []

        if show_progress:
            from tqdm import tqdm

            iterator = tqdm(urls, desc="Downloading pages")
        else:
            iterator = urls
        for url in iterator:
            downloaded = trafilatura.fetch_url(url)
            response = trafilatura.extract(
                downloaded,
                include_comments=include_comments,
                output_format=output_format,
                include_tables=include_tables,
                include_images=include_images,
                include_formatting=include_formatting,
                include_links=include_links,
                **kwargs,
            )
            documents.append(Document(text=response, id_=url))

        return documents
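
A brief usage sketch following the signature above (the import path is an assumption; the URL is illustrative):

from llama_index.readers.web import TrafilaturaWebReader

reader = TrafilaturaWebReader()
documents = reader.load_data(
    urls=["https://example.com/article"],
    include_comments=False,  # drop reader comments from the extracted text
    output_format="txt",
    show_progress=True,      # requires tqdm
)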

class_name classmethod #

class_name() -> str

Get the name identifier of the class.

Source code in llama_index/readers/web/trafilatura_web/base.py
@classmethod
def class_name(cls) -> str:
    """获取类的名称标识符。"""
    return "TrafilaturaWebReader"

load_data #

load_data(
    urls: List[str],
    include_comments=True,
    output_format="txt",
    include_tables=True,
    include_images=False,
    include_formatting=False,
    include_links=False,
    show_progress=False,
    **kwargs
) -> List[Document]

Load data from the urls.

Returns:

Type Description
List[Document]

List of documents.

Source code in llama_index/readers/web/trafilatura_web/base.py
    def load_data(
        self,
        urls: List[str],
        include_comments=True,
        output_format="txt",
        include_tables=True,
        include_images=False,
        include_formatting=False,
        include_links=False,
        show_progress=False,
        **kwargs,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    include_comments(bool,可选):是否在输出中包含注释。默认为True。
    output_format(str,可选):输出格式。默认为'txt'。
    include_tables(bool,可选):是否在输出中包含表格。默认为True。
    include_images(bool,可选):是否在输出中包含图像。默认为False。
    include_formatting(bool,可选):是否在输出中包含格式。默认为False。
    include_links(bool,可选):是否在输出中包含链接。默认为False。
    show_progress(bool,可选):是否显示进度条。默认为False。
    kwargs:`trafilatura.extract`函数的额外关键字参数。

Returns:
    List[Document]:文档列表。
"""
        import trafilatura

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []

        if show_progress:
            from tqdm import tqdm

            iterator = tqdm(urls, desc="Downloading pages")
        else:
            iterator = urls
        for url in iterator:
            downloaded = trafilatura.fetch_url(url)
            response = trafilatura.extract(
                downloaded,
                include_comments=include_comments,
                output_format=output_format,
                include_tables=include_tables,
                include_images=include_images,
                include_formatting=include_formatting,
                include_links=include_links,
                **kwargs,
            )
            documents.append(Document(text=response, id_=url))

        return documents
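
Because extra keyword arguments are forwarded verbatim to trafilatura.extract, extraction behaviour can be tuned per call. A sketch, assuming the installed trafilatura version supports the favor_precision flag:

reader = TrafilaturaWebReader()
documents = reader.load_data(
    urls=["https://example.com/article"],
    favor_precision=True,  # assumed trafilatura.extract option: prefer precision over recall
)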

UnstructuredURLLoader #

Bases: BaseReader

Loader that uses unstructured to load HTML files.

Source code in llama_index/readers/web/unstructured_web/base.py
class UnstructuredURLLoader(BaseReader):
    """使用unstructured来加载HTML文件的加载器。"""

    def __init__(
        self, urls: List[str], continue_on_failure: bool = True, headers: dict = {}
    ):
        """使用文件路径进行初始化。"""
        try:
            import unstructured  # noqa:F401
            from unstructured.__version__ import __version__ as __unstructured_version__

            self.__version = __unstructured_version__
        except ImportError:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )

        if not self.__is_headers_available() and len(headers.keys()) != 0:
            logger.warning(
                "You are using old version of unstructured. "
                "The headers parameter is ignored"
            )

        self.urls = urls
        self.continue_on_failure = continue_on_failure
        self.headers = headers

    def __is_headers_available(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])

        return unstructured_version >= (0, 5, 7)

    def load_data(self) -> List[Document]:
        """加载文件。"""
        from unstructured.partition.html import partition_html

        docs: List[Document] = []
        for url in self.urls:
            try:
                if self.__is_headers_available():
                    elements = partition_html(url=url, headers=self.headers)
                else:
                    elements = partition_html(url=url)
                text = "\n\n".join([str(el) for el in elements])
                metadata = {"source": url}
                docs.append(Document(text=text, extra_info=metadata))
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching or processing {url}, exception: {e}")
                else:
                    raise e  # noqa: TRY201
        return docs
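
A minimal usage sketch based on the constructor above (the import path is an assumption; note that headers are only honoured on unstructured >= 0.5.7):

from llama_index.readers.web import UnstructuredURLLoader

loader = UnstructuredURLLoader(
    urls=["https://example.com/page.html"],
    continue_on_failure=True,  # log and skip URLs that fail instead of raising
    headers={"User-Agent": "llama-index-reader"},  # ignored on older unstructured versions
)
documents = loader.load_data()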

load_data #

load_data() -> List[Document]

Load data from the urls.

Source code in llama_index/readers/web/unstructured_web/base.py
def load_data(self) -> List[Document]:
    """加载文件。"""
    from unstructured.partition.html import partition_html

    docs: List[Document] = []
    for url in self.urls:
        try:
            if self.__is_headers_available():
                elements = partition_html(url=url, headers=self.headers)
            else:
                elements = partition_html(url=url)
            text = "\n\n".join([str(el) for el in elements])
            metadata = {"source": url}
            docs.append(Document(text=text, extra_info=metadata))
        except Exception as e:
            if self.continue_on_failure:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
            else:
                raise e  # noqa: TRY201
    return docs

WholeSiteReader #

Bases: BaseReader

BFS web crawler for websites.

This class provides functionality to crawl an entire website using a breadth-first search (BFS) algorithm. It navigates web pages starting from a given base URL, following only links that match a specified prefix.

Attributes:

prefix (str): URL prefix to focus the crawl on.
max_depth (int): Maximum depth of the BFS algorithm.

Parameters:

Name Type Description Default
prefix str

URL prefix for the crawl.

required
max_depth int

Maximum depth of the BFS. Defaults to 10.

10
Source code in llama_index/readers/web/whole_site/base.py
class WholeSiteReader(BaseReader):
    """BFS网页爬虫用于网站。

该类提供了使用广度优先搜索算法来爬取整个网站的功能。
它从给定的基本URL导航网页,跟踪与指定前缀匹配的链接。

属性:
    prefix (str): 用于聚焦爬取的URL前缀。
    max_depth (int): BFS算法的最大深度。

Args:
    prefix (str): 用于爬取的URL前缀。
    max_depth (int, 可选): BFS的最大深度。默认为10。"""

    def __init__(
        self,
        prefix: str,
        max_depth: int = 10,
        driver: Optional[webdriver.Chrome] = None,
    ) -> None:
        """
        Initialize the WholeSiteReader with the provided prefix and maximum depth.
        """
        self.prefix = prefix
        self.max_depth = max_depth
        self.driver = driver if driver else self.setup_driver()

    def setup_driver(self):
        """设置Selenium WebDriver用于Chrome。

返回:
    WebDriver:Chrome WebDriver的一个实例。
"""
        try:
            import chromedriver_autoinstaller
        except ImportError:
            raise ImportError("Please install chromedriver_autoinstaller")

        opt = webdriver.ChromeOptions()
        opt.add_argument("--start-maximized")
        chromedriver_autoinstaller.install()
        return webdriver.Chrome(options=opt)

    def clean_url(self, url):
        return url.split("#")[0]

    def restart_driver(self):
        self.driver.quit()
        self.driver = self.setup_driver()

    def extract_content(self):
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        body_element = self.driver.find_element(By.TAG_NAME, "body")
        return body_element.text.strip()

    def extract_links(self):
        js_script = """
            var links = [];
            var elements = document.getElementsByTagName('a');
            for (var i = 0; i < elements.length; i++) {
                var href = elements[i].href;
                if (href) {
                    links.push(href);
                }
            }
            return links;
            """
        return self.driver.execute_script(js_script)

    def load_data(self, base_url: str) -> List[Document]:
        """使用BFS算法从基本URL加载数据。

Args:
    base_url(str):开始爬取的基本URL。

Returns:
    List[Document]:爬取的文档列表。
"""
        added_urls = set()
        urls_to_visit = [(base_url, 0)]
        documents = []

        while urls_to_visit:
            current_url, depth = urls_to_visit.pop(0)
            print(f"Visiting: {current_url}, {len(urls_to_visit)} left")

            try:
                self.driver.get(current_url)
                page_content = self.extract_content()
                added_urls.add(current_url)

                next_depth = depth + 1
                if next_depth <= self.max_depth:
                    # links = self.driver.find_elements(By.TAG_NAME, 'a')
                    links = self.extract_links()
                    # clean all urls
                    links = [self.clean_url(link) for link in links]
                    # extract new links
                    links = [link for link in links if link not in added_urls]
                    print(f"Found {len(links)} new potential links")

                    for href in links:
                        try:
                            if href.startswith(self.prefix) and href not in added_urls:
                                urls_to_visit.append((href, next_depth))
                                added_urls.add(href)
                        except Exception:
                            continue

                documents.append(
                    Document(text=page_content, extra_info={"URL": current_url})
                )
                time.sleep(1)

            except WebDriverException:
                print("WebDriverException encountered, restarting driver...")
                self.restart_driver()
            except Exception as e:
                print(f"An unexpected exception occurred: {e}, skipping URL...")
                continue

        self.driver.quit()
        return documents
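
A usage sketch for the crawler above (URLs are illustrative; the import path is an assumption):

from llama_index.readers.web import WholeSiteReader

scraper = WholeSiteReader(
    prefix="https://docs.llamaindex.ai/en/stable/",  # only follow links under this prefix
    max_depth=2,  # keep the BFS shallow for a small crawl
)
documents = scraper.load_data(base_url="https://docs.llamaindex.ai/en/stable/")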

setup_driver #

setup_driver()

Set up a Selenium WebDriver for Chrome.

Returns: WebDriver: An instance of the Chrome WebDriver.

Source code in llama_index/readers/web/whole_site/base.py
    def setup_driver(self):
        """设置Selenium WebDriver用于Chrome。

返回:
    WebDriver:Chrome WebDriver的一个实例。
"""
        try:
            import chromedriver_autoinstaller
        except ImportError:
            raise ImportError("Please install chromedriver_autoinstaller")

        opt = webdriver.ChromeOptions()
        opt.add_argument("--start-maximized")
        chromedriver_autoinstaller.install()
        return webdriver.Chrome(options=opt)
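
Since __init__ accepts an optional preconfigured webdriver.Chrome instance, a custom (for example headless) driver can replace the one created by setup_driver. A sketch, assuming Selenium and a compatible chromedriver are installed:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # run Chrome without a visible window
custom_driver = webdriver.Chrome(options=options)

reader = WholeSiteReader(
    prefix="https://docs.llamaindex.ai/en/stable/",
    driver=custom_driver,  # bypasses setup_driver()
)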

load_data #

load_data(base_url: str) -> List[Document]

Load data from the base URL using the BFS algorithm.

Returns:

Type Description
List[Document]

List of crawled documents.

Source code in llama_index/readers/web/whole_site/base.py
    def load_data(self, base_url: str) -> List[Document]:
        """使用BFS算法从基本URL加载数据。

Args:
    base_url(str):开始爬取的基本URL。

Returns:
    List[Document]:爬取的文档列表。
"""
        added_urls = set()
        urls_to_visit = [(base_url, 0)]
        documents = []

        while urls_to_visit:
            current_url, depth = urls_to_visit.pop(0)
            print(f"Visiting: {current_url}, {len(urls_to_visit)} left")

            try:
                self.driver.get(current_url)
                page_content = self.extract_content()
                added_urls.add(current_url)

                next_depth = depth + 1
                if next_depth <= self.max_depth:
                    # links = self.driver.find_elements(By.TAG_NAME, 'a')
                    links = self.extract_links()
                    # clean all urls
                    links = [self.clean_url(link) for link in links]
                    # extract new links
                    links = [link for link in links if link not in added_urls]
                    print(f"Found {len(links)} new potential links")

                    for href in links:
                        try:
                            if href.startswith(self.prefix) and href not in added_urls:
                                urls_to_visit.append((href, next_depth))
                                added_urls.add(href)
                        except Exception:
                            continue

                documents.append(
                    Document(text=page_content, extra_info={"URL": current_url})
                )
                time.sleep(1)

            except WebDriverException:
                print("WebDriverException encountered, restarting driver...")
                self.restart_driver()
            except Exception as e:
                print(f"An unexpected exception occurred: {e}, skipping URL...")
                continue

        self.driver.quit()
        return documents