Web

Init file.

AsyncWebPageReader #

Bases: BaseReader

Asynchronous web page reader.

Reads web pages asynchronously.

Parameters:

html_to_text (bool, default False): Whether to convert HTML to text. Requires the html2text package.
limit (int, default 10): Maximum number of concurrent requests.
dedupe (bool, default True): Deduplicate URLs when an exact match appears more than once in the given list.
fail_on_error (bool, default False): Raise a ValueError if a requested URL does not return status code 200.
Source code in llama_index/readers/web/async_web/base.py
class AsyncWebPageReader(BaseReader):
    """异步网页阅读器。

    异步读取网页。

    Args:
        html_to_text (bool): 是否将HTML转换为文本。
            需要`html2text`包。
        limit (int): 最大并发请求数。
        dedupe (bool): 如果给定列表中存在精确匹配的URL,则进行URL去重
        fail_on_error (bool): 如果请求的URL未返回状态码200,则程序将引发ValueError异常"""

    def __init__(
        self,
        html_to_text: bool = False,
        limit: int = 10,
        dedupe: bool = True,
        fail_on_error: bool = False,
    ) -> None:
        """使用参数进行初始化。"""
        try:
            import html2text  # noqa: F401
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        try:
            import aiohttp  # noqa: F401
        except ImportError:
            raise ImportError(
                "`aiohttp` package not found, please run `pip install aiohttp`"
            )
        self._limit = limit
        self._html_to_text = html_to_text
        self._dedupe = dedupe
        self._fail_on_error = fail_on_error

    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入的URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if self._dedupe:
            urls = list(dict.fromkeys(urls))

        import aiohttp

        def chunked_http_client(limit: int):
            semaphore = asyncio.Semaphore(limit)

            async def http_get(url: str, session: aiohttp.ClientSession):
                async with semaphore:
                    async with session.get(url) as response:
                        return response, await response.text()

            return http_get

        async def fetch_urls(urls: List[str]):
            http_client = chunked_http_client(self._limit)
            async with aiohttp.ClientSession() as session:
                tasks = [http_client(url, session) for url in urls]
                return await asyncio.gather(*tasks, return_exceptions=True)

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []
        responses = asyncio.run(fetch_urls(urls))

        for i, response_tuple in enumerate(responses):
            if not isinstance(response_tuple, tuple):
                raise ValueError(f"One of the inputs is not a valid url: {urls[i]}")

            response, raw_page = response_tuple

            if response.status != 200:
                logger.warning(f"error fetching page from {urls[i]}")
                logger.info(response)

                if self._fail_on_error:
                    raise ValueError(
                        f"error fetching page from {urls[i]}. server returned status:"
                        f" {response.status} and response {raw_page}"
                    )

                continue

            if self._html_to_text:
                import html2text

                response_text = html2text.html2text(raw_page)
            else:
                response_text = raw_page

            documents.append(
                Document(text=response_text, extra_info={"Source": str(response.url)})
            )

        return documents
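
Example usage (a minimal sketch, not part of the generated reference; it assumes the reader is importable from the llama-index-readers-web package and uses placeholder URLs):

from llama_index.readers.web import AsyncWebPageReader

# Convert HTML to text and allow at most 5 concurrent requests.
reader = AsyncWebPageReader(html_to_text=True, limit=5, dedupe=True)

# The duplicate URL is dropped before fetching because dedupe=True.
documents = reader.load_data(urls=["https://example.com", "https://example.com"])
print(len(documents))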

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from the input urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/async_web/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入的URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if self._dedupe:
            urls = list(dict.fromkeys(urls))

        import aiohttp

        def chunked_http_client(limit: int):
            semaphore = asyncio.Semaphore(limit)

            async def http_get(url: str, session: aiohttp.ClientSession):
                async with semaphore:
                    async with session.get(url) as response:
                        return response, await response.text()

            return http_get

        async def fetch_urls(urls: List[str]):
            http_client = chunked_http_client(self._limit)
            async with aiohttp.ClientSession() as session:
                tasks = [http_client(url, session) for url in urls]
                return await asyncio.gather(*tasks, return_exceptions=True)

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []
        responses = asyncio.run(fetch_urls(urls))

        for i, response_tuple in enumerate(responses):
            if not isinstance(response_tuple, tuple):
                raise ValueError(f"One of the inputs is not a valid url: {urls[i]}")

            response, raw_page = response_tuple

            if response.status != 200:
                logger.warning(f"error fetching page from {urls[i]}")
                logger.info(response)

                if self._fail_on_error:
                    raise ValueError(
                        f"error fetching page from {urls[i]}. server returned status:"
                        f" {response.status} and response {raw_page}"
                    )

                continue

            if self._html_to_text:
                import html2text

                response_text = html2text.html2text(raw_page)
            else:
                response_text = raw_page

            documents.append(
                Document(text=response_text, extra_info={"Source": str(response.url)})
            )

        return documents

BeautifulSoupWebReader #

Bases: BasePydanticReader

BeautifulSoup web page reader.

Reads pages from the web. Requires the bs4 and urllib packages.

Parameters:

website_extractor (Optional[Dict[str, Callable]], default None): A mapping of website hostname (e.g. google.com) to a function that specifies how to extract text from the BeautifulSoup object. See DEFAULT_WEBSITE_EXTRACTOR.
Source code in llama_index/readers/web/beautiful_soup_web/base.py
class BeautifulSoupWebReader(BasePydanticReader):
    """美丽汤网页阅读器。

    从网页读取页面。
    需要`bs4`和`urllib`包。

    Args:
        website_extractor (Optional[Dict[str, Callable]]): 网站主机名(例如google.com)到指定从BeautifulSoup对象中提取文本的函数的映射。参见DEFAULT_WEBSITE_EXTRACTOR。"""

    is_remote: bool = True
    _website_extractor: Dict[str, Callable] = PrivateAttr()

    def __init__(self, website_extractor: Optional[Dict[str, Callable]] = None) -> None:
        self._website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR
        super().__init__()

    @classmethod
    def class_name(cls) -> str:
        """获取类的名称标识符。"""
        return "BeautifulSoupWebReader"

    def load_data(
        self,
        urls: List[str],
        custom_hostname: Optional[str] = None,
        include_url_in_text: Optional[bool] = True,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    custom_hostname(可选[str]):在网站显示在自定义URL(例如Substack博客)下时,强制使用特定的主机名
    include_url_in_text(可选[bool]):在文档文本中包含参考URL

Returns:
    List[Document]:文档列表。
"""
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""

            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            extra_info = {"URL": url}
            if hostname in self._website_extractor:
                data, metadata = self._website_extractor[hostname](
                    soup=soup, url=url, include_url_in_text=include_url_in_text
                )
                extra_info.update(metadata)

            else:
                data = soup.getText()

            documents.append(Document(text=data, id_=url, extra_info=extra_info))

        return documents
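
Example usage (a minimal sketch; the import path and the custom extractor below are assumptions, not part of the source):

from llama_index.readers.web import BeautifulSoupWebReader

def example_extractor(soup, url, include_url_in_text=True):
    # Custom extractors receive the BeautifulSoup object and must return (text, metadata).
    text = soup.get_text()
    if include_url_in_text:
        text = f"{text}\n\nSource: {url}"
    return text, {"hostname": "example.com"}

reader = BeautifulSoupWebReader(website_extractor={"example.com": example_extractor})
documents = reader.load_data(urls=["https://example.com"], include_url_in_text=True)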

class_name classmethod #

class_name() -> str

Get the name identifier of the class.

Source code in llama_index/readers/web/beautiful_soup_web/base.py
@classmethod
def class_name(cls) -> str:
    """获取类的名称标识符。"""
    return "BeautifulSoupWebReader"

load_data #

load_data(
    urls: List[str],
    custom_hostname: Optional[str] = None,
    include_url_in_text: Optional[bool] = True,
) -> List[Document]

Load data from the urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/beautiful_soup_web/base.py
    def load_data(
        self,
        urls: List[str],
        custom_hostname: Optional[str] = None,
        include_url_in_text: Optional[bool] = True,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    custom_hostname(可选[str]):在网站显示在自定义URL(例如Substack博客)下时,强制使用特定的主机名
    include_url_in_text(可选[bool]):在文档文本中包含参考URL

Returns:
    List[Document]:文档列表。
"""
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""

            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            extra_info = {"URL": url}
            if hostname in self._website_extractor:
                data, metadata = self._website_extractor[hostname](
                    soup=soup, url=url, include_url_in_text=include_url_in_text
                )
                extra_info.update(metadata)

            else:
                data = soup.getText()

            documents.append(Document(text=data, id_=url, extra_info=extra_info))

        return documents

BrowserbaseWebReader #

Bases: BaseReader

Browserbase web reader.

Load pre-rendered web pages using a headless browser hosted on Browserbase. Depends on the browserbase package. Get your API key from https://browserbase.com.

Source code in llama_index/readers/web/browserbase_web/base.py
class BrowserbaseWebReader(BaseReader):
    """浏览器基础Web阅读器。

    使用托管在Browserbase上的无头浏览器加载预渲染的网页。
    依赖于`browserbase`包。
    从https://browserbase.com 获取您的API密钥。"""

    def __init__(
        self,
        api_key: Optional[str] = None,
        project_id: Optional[str] = None,
    ) -> None:
        try:
            from browserbase import Browserbase
        except ImportError:
            raise ImportError(
                "`browserbase` package not found, please run `pip install browserbase`"
            )

        self.browserbase = Browserbase(api_key, project_id)

    def lazy_load_data(
        self,
        urls: Sequence[str],
        text_content: bool = False,
        session_id: Optional[str] = None,
        proxy: Optional[bool] = None,
    ) -> Iterator[Document]:
        """从URL加载页面。"""
        pages = self.browserbase.load_urls(urls, text_content, session_id, proxy)

        for i, page in enumerate(pages):
            yield Document(
                text=page,
                metadata={
                    "url": urls[i],
                },
            )
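
Example usage (a sketch assuming the llama-index-readers-web import path; the API key and project id are placeholders obtained from browserbase.com):

from llama_index.readers.web import BrowserbaseWebReader

reader = BrowserbaseWebReader(api_key="YOUR_API_KEY", project_id="YOUR_PROJECT_ID")
# lazy_load_data yields Documents one page at a time.
documents = list(reader.lazy_load_data(urls=["https://example.com"], text_content=True))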

lazy_load_data #

lazy_load_data(
    urls: Sequence[str],
    text_content: bool = False,
    session_id: Optional[str] = None,
    proxy: Optional[bool] = None,
) -> Iterator[Document]

Load pages from urls.

Source code in llama_index/readers/web/browserbase_web/base.py
def lazy_load_data(
    self,
    urls: Sequence[str],
    text_content: bool = False,
    session_id: Optional[str] = None,
    proxy: Optional[bool] = None,
) -> Iterator[Document]:
    """从URL加载页面。"""
    pages = self.browserbase.load_urls(urls, text_content, session_id, proxy)

    for i, page in enumerate(pages):
        yield Document(
            text=page,
            metadata={
                "url": urls[i],
            },
        )

KnowledgeBaseWebReader #

Bases: BaseReader

Knowledge base reader.

Crawls and reads articles from a knowledge base / help center with Playwright. Tested on Zendesk and Intercom CMS; it may also work on other platforms. It can run in headless mode, but it may get blocked by Cloudflare; to be safe, run it with a headed browser. It occasionally times out; if that happens, simply increase the default timeout. Requires the playwright package.

Parameters:

root_url (str, required): Base URL of the knowledge base, without a trailing slash, e.g. 'https://support.intercom.com'.
link_selectors (List[str], required): List of CSS selectors used to find article links while crawling, e.g. ['.article-list a', '.article-list a'].
article_path (str, required): URL path of articles on this domain, so the crawler knows when to stop, e.g. '/articles'.
title_selector (Optional[str], default None): CSS selector used to find the article title, e.g. '.article-title'.
subtitle_selector (Optional[str], default None): CSS selector used to find the article subtitle/description, e.g. '.article-subtitle'.
body_selector (Optional[str], default None): CSS selector used to find the article body, e.g. '.article-body'.
Source code in llama_index/readers/web/knowledge_base/base.py
class KnowledgeBaseWebReader(BaseReader):
    """知识库阅读器。

    使用Playwright爬取和阅读知识库/帮助中心的文章。
    在Zendesk和Intercom CMS上进行了测试,可能也适用于其他平台。
    可以在无头模式下运行,但可能会被Cloudflare阻止。为了安全起见,建议以有头模式运行。
    偶尔会超时,如果出现超时情况,只需增加默认超时时间即可。
    需要安装`playwright`包。

    Args:
        root_url (str): 知识库的基本url,末尾不带斜杠
            例如 'https://support.intercom.com'
        link_selectors (List[str]): 用于在爬取过程中查找文章链接的css选择器列表
            例如 ['.article-list a', '.article-list a']
        article_path (str): 该域上文章的url路径,以便爬虫知道何时停止
            例如 '/articles'
        title_selector (Optional[str]): 用于查找文章标题的css选择器
            例如 '.article-title'
        subtitle_selector (Optional[str]): 用于查找文章副标题/描述的css选择器
            例如 '.article-subtitle'
        body_selector (Optional[str]): 用于查找文章正文的css选择器
            例如 '.article-body'"""

    def __init__(
        self,
        root_url: str,
        link_selectors: List[str],
        article_path: str,
        title_selector: Optional[str] = None,
        subtitle_selector: Optional[str] = None,
        body_selector: Optional[str] = None,
    ) -> None:
        """使用参数进行初始化。"""
        self.root_url = root_url
        self.link_selectors = link_selectors
        self.article_path = article_path
        self.title_selector = title_selector
        self.subtitle_selector = subtitle_selector
        self.body_selector = body_selector

    def load_data(self) -> List[Document]:
        """从知识库加载数据。"""
        from playwright.sync_api import sync_playwright

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=False)

            # Crawl
            article_urls = self.get_article_urls(
                browser,
                self.root_url,
                self.root_url,
            )

            # Scrape
            documents = []
            for url in article_urls:
                article = self.scrape_article(
                    browser,
                    url,
                )
                extra_info = {
                    "title": article["title"],
                    "subtitle": article["subtitle"],
                    "url": article["url"],
                }
                documents.append(Document(text=article["body"], extra_info=extra_info))

            browser.close()

            return documents

    def scrape_article(
        self,
        browser: Any,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Dict[str, str]:文章属性与其值的映射。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(url, wait_until="domcontentloaded")

        title = (
            (
                page.query_selector(self.title_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.title_selector
            else ""
        )
        subtitle = (
            (
                page.query_selector(self.subtitle_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.subtitle_selector
            else ""
        )
        body = (
            (page.query_selector(self.body_selector).evaluate("node => node.innerText"))
            if self.body_selector
            else ""
        )

        page.close()
        print("scraped:", url)
        return {"title": title, "subtitle": subtitle, "body": body, "url": url}

    def get_article_urls(
        self, browser: Any, root_url: str, current_url: str
    ) -> List[str]:
        """递归地遍历知识库,以找到文章列表。

Args:
    browser (Any): Playwright Chromium 浏览器。
    root_url (str): 知识库的根URL。
    current_url (str): 正在遍历的当前URL。

Returns:
    List[str]: 找到的文章的URL列表。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(current_url, wait_until="domcontentloaded")

        # If this is a leaf node aka article page, return itself
        if self.article_path in current_url:
            print("Found an article: ", current_url)
            page.close()
            return [current_url]

        # Otherwise crawl this page and find all the articles linked from it
        article_urls = []
        links = []

        for link_selector in self.link_selectors:
            ahrefs = page.query_selector_all(link_selector)
            links.extend(ahrefs)

        for link in links:
            url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
            article_urls.extend(self.get_article_urls(browser, root_url, url))

        page.close()

        return article_urls
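
Example usage (a sketch; the selectors are illustrative and must be adapted to the target help center, and the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import KnowledgeBaseWebReader

reader = KnowledgeBaseWebReader(
    root_url="https://support.intercom.com",
    link_selectors=[".article-list a", ".article-list a"],
    article_path="/articles",
    title_selector=".article-title",
    subtitle_selector=".article-subtitle",
    body_selector=".article-body",
)
# Launches a headed Chromium browser via Playwright and crawls for articles.
documents = reader.load_data()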

load_data #

load_data() -> List[Document]

Load data from the knowledge base.

Source code in llama_index/readers/web/knowledge_base/base.py
def load_data(self) -> List[Document]:
    """从知识库加载数据。"""
    from playwright.sync_api import sync_playwright

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)

        # Crawl
        article_urls = self.get_article_urls(
            browser,
            self.root_url,
            self.root_url,
        )

        # Scrape
        documents = []
        for url in article_urls:
            article = self.scrape_article(
                browser,
                url,
            )
            extra_info = {
                "title": article["title"],
                "subtitle": article["subtitle"],
                "url": article["url"],
            }
            documents.append(Document(text=article["body"], extra_info=extra_info))

        browser.close()

        return documents

scrape_article #

scrape_article(browser: Any, url: str) -> Dict[str, str]

Scrape a single article url.

Returns:

Dict[str, str]: Mapping of article attributes to their values.

Source code in llama_index/readers/web/knowledge_base/base.py
    def scrape_article(
        self,
        browser: Any,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Dict[str, str]:文章属性与其值的映射。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(url, wait_until="domcontentloaded")

        title = (
            (
                page.query_selector(self.title_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.title_selector
            else ""
        )
        subtitle = (
            (
                page.query_selector(self.subtitle_selector).evaluate(
                    "node => node.innerText"
                )
            )
            if self.subtitle_selector
            else ""
        )
        body = (
            (page.query_selector(self.body_selector).evaluate("node => node.innerText"))
            if self.body_selector
            else ""
        )

        page.close()
        print("scraped:", url)
        return {"title": title, "subtitle": subtitle, "body": body, "url": url}

get_article_urls #

get_article_urls(
    browser: Any, root_url: str, current_url: str
) -> List[str]

Recursively crawl the knowledge base to find a list of articles.

Parameters:

browser (Any, required): A Playwright Chromium browser.
root_url (str, required): Root URL of the knowledge base.
current_url (str, required): Current URL being crawled.

Returns:

List[str]: List of URLs of the articles found.

Source code in llama_index/readers/web/knowledge_base/base.py
    def get_article_urls(
        self, browser: Any, root_url: str, current_url: str
    ) -> List[str]:
        """递归地遍历知识库,以找到文章列表。

Args:
    browser (Any): Playwright Chromium 浏览器。
    root_url (str): 知识库的根URL。
    current_url (str): 正在遍历的当前URL。

Returns:
    List[str]: 找到的文章的URL列表。
"""
        page = browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        page.goto(current_url, wait_until="domcontentloaded")

        # If this is a leaf node aka article page, return itself
        if self.article_path in current_url:
            print("Found an article: ", current_url)
            page.close()
            return [current_url]

        # Otherwise crawl this page and find all the articles linked from it
        article_urls = []
        links = []

        for link_selector in self.link_selectors:
            ahrefs = page.query_selector_all(link_selector)
            links.extend(ahrefs)

        for link in links:
            url = root_url + page.evaluate("(node) => node.getAttribute('href')", link)
            article_urls.extend(self.get_article_urls(browser, root_url, url))

        page.close()

        return article_urls

MainContentExtractorReader #

Bases: BaseReader

MainContentExtractor web page reader.

Reads pages from the web.

Parameters:

text_format (str, optional, default 'markdown'): Format of the extracted text. Defaults to "markdown". Requires the MainContentExtractor package.
Source code in llama_index/readers/web/main_content_extractor/base.py
class MainContentExtractorReader(BaseReader):
    """主要内容提取器网页阅读器。

    从网页中读取页面。

    Args:
        text_format (str, 可选): 文本的格式。默认为 "markdown"。
            需要 `MainContentExtractor` 包。"""

    def __init__(self, text_format: str = "markdown") -> None:
        """使用参数进行初始化。"""
        self.text_format = text_format

    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        from main_content_extractor import MainContentExtractor

        documents = []
        for url in urls:
            response = requests.get(url).text
            response = MainContentExtractor.extract(
                response, output_format=self.text_format, include_links=False
            )

            documents.append(Document(text=response))

        return documents
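
Example usage (a minimal sketch with a placeholder URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import MainContentExtractorReader

reader = MainContentExtractorReader(text_format="markdown")
documents = reader.load_data(urls=["https://example.com"])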

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from the input urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/main_content_extractor/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        from main_content_extractor import MainContentExtractor

        documents = []
        for url in urls:
            response = requests.get(url).text
            response = MainContentExtractor.extract(
                response, output_format=self.text_format, include_links=False
            )

            documents.append(Document(text=response))

        return documents

NewsArticleReader #

Bases: BaseReader

Simple news article reader.

Reads news articles from the web and parses them using the newspaper library.

Parameters:

text_mode (bool, default True): Whether to load the text version or the HTML version of the content.
use_nlp (bool, default True): Whether to use NLP to extract an additional summary and keywords.
newspaper_kwargs (Any, default {}): Additional keyword arguments passed to newspaper.Article. See https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html#article
Source code in llama_index/readers/web/news/base.py
class NewsArticleReader(BaseReader):
    """简单的新闻文章阅读器。

    从网络上读取新闻文章,并使用`newspaper`库进行解析。

    Args:
        text_mode (bool): 是否加载内容的文本版本或HTML版本(默认为True)。
        use_nlp (bool): 是否使用自然语言处理来提取额外的摘要和关键词(默认为True)。
        newspaper_kwargs: 传递给newspaper.Article的额外关键字参数。参见
            https://newspaper.readthedocs.io/en/latest/user_guide/quickstart.html#article"""

    def __init__(
        self, text_mode: bool = True, use_nlp: bool = True, **newspaper_kwargs: Any
    ) -> None:
        """使用参数进行初始化。"""
        if find_spec("newspaper") is None:
            raise ImportError(
                "`newspaper` package not found, please run `pip install newspaper3k`"
            )
        self.load_text = text_mode
        self.use_nlp = use_nlp
        self.newspaper_kwargs = newspaper_kwargs

    def load_data(self, urls: List[str]) -> List[Document]:
        """从新闻文章网址列表中加载数据。

Args:
    urls(List[str]):要加载新闻文章的网址列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list) and not isinstance(urls, Generator):
            raise ValueError("urls must be a list or generator.")
        documents = []
        for url in urls:
            from newspaper import Article

            try:
                article = Article(url, **self.newspaper_kwargs)
                article.download()
                article.parse()

                if self.use_nlp:
                    article.nlp()

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

            metadata = {
                "title": getattr(article, "title", ""),
                "link": getattr(article, "url", getattr(article, "canonical_link", "")),
                "authors": getattr(article, "authors", []),
                "language": getattr(article, "meta_lang", ""),
                "description": getattr(article, "meta_description", ""),
                "publish_date": getattr(article, "publish_date", ""),
            }

            if self.load_text:
                content = article.text
            else:
                content = article.html

            if self.use_nlp:
                metadata["keywords"] = getattr(article, "keywords", [])
                metadata["summary"] = getattr(article, "summary", "")

            documents.append(Document(text=content, metadata=metadata))

        return documents
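
Example usage (a sketch with a placeholder article URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import NewsArticleReader

# use_nlp=False skips the extra summary/keyword extraction pass.
reader = NewsArticleReader(text_mode=True, use_nlp=False)
documents = reader.load_data(urls=["https://example.com/news/some-article"])
for doc in documents:
    print(doc.metadata["title"], doc.metadata["link"])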

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from a list of news article urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/news/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从新闻文章网址列表中加载数据。

Args:
    urls(List[str]):要加载新闻文章的网址列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list) and not isinstance(urls, Generator):
            raise ValueError("urls must be a list or generator.")
        documents = []
        for url in urls:
            from newspaper import Article

            try:
                article = Article(url, **self.newspaper_kwargs)
                article.download()
                article.parse()

                if self.use_nlp:
                    article.nlp()

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

            metadata = {
                "title": getattr(article, "title", ""),
                "link": getattr(article, "url", getattr(article, "canonical_link", "")),
                "authors": getattr(article, "authors", []),
                "language": getattr(article, "meta_lang", ""),
                "description": getattr(article, "meta_description", ""),
                "publish_date": getattr(article, "publish_date", ""),
            }

            if self.load_text:
                content = article.text
            else:
                content = article.html

            if self.use_nlp:
                metadata["keywords"] = getattr(article, "keywords", [])
                metadata["summary"] = getattr(article, "summary", "")

            documents.append(Document(text=content, metadata=metadata))

        return documents

ReadabilityWebPageReader #

Bases: BaseReader

Readability web page loader.

Extracts relevant information from a fully rendered web page. It is always assumed that the web page used as a data source contains text content.

  1. Load the page and wait for it to render (playwright).
  2. Inject Readability.js to extract the main content.
Source code in llama_index/readers/web/readability_web/base.py
class ReadabilityWebPageReader(BaseReader):
    """网页可读性加载器。

    从完全呈现的网页中提取相关信息。
    在处理过程中,始终假定用作数据源的网页包含文本内容。

    1. 加载页面并等待其呈现。(playwright)
    2. 注入Readability.js以提取主要内容。

    Args:
        proxy(可选[str],可选):代理服务器。默认为None。
        wait_until(可选[Literal["commit", "domcontentloaded", "load", "networkidle"]],可选):等待页面加载完成。默认为"domcontentloaded"。
        text_splitter(TextSplitter,可选):文本分割器。默认为None。
        normalizer(可选[Callable[[str], str]],可选):文本规范化器。默认为nfkc_normalize。"""

    def __init__(
        self,
        proxy: Optional[str] = None,
        wait_until: Optional[
            Literal["commit", "domcontentloaded", "load", "networkidle"]
        ] = "domcontentloaded",
        text_splitter: Optional[TextSplitter] = None,
        normalize: Optional[Callable[[str], str]] = nfkc_normalize,
    ) -> None:
        self._launch_options = {
            "headless": True,
        }
        self._wait_until = wait_until
        if proxy:
            self._launch_options["proxy"] = {
                "server": proxy,
            }
        self._text_splitter = text_splitter
        self._normalize = normalize
        self._readability_js = None

    async def async_load_data(self, url: str) -> List[Document]:
        """渲染并从URL加载数据内容。

Args:
    url(str):要抓取的URL。

Returns:
    List[Document]:文档列表。
"""
        from playwright.async_api import async_playwright

        async with async_playwright() as async_playwright:
            browser = await async_playwright.chromium.launch(**self._launch_options)

            article = await self.scrape_page(
                browser,
                url,
            )
            extra_info = {
                key: article[key]
                for key in [
                    "title",
                    "length",
                    "excerpt",
                    "byline",
                    "dir",
                    "lang",
                    "siteName",
                ]
            }

            if self._normalize is not None:
                article["textContent"] = self._normalize(article["textContent"])
            texts = []
            if self._text_splitter is not None:
                texts = self._text_splitter.split_text(article["textContent"])
            else:
                texts = [article["textContent"]]

            await browser.close()

            return [Document(text=x, extra_info=extra_info) for x in texts]

    def load_data(self, url: str) -> List[Document]:
        return async_to_sync(self.async_load_data(url))

    async def scrape_page(
        self,
        browser: Browser,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Ref:https://github.com/mozilla/readability
    title:文章标题;
    content:经过处理的文章内容的HTML字符串;
    textContent:去除所有HTML标记的文章文本内容;
    length:文章长度,以字符计算;
    excerpt:文章描述,或者从内容中摘录的简短摘要;
    byline:作者元数据;
    dir:内容方向;
    siteName:站点名称。
    lang:内容语言。
"""
        if self._readability_js is None:
            with open(path) as f:
                self._readability_js = f.read()

        inject_readability = f"""
            (function(){{
            {self._readability_js}
            function executor() {{
                return new Readability({{}}, document).parse();
            }}
            return executor();
            }}())
        """

        # browser = cast(Browser, browser)
        page = await browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        await page.goto(url, wait_until=self._wait_until)

        r = await page.evaluate(inject_readability)

        await page.close()
        print("scraped:", url)

        return r
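
Example usage (a minimal sketch with a placeholder URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import ReadabilityWebPageReader

reader = ReadabilityWebPageReader(wait_until="networkidle")
# load_data takes a single URL and runs the async scraper synchronously.
documents = reader.load_data(url="https://example.com/article")
print(documents[0].metadata["title"])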

async_load_data async #

async_load_data(url: str) -> List[Document]

Render and load data content from a url.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/readability_web/base.py
    async def async_load_data(self, url: str) -> List[Document]:
        """渲染并从URL加载数据内容。

Args:
    url(str):要抓取的URL。

Returns:
    List[Document]:文档列表。
"""
        from playwright.async_api import async_playwright

        async with async_playwright() as async_playwright:
            browser = await async_playwright.chromium.launch(**self._launch_options)

            article = await self.scrape_page(
                browser,
                url,
            )
            extra_info = {
                key: article[key]
                for key in [
                    "title",
                    "length",
                    "excerpt",
                    "byline",
                    "dir",
                    "lang",
                    "siteName",
                ]
            }

            if self._normalize is not None:
                article["textContent"] = self._normalize(article["textContent"])
            texts = []
            if self._text_splitter is not None:
                texts = self._text_splitter.split_text(article["textContent"])
            else:
                texts = [article["textContent"]]

            await browser.close()

            return [Document(text=x, extra_info=extra_info) for x in texts]

scrape_page async #

scrape_page(browser: Browser, url: str) -> Dict[str, str]

Scrape a single article url.

Returns:

Dict[str, str] with the following keys (ref: https://github.com/mozilla/readability):

title: article title;
content: HTML string of the processed article content;
textContent: text content of the article, with all the HTML tags removed;
length: length of the article, in characters;
excerpt: article description, or a short excerpt from the content;
byline: author metadata;
dir: content direction;
siteName: name of the site;
lang: content language.

Source code in llama_index/readers/web/readability_web/base.py
    async def scrape_page(
        self,
        browser: Browser,
        url: str,
    ) -> Dict[str, str]:
        """爬取单个文章的URL。

Args:
    browser(任意):Playwright Chromium浏览器。
    url(str):要爬取的文章的URL。

Returns:
    Ref:https://github.com/mozilla/readability
    title:文章标题;
    content:经过处理的文章内容的HTML字符串;
    textContent:去除所有HTML标记的文章文本内容;
    length:文章长度,以字符计算;
    excerpt:文章描述,或者从内容中摘录的简短摘要;
    byline:作者元数据;
    dir:内容方向;
    siteName:站点名称。
    lang:内容语言。
"""
        if self._readability_js is None:
            with open(path) as f:
                self._readability_js = f.read()

        inject_readability = f"""
            (function(){{
            {self._readability_js}
            function executor() {{
                return new Readability({{}}, document).parse();
            }}
            return executor();
            }}())
        """

        # browser = cast(Browser, browser)
        page = await browser.new_page(ignore_https_errors=True)
        page.set_default_timeout(60000)
        await page.goto(url, wait_until=self._wait_until)

        r = await page.evaluate(inject_readability)

        await page.close()
        print("scraped:", url)

        return r

RssNewsReader #

Bases: BaseReader

RSS news reader.

Reads news content from RSS feeds and parses it with NewsArticleReader.

Source code in llama_index/readers/web/rss_news/base.py
class RssNewsReader(BaseReader):
    """RSS新闻阅读器。

从RSS订阅源中读取新闻内容,并使用NewsArticleReader进行解析。"""

    def __init__(self, **reader_kwargs: Any) -> None:
        """使用参数进行初始化。

Args:
    html_to_text (bool): 是否将HTML转换为文本。
        需要`html2text`包。
"""
        try:
            import feedparser  # noqa: F401
        except ImportError:
            raise ImportError(
                "`feedparser` package not found, please run `pip install feedparser`"
            )

        try:
            import listparser  # noqa: F401
        except ImportError:
            raise ImportError(
                "`listparser` package not found, please run `pip install listparser`"
            )

        self.reader_kwargs = reader_kwargs

    def load_data(self, urls: List[str] = None, opml: str = None) -> List[Document]:
        """从RSS订阅源或OPML加载数据。

Args:
    urls (List[str]): 要加载的RSS URL列表。
    opml (str): OPML文件的URL或字符串或字节OPML内容。

Returns:
    List[Document]: 文档列表。
"""
        if (urls is None) == (
            opml is None
        ):  # This is True if both are None or neither is None
            raise ValueError(
                "Provide either the urls or the opml argument, but not both."
            )

        import feedparser

        if urls and not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        if not urls and opml:
            try:
                import listparser
            except ImportError as e:
                raise ImportError(
                    "Package listparser must be installed if the opml arg is used. "
                    "Please install with 'pip install listparser' or use the "
                    "urls arg instead."
                ) from e
            rss = listparser.parse(opml)
            urls = [feed.url for feed in rss.feeds]

        for url in urls:
            try:
                feed = feedparser.parse(url)
                for i, entry in enumerate(feed.entries):
                    article = NewsArticleReader(**self.reader_kwargs).load_data(
                        urls=[entry.link],
                    )[0]
                    article.metadata["feed"] = url

                    documents.append(
                        Document(text=article.text, metadata=article.metadata)
                    )

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

        return documents
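
Example usage (a sketch with placeholder feed URLs; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import RssNewsReader

# Keyword arguments are forwarded to NewsArticleReader.
reader = RssNewsReader(use_nlp=False)
documents = reader.load_data(urls=["https://example.com/rss.xml"])

# Alternatively, pass an OPML file instead of urls (but not both):
# documents = reader.load_data(opml="https://example.com/feeds.opml")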

load_data #

load_data(
    urls: List[str] = None, opml: str = None
) -> List[Document]

Load data from either RSS feeds or OPML.

Parameters:

urls (List[str], default None): List of RSS URLs to load.
opml (str, default None): URL to an OPML file, or a string or bytes OPML content.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/rss_news/base.py
    def load_data(self, urls: List[str] = None, opml: str = None) -> List[Document]:
        """从RSS订阅源或OPML加载数据。

Args:
    urls (List[str]): 要加载的RSS URL列表。
    opml (str): OPML文件的URL或字符串或字节OPML内容。

Returns:
    List[Document]: 文档列表。
"""
        if (urls is None) == (
            opml is None
        ):  # This is True if both are None or neither is None
            raise ValueError(
                "Provide either the urls or the opml argument, but not both."
            )

        import feedparser

        if urls and not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        if not urls and opml:
            try:
                import listparser
            except ImportError as e:
                raise ImportError(
                    "Package listparser must be installed if the opml arg is used. "
                    "Please install with 'pip install listparser' or use the "
                    "urls arg instead."
                ) from e
            rss = listparser.parse(opml)
            urls = [feed.url for feed in rss.feeds]

        for url in urls:
            try:
                feed = feedparser.parse(url)
                for i, entry in enumerate(feed.entries):
                    article = NewsArticleReader(**self.reader_kwargs).load_data(
                        urls=[entry.link],
                    )[0]
                    article.metadata["feed"] = url

                    documents.append(
                        Document(text=article.text, metadata=article.metadata)
                    )

            except Exception as e:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
                continue

        return documents

RssReader #

Bases: BasePydanticReader

RSS reader.

Reads content from RSS feeds.

Source code in llama_index/readers/web/rss/base.py
class RssReader(BasePydanticReader):
    """RSS阅读器。

从RSS源中读取内容。"""

    is_remote: bool = True
    html_to_text: bool = False

    @classmethod
    def class_name(cls) -> str:
        return "RssReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """从RSS源加载数据。

Args:
    urls(List[str]):要加载的RSS URL列表。

Returns:
    List[Document]:文档列表。
"""
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                doc_id = entry.id or entry.link
                if "content" in entry:
                    data = entry.content[0].value
                else:
                    data = entry.description or entry.summary

                if self.html_to_text:
                    import html2text

                    data = html2text.html2text(data)

                extra_info = {"title": entry.title, "link": entry.link}
                documents.append(Document(text=data, id_=doc_id, extra_info=extra_info))

        return documents
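
Example usage (a minimal sketch with a placeholder feed URL; the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import RssReader

reader = RssReader(html_to_text=True)
documents = reader.load_data(urls=["https://example.com/feed.xml"])
for doc in documents:
    print(doc.metadata["title"], doc.metadata["link"])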

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from RSS feeds.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/rss/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从RSS源加载数据。

Args:
    urls(List[str]):要加载的RSS URL列表。

Returns:
    List[Document]:文档列表。
"""
        import feedparser

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")

        documents = []

        for url in urls:
            parsed = feedparser.parse(url)
            for entry in parsed.entries:
                doc_id = entry.id or entry.link
                if "content" in entry:
                    data = entry.content[0].value
                else:
                    data = entry.description or entry.summary

                if self.html_to_text:
                    import html2text

                    data = html2text.html2text(data)

                extra_info = {"title": entry.title, "link": entry.link}
                documents.append(Document(text=data, id_=doc_id, extra_info=extra_info))

        return documents

ScrapflyReader #

Bases: BasePydanticReader

Turn a url into LLM-accessible markdown with Scrapfly.io.

Args:
    api_key: The Scrapfly API key.
    scrape_config: Scrapfly ScrapeConfig object.
    ignore_scrape_failures: Whether to continue on failures.
    urls: List of urls to scrape.
    scrape_format: Scrape result format ("markdown" or "text").
For more details, visit: https://scrapfly.io/docs/sdk/python

Source code in llama_index/readers/web/scrapfly_web/base.py
class ScrapflyReader(BasePydanticReader):
    """将url转换为llm可访问的markdown格式,使用`Scrapfly.io`。

    Args:
    api_key: Scrapfly API密钥。
    scrape_config: Scrapfly ScrapeConfig对象。
    ignore_scrape_failures: 是否在失败时继续。
    urls: 要抓取的url列表。
    scrape_format: 抓取结果格式(markdown或text)
    更多详情,请访问: https://scrapfly.io/docs/sdk/python"""

    api_key: str
    ignore_scrape_failures: bool = True
    scrapfly: Optional["ScrapflyClient"] = None  # Declare the scrapfly attribute

    def __init__(self, api_key: str, ignore_scrape_failures: bool = True) -> None:
        """初始化客户端。"""
        super().__init__(api_key=api_key, ignore_scrape_failures=ignore_scrape_failures)
        try:
            from scrapfly import ScrapflyClient
        except ImportError:
            raise ImportError(
                "`scrapfly` package not found, please run `pip install scrapfly-sdk`"
            )
        self.scrapfly = ScrapflyClient(key=api_key)

    @classmethod
    def class_name(cls) -> str:
        return "Scrapfly_reader"

    def load_data(
        self,
        urls: List[str],
        scrape_format: Literal["markdown", "text"] = "markdown",
        scrape_config: Optional[dict] = None,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls: List[str]: 要抓取的URL列表。
    scrape_config: Optional[dict]: ScrapFly抓取配置对象的字典。

Returns:
    List[Document]: 文档列表。

引发:
    ValueError: 如果未提供URL。
"""
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        if urls is None:
            raise ValueError("URLs must be provided.")
        scrape_config = scrape_config if scrape_config is not None else {}

        documents = []
        for url in urls:
            try:
                response: ScrapeApiResponse = self.scrapfly.scrape(
                    ScrapeConfig(url, format=scrape_format, **scrape_config)
                )
                documents.append(
                    Document(
                        text=response.scrape_result["content"], extra_info={"url": url}
                    )
                )
            except Exception as e:
                if self.ignore_scrape_failures:
                    logger.error(f"Error fetching data from {url}, exception: {e}")
                else:
                    raise e  # noqa: TRY201

        return documents
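
Example usage (a sketch; the API key is a placeholder and the scrape_config keys are assumptions about the Scrapfly SDK that should be checked against its documentation):

from llama_index.readers.web import ScrapflyReader

reader = ScrapflyReader(api_key="YOUR_SCRAPFLY_API_KEY", ignore_scrape_failures=True)
documents = reader.load_data(
    urls=["https://example.com"],
    scrape_format="markdown",
    scrape_config={"render_js": True},  # forwarded to ScrapeConfig (assumed option)
)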

load_data #

load_data(
    urls: List[str],
    scrape_format: Literal["markdown", "text"] = "markdown",
    scrape_config: Optional[dict] = None,
) -> List[Document]

Load data from the urls.

Parameters:

urls (List[str], required): List of URLs to scrape.
scrape_config (Optional[dict], default None): Dictionary of ScrapFly scrape config options.

Returns:

List[Document]: List of documents.

Raises:

ValueError: If URLs aren't provided.

Source code in llama_index/readers/web/scrapfly_web/base.py
    def load_data(
        self,
        urls: List[str],
        scrape_format: Literal["markdown", "text"] = "markdown",
        scrape_config: Optional[dict] = None,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls: List[str]: 要抓取的URL列表。
    scrape_config: Optional[dict]: ScrapFly抓取配置对象的字典。

Returns:
    List[Document]: 文档列表。

引发:
    ValueError: 如果未提供URL。
"""
        from scrapfly import ScrapeApiResponse, ScrapeConfig

        if urls is None:
            raise ValueError("URLs must be provided.")
        scrape_config = scrape_config if scrape_config is not None else {}

        documents = []
        for url in urls:
            try:
                response: ScrapeApiResponse = self.scrapfly.scrape(
                    ScrapeConfig(url, format=scrape_format, **scrape_config)
                )
                documents.append(
                    Document(
                        text=response.scrape_result["content"], extra_info={"url": url}
                    )
                )
            except Exception as e:
                if self.ignore_scrape_failures:
                    logger.error(f"Error fetching data from {url}, exception: {e}")
                else:
                    raise e  # noqa: TRY201

        return documents

SimpleWebPageReader #

Bases: BasePydanticReader

Simple web page reader.

Reads pages from the web.

Parameters:

html_to_text (bool, default False): Whether to convert HTML to text. Requires the html2text package.
metadata_fn (Optional[Callable[[str], Dict]], default None): A function that takes a URL and returns a dictionary of metadata. Defaults to None.
Source code in llama_index/readers/web/simple_web/base.py
class SimpleWebPageReader(BasePydanticReader):
    """简单的网页阅读器。

    从网页中读取页面。

    Args:
        html_to_text (bool): 是否将HTML转换为文本。
            需要`html2text`包。
        metadata_fn (Optional[Callable[[str], Dict]]): 一个接受URL并返回元数据字典的函数。
            默认值为None。"""

    is_remote: bool = True
    html_to_text: bool

    _metadata_fn: Optional[Callable[[str], Dict]] = PrivateAttr()

    def __init__(
        self,
        html_to_text: bool = False,
        metadata_fn: Optional[Callable[[str], Dict]] = None,
    ) -> None:
        """使用参数进行初始化。"""
        try:
            import html2text  # noqa
        except ImportError:
            raise ImportError(
                "`html2text` package not found, please run `pip install html2text`"
            )
        self._metadata_fn = metadata_fn
        super().__init__(html_to_text=html_to_text)

    @classmethod
    def class_name(cls) -> str:
        return "SimpleWebPageReader"

    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            response = requests.get(url, headers=None).text
            if self.html_to_text:
                import html2text

                response = html2text.html2text(response)

            metadata: Optional[Dict] = None
            if self._metadata_fn is not None:
                metadata = self._metadata_fn(url)

            documents.append(Document(text=response, id_=url, metadata=metadata or {}))

        return documents
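
Example usage (a minimal sketch; metadata_fn below is a hypothetical callable and the import path assumes the llama-index-readers-web package):

from llama_index.readers.web import SimpleWebPageReader

reader = SimpleWebPageReader(
    html_to_text=True,
    metadata_fn=lambda url: {"source_url": url},  # tag each document with its URL
)
documents = reader.load_data(urls=["https://example.com"])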

load_data #

load_data(urls: List[str]) -> List[Document]

Load data from the input urls.

Returns:

List[Document]: List of documents.

Source code in llama_index/readers/web/simple_web/base.py
    def load_data(self, urls: List[str]) -> List[Document]:
        """从输入目录加载数据。

Args:
    urls(List[str]):要抓取的URL列表。

Returns:
    List[Document]:文档列表。
"""
        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []
        for url in urls:
            response = requests.get(url, headers=None).text
            if self.html_to_text:
                import html2text

                response = html2text.html2text(response)

            metadata: Optional[Dict] = None
            if self._metadata_fn is not None:
                metadata = self._metadata_fn(url)

            documents.append(Document(text=response, id_=url, metadata=metadata or {}))

        return documents

SitemapReader #

Bases: BaseReader

Asynchronous sitemap reader for the web.

Reads pages from the web based on their sitemap.xml.

Parameters:

sitemap_url (string, required): Path to the sitemap.xml, e.g. https://gpt-index.readthedocs.io/sitemap.xml
html_to_text (bool, default False): Whether to convert HTML to text. Requires the html2text package.
limit (int, default 10): Maximum number of concurrent requests.

Source code in llama_index/readers/web/sitemap/base.py
class SitemapReader(BaseReader):
    """异步站点地图读取器,用于网络。

    根据其 sitemap.xml 从网络中读取页面。

    Args:
        sitemap_url (string): sitemap.xml 的路径。例如 https://gpt-index.readthedocs.io/sitemap.xml
    html_to_text (bool): 是否将 HTML 转换为文本。
        需要 `html2text` 包。
    limit (int): 最大并发请求数。"""

    xml_schema_sitemap = "http://www.sitemaps.org/schemas/sitemap/0.9"

    def __init__(self, html_to_text: bool = False, limit: int = 10) -> None:
        """使用参数进行初始化。"""
        self._async_loader = AsyncWebPageReader(html_to_text=html_to_text, limit=limit)
        self._html_to_text = html_to_text
        self._limit = limit

    def _load_sitemap(self, sitemap_url: str) -> str:
        sitemap_url_request = urllib.request.urlopen(sitemap_url)

        return sitemap_url_request.read()

    def _parse_sitemap(self, raw_sitemap: str, filter_locs: str = None) -> list:
        sitemap = ET.fromstring(raw_sitemap)
        sitemap_urls = []

        for url in sitemap.findall(f"{{{self.xml_schema_sitemap}}}url"):
            location = url.find(f"{{{self.xml_schema_sitemap}}}loc").text

            if filter_locs is None or filter_locs in location:
                sitemap_urls.append(location)

        return sitemap_urls

    def load_data(self, sitemap_url: str, filter: str = None) -> List[Document]:
        sitemap = self._load_sitemap(sitemap_url=sitemap_url)
        sitemap_urls = self._parse_sitemap(sitemap, filter)

        return self._async_loader.load_data(urls=sitemap_urls)
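
A minimal usage sketch for the reader above (the import path is an assumption based on the "Source code in" path; the filter value is illustrative):

from llama_index.readers.web import SitemapReader

reader = SitemapReader(html_to_text=True, limit=10)
# Only sitemap <loc> entries containing the filter substring are fetched.
documents = reader.load_data(
    sitemap_url="https://gpt-index.readthedocs.io/sitemap.xml",
    filter="/en/stable/",
)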

TrafilaturaWebReader #

Bases: BasePydanticReader

Trafilatura web page reader.

Reads pages from the web. Requires the trafilatura package.

Source code in llama_index/readers/web/trafilatura_web/base.py
class TrafilaturaWebReader(BasePydanticReader):
    """Trafilatura 网页读取器。

    从网页中读取页面。
    需要 `trafilatura` 包。"""

    is_remote: bool = True

    @classmethod
    def class_name(cls) -> str:
        """获取类的名称标识符。"""
        return "TrafilaturaWebReader"

    def load_data(
        self,
        urls: List[str],
        include_comments=True,
        output_format="txt",
        include_tables=True,
        include_images=False,
        include_formatting=False,
        include_links=False,
        show_progress=False,
        **kwargs,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    include_comments(bool,可选):是否在输出中包含注释。默认为True。
    output_format(str,可选):输出格式。默认为'txt'。
    include_tables(bool,可选):是否在输出中包含表格。默认为True。
    include_images(bool,可选):是否在输出中包含图像。默认为False。
    include_formatting(bool,可选):是否在输出中包含格式。默认为False。
    include_links(bool,可选):是否在输出中包含链接。默认为False。
    show_progress(bool,可选):是否显示进度条。默认为False。
    kwargs:`trafilatura.extract`函数的额外关键字参数。

Returns:
    List[Document]:文档列表。
"""
        import trafilatura

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []

        if show_progress:
            from tqdm import tqdm

            iterator = tqdm(urls, desc="Downloading pages")
        else:
            iterator = urls
        for url in iterator:
            downloaded = trafilatura.fetch_url(url)
            response = trafilatura.extract(
                downloaded,
                include_comments=include_comments,
                output_format=output_format,
                include_tables=include_tables,
                include_images=include_images,
                include_formatting=include_formatting,
                include_links=include_links,
                **kwargs,
            )
            documents.append(Document(text=response, id_=url))

        return documents
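
A brief usage sketch following the signature above (the import path is an assumption; the URL is illustrative):

from llama_index.readers.web import TrafilaturaWebReader

reader = TrafilaturaWebReader()
documents = reader.load_data(
    urls=["https://example.com/article"],
    include_comments=False,  # drop reader comments from the extracted text
    output_format="txt",
    show_progress=True,      # requires tqdm
)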

class_name classmethod #

class_name() -> str

Get the name identifier of the class.

Source code in llama_index/readers/web/trafilatura_web/base.py
@classmethod
def class_name(cls) -> str:
    """获取类的名称标识符。"""
    return "TrafilaturaWebReader"

load_data #

load_data(
    urls: List[str],
    include_comments=True,
    output_format="txt",
    include_tables=True,
    include_images=False,
    include_formatting=False,
    include_links=False,
    show_progress=False,
    **kwargs
) -> List[Document]

Load data from the urls.

Returns:

Type Description
List[Document]

List of documents.

Source code in llama_index/readers/web/trafilatura_web/base.py
    def load_data(
        self,
        urls: List[str],
        include_comments=True,
        output_format="txt",
        include_tables=True,
        include_images=False,
        include_formatting=False,
        include_links=False,
        show_progress=False,
        **kwargs,
    ) -> List[Document]:
        """从URL加载数据。

Args:
    urls(List[str]):要抓取的URL列表。
    include_comments(bool,可选):是否在输出中包含注释。默认为True。
    output_format(str,可选):输出格式。默认为'txt'。
    include_tables(bool,可选):是否在输出中包含表格。默认为True。
    include_images(bool,可选):是否在输出中包含图像。默认为False。
    include_formatting(bool,可选):是否在输出中包含格式。默认为False。
    include_links(bool,可选):是否在输出中包含链接。默认为False。
    show_progress(bool,可选):是否显示进度条。默认为False。
    kwargs:`trafilatura.extract`函数的额外关键字参数。

Returns:
    List[Document]:文档列表。
"""
        import trafilatura

        if not isinstance(urls, list):
            raise ValueError("urls must be a list of strings.")
        documents = []

        if show_progress:
            from tqdm import tqdm

            iterator = tqdm(urls, desc="Downloading pages")
        else:
            iterator = urls
        for url in iterator:
            downloaded = trafilatura.fetch_url(url)
            response = trafilatura.extract(
                downloaded,
                include_comments=include_comments,
                output_format=output_format,
                include_tables=include_tables,
                include_images=include_images,
                include_formatting=include_formatting,
                include_links=include_links,
                **kwargs,
            )
            documents.append(Document(text=response, id_=url))

        return documents
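
Because extra keyword arguments are forwarded verbatim to trafilatura.extract, extraction behaviour can be tuned per call. A sketch, assuming the installed trafilatura version supports the favor_precision flag:

reader = TrafilaturaWebReader()
documents = reader.load_data(
    urls=["https://example.com/article"],
    favor_precision=True,  # assumed trafilatura.extract option: prefer precision over recall
)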

UnstructuredURLLoader #

Bases: BaseReader

Loader that uses unstructured to load HTML files.

Source code in llama_index/readers/web/unstructured_web/base.py
class UnstructuredURLLoader(BaseReader):
    """使用unstructured来加载HTML文件的加载器。"""

    def __init__(
        self, urls: List[str], continue_on_failure: bool = True, headers: dict = {}
    ):
        """使用文件路径进行初始化。"""
        try:
            import unstructured  # noqa:F401
            from unstructured.__version__ import __version__ as __unstructured_version__

            self.__version = __unstructured_version__
        except ImportError:
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )

        if not self.__is_headers_available() and len(headers.keys()) != 0:
            logger.warning(
                "You are using old version of unstructured. "
                "The headers parameter is ignored"
            )

        self.urls = urls
        self.continue_on_failure = continue_on_failure
        self.headers = headers

    def __is_headers_available(self) -> bool:
        _unstructured_version = self.__version.split("-")[0]
        unstructured_version = tuple([int(x) for x in _unstructured_version.split(".")])

        return unstructured_version >= (0, 5, 7)

    def load_data(self) -> List[Document]:
        """加载文件。"""
        from unstructured.partition.html import partition_html

        docs: List[Document] = []
        for url in self.urls:
            try:
                if self.__is_headers_available():
                    elements = partition_html(url=url, headers=self.headers)
                else:
                    elements = partition_html(url=url)
                text = "\n\n".join([str(el) for el in elements])
                metadata = {"source": url}
                docs.append(Document(text=text, extra_info=metadata))
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching or processing {url}, exception: {e}")
                else:
                    raise e  # noqa: TRY201
        return docs
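
A minimal usage sketch based on the constructor above (the import path is an assumption; note that headers are only honoured on unstructured >= 0.5.7):

from llama_index.readers.web import UnstructuredURLLoader

loader = UnstructuredURLLoader(
    urls=["https://example.com/page.html"],
    continue_on_failure=True,  # log and skip URLs that fail instead of raising
    headers={"User-Agent": "llama-index-reader"},  # ignored on older unstructured versions
)
documents = loader.load_data()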

load_data #

load_data() -> List[Document]

Load data from the urls.

Source code in llama_index/readers/web/unstructured_web/base.py
def load_data(self) -> List[Document]:
    """加载文件。"""
    from unstructured.partition.html import partition_html

    docs: List[Document] = []
    for url in self.urls:
        try:
            if self.__is_headers_available():
                elements = partition_html(url=url, headers=self.headers)
            else:
                elements = partition_html(url=url)
            text = "\n\n".join([str(el) for el in elements])
            metadata = {"source": url}
            docs.append(Document(text=text, extra_info=metadata))
        except Exception as e:
            if self.continue_on_failure:
                logger.error(f"Error fetching or processing {url}, exception: {e}")
            else:
                raise e  # noqa: TRY201
    return docs

WholeSiteReader #

Bases: BaseReader

BFS web crawler for websites.

This class provides functionality to crawl an entire website using a breadth-first search (BFS) algorithm. It navigates web pages starting from a given base URL, following only links that match a specified prefix.

Attributes:

prefix (str): URL prefix to focus the crawl on.
max_depth (int): Maximum depth of the BFS algorithm.

Parameters:

Name Type Description Default
prefix str

URL prefix for the crawl.

required
max_depth int

Maximum depth of the BFS. Defaults to 10.

10
Source code in llama_index/readers/web/whole_site/base.py
class WholeSiteReader(BaseReader):
    """BFS网页爬虫用于网站。

该类提供了使用广度优先搜索算法来爬取整个网站的功能。
它从给定的基本URL导航网页,跟踪与指定前缀匹配的链接。

属性:
    prefix (str): 用于聚焦爬取的URL前缀。
    max_depth (int): BFS算法的最大深度。

Args:
    prefix (str): 用于爬取的URL前缀。
    max_depth (int, 可选): BFS的最大深度。默认为10。"""

    def __init__(
        self,
        prefix: str,
        max_depth: int = 10,
        driver: Optional[webdriver.Chrome] = None,
    ) -> None:
        """
        Initialize the WholeSiteReader with the provided prefix and maximum depth.
        """
        self.prefix = prefix
        self.max_depth = max_depth
        self.driver = driver if driver else self.setup_driver()

    def setup_driver(self):
        """设置Selenium WebDriver用于Chrome。

返回:
    WebDriver:Chrome WebDriver的一个实例。
"""
        try:
            import chromedriver_autoinstaller
        except ImportError:
            raise ImportError("Please install chromedriver_autoinstaller")

        opt = webdriver.ChromeOptions()
        opt.add_argument("--start-maximized")
        chromedriver_autoinstaller.install()
        return webdriver.Chrome(options=opt)

    def clean_url(self, url):
        return url.split("#")[0]

    def restart_driver(self):
        self.driver.quit()
        self.driver = self.setup_driver()

    def extract_content(self):
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        body_element = self.driver.find_element(By.TAG_NAME, "body")
        return body_element.text.strip()

    def extract_links(self):
        js_script = """
            var links = [];
            var elements = document.getElementsByTagName('a');
            for (var i = 0; i < elements.length; i++) {
                var href = elements[i].href;
                if (href) {
                    links.push(href);
                }
            }
            return links;
            """
        return self.driver.execute_script(js_script)

    def load_data(self, base_url: str) -> List[Document]:
        """使用BFS算法从基本URL加载数据。

Args:
    base_url(str):开始爬取的基本URL。

Returns:
    List[Document]:爬取的文档列表。
"""
        added_urls = set()
        urls_to_visit = [(base_url, 0)]
        documents = []

        while urls_to_visit:
            current_url, depth = urls_to_visit.pop(0)
            print(f"Visiting: {current_url}, {len(urls_to_visit)} left")

            try:
                self.driver.get(current_url)
                page_content = self.extract_content()
                added_urls.add(current_url)

                next_depth = depth + 1
                if next_depth <= self.max_depth:
                    # links = self.driver.find_elements(By.TAG_NAME, 'a')
                    links = self.extract_links()
                    # clean all urls
                    links = [self.clean_url(link) for link in links]
                    # extract new links
                    links = [link for link in links if link not in added_urls]
                    print(f"Found {len(links)} new potential links")

                    for href in links:
                        try:
                            if href.startswith(self.prefix) and href not in added_urls:
                                urls_to_visit.append((href, next_depth))
                                added_urls.add(href)
                        except Exception:
                            continue

                documents.append(
                    Document(text=page_content, extra_info={"URL": current_url})
                )
                time.sleep(1)

            except WebDriverException:
                print("WebDriverException encountered, restarting driver...")
                self.restart_driver()
            except Exception as e:
                print(f"An unexpected exception occurred: {e}, skipping URL...")
                continue

        self.driver.quit()
        return documents
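
A usage sketch for the crawler above (URLs are illustrative; the import path is an assumption):

from llama_index.readers.web import WholeSiteReader

scraper = WholeSiteReader(
    prefix="https://docs.llamaindex.ai/en/stable/",  # only follow links under this prefix
    max_depth=2,  # keep the BFS shallow for a small crawl
)
documents = scraper.load_data(base_url="https://docs.llamaindex.ai/en/stable/")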

setup_driver #

setup_driver()

Set up a Selenium WebDriver for Chrome.

Returns: WebDriver: An instance of the Chrome WebDriver.

Source code in llama_index/readers/web/whole_site/base.py
    def setup_driver(self):
        """设置Selenium WebDriver用于Chrome。

返回:
    WebDriver:Chrome WebDriver的一个实例。
"""
        try:
            import chromedriver_autoinstaller
        except ImportError:
            raise ImportError("Please install chromedriver_autoinstaller")

        opt = webdriver.ChromeOptions()
        opt.add_argument("--start-maximized")
        chromedriver_autoinstaller.install()
        return webdriver.Chrome(options=opt)
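
Since __init__ accepts an optional preconfigured webdriver.Chrome instance, a custom (for example headless) driver can replace the one created by setup_driver. A sketch, assuming Selenium and a compatible chromedriver are installed:

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless=new")  # run Chrome without a visible window
custom_driver = webdriver.Chrome(options=options)

reader = WholeSiteReader(
    prefix="https://docs.llamaindex.ai/en/stable/",
    driver=custom_driver,  # bypasses setup_driver()
)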

load_data #

load_data(base_url: str) -> List[Document]

Load data from the base URL using the BFS algorithm.

Returns:

Type Description
List[Document]

List of crawled documents.

Source code in llama_index/readers/web/whole_site/base.py
    def load_data(self, base_url: str) -> List[Document]:
        """使用BFS算法从基本URL加载数据。

Args:
    base_url(str):开始爬取的基本URL。

Returns:
    List[Document]:爬取的文档列表。
"""
        added_urls = set()
        urls_to_visit = [(base_url, 0)]
        documents = []

        while urls_to_visit:
            current_url, depth = urls_to_visit.pop(0)
            print(f"Visiting: {current_url}, {len(urls_to_visit)} left")

            try:
                self.driver.get(current_url)
                page_content = self.extract_content()
                added_urls.add(current_url)

                next_depth = depth + 1
                if next_depth <= self.max_depth:
                    # links = self.driver.find_elements(By.TAG_NAME, 'a')
                    links = self.extract_links()
                    # clean all urls
                    links = [self.clean_url(link) for link in links]
                    # extract new links
                    links = [link for link in links if link not in added_urls]
                    print(f"Found {len(links)} new potential links")

                    for href in links:
                        try:
                            if href.startswith(self.prefix) and href not in added_urls:
                                urls_to_visit.append((href, next_depth))
                                added_urls.add(href)
                        except Exception:
                            continue

                documents.append(
                    Document(text=page_content, extra_info={"URL": current_url})
                )
                time.sleep(1)

            except WebDriverException:
                print("WebDriverException encountered, restarting driver...")
                self.restart_driver()
            except Exception as e:
                print(f"An unexpected exception occurred: {e}, skipping URL...")
                continue

        self.driver.quit()
        return documents