Bases: BaseReader
Asynchronous Stripe Docs reader.
Reads pages from the Stripe documentation based on the sitemap.xml.
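A minimal usage sketch. The import path follows the source location shown below, and the filter values are illustrative assumptions; filters are matched as substrings against each sitemap <loc> URL.

```python
# Minimal usage sketch. The filter values below are assumptions for illustration;
# any URL substring works, since load_data() does plain substring matching.
from llama_index.readers.stripe_docs import StripeDocsReader

# html_to_text=True converts fetched HTML to plain text; limit caps concurrent requests.
reader = StripeDocsReader(html_to_text=True, limit=10)

# Load only documentation pages whose URL contains one of these substrings.
documents = reader.load_data(filters=["/docs/billing", "/docs/payments"])
print(f"Loaded {len(documents)} pages from the Stripe docs")
```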
Source code in llama_index/readers/stripe_docs/base.py
class StripeDocsReader(BaseReader):
    """Asynchronous Stripe Docs reader.

    Reads pages from the Stripe documentation based on the sitemap.xml.

    Args:
        html_to_text (bool): Whether to convert HTML pages to plain text.
        limit (int): Maximum number of concurrent requests.
    """

    def __init__(self, html_to_text: bool = False, limit: int = 10) -> None:
        self._async_loader = AsyncWebPageReader(html_to_text=html_to_text, limit=limit)
        self._html_to_text = html_to_text
        self._limit = limit

    def _load_url(self, url: str) -> str:
        return urllib.request.urlopen(url).read()

    def _load_sitemap(self) -> str:
        return self._load_url(STRIPE_SITEMAP_URL)

    def _parse_sitemap(
        self, raw_sitemap: str, filters: List[str] = DEFAULT_FILTERS
    ) -> List:
        root_sitemap = ET.fromstring(raw_sitemap)
        sitemap_partition_urls = []
        sitemap_urls = []

        # The top-level sitemap is an index: collect the <sitemap><loc> partition URLs.
        for sitemap in root_sitemap.findall(f"{{{XML_SITEMAP_SCHEMA}}}sitemap"):
            loc = sitemap.find(f"{{{XML_SITEMAP_SCHEMA}}}loc").text
            sitemap_partition_urls.append(loc)

        for sitemap_partition_url in sitemap_partition_urls:
            sitemap_partition = ET.fromstring(self._load_url(sitemap_partition_url))

            # Find all <url /> entries and keep those whose <loc> matches a filter.
            for url in sitemap_partition.findall(f"{{{XML_SITEMAP_SCHEMA}}}url"):
                loc = url.find(f"{{{XML_SITEMAP_SCHEMA}}}loc").text
                contains_filter = any(filter in loc for filter in filters)

                if contains_filter:
                    sitemap_urls.append(loc)

        return sitemap_urls

    def load_data(self, filters: List[str] = DEFAULT_FILTERS) -> List[Document]:
        sitemap = self._load_sitemap()
        sitemap_urls = self._parse_sitemap(sitemap, filters)

        return self._async_loader.load_data(urls=sitemap_urls)
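The _parse_sitemap method relies on ElementTree's namespace-qualified tag lookups (Clark notation, {namespace}tag) against the standard sitemaps.org 0.9 schema. A self-contained sketch with made-up sitemap XML, not fetched from Stripe, showing the same lookup and filtering pattern:

```python
# Standalone sketch of the namespace-qualified lookups used in _parse_sitemap.
# The XML below is an invented example following the sitemaps.org 0.9 schema.
import xml.etree.ElementTree as ET

XML_SITEMAP_SCHEMA = "http://www.sitemaps.org/schemas/sitemap/0.9"

sitemap_index = """<?xml version="1.0" encoding="UTF-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <sitemap><loc>https://example.com/sitemap/partition-0.xml</loc></sitemap>
</sitemapindex>"""

url_set = """<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url><loc>https://example.com/docs/billing</loc></url>
  <url><loc>https://example.com/blog/post</loc></url>
</urlset>"""

# ElementTree requires the namespace in Clark notation: {namespace}tag.
index_root = ET.fromstring(sitemap_index)
partitions = [
    sitemap.find(f"{{{XML_SITEMAP_SCHEMA}}}loc").text
    for sitemap in index_root.findall(f"{{{XML_SITEMAP_SCHEMA}}}sitemap")
]
print(partitions)  # ['https://example.com/sitemap/partition-0.xml']

# Keep only URLs whose <loc> contains one of the filter substrings.
urls_root = ET.fromstring(url_set)
filtered = [
    url.find(f"{{{XML_SITEMAP_SCHEMA}}}loc").text
    for url in urls_root.findall(f"{{{XML_SITEMAP_SCHEMA}}}url")
    if "/docs" in url.find(f"{{{XML_SITEMAP_SCHEMA}}}loc").text
]
print(filtered)  # ['https://example.com/docs/billing']
```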