Skip to content

Wordpress

WordpressReader #

Bases: BaseReader

WordPress 阅读器。从 WordPress 站点读取文章数据。

Parameters:

Name Type Description Default
url str

WordPress 站点的基础 URL(请求时会在其后拼接 /wp-json/wp/v2/posts)

required
password str

密码(保存在实例上)

required
username str

用户名(保存在实例上)

required
Source code in llama_index/readers/wordpress/base.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
class WordpressReader(BaseReader):
    """Reader that loads the posts of a WordPress site as documents.

    Posts are fetched page by page from ``{url}/wp-json/wp/v2/posts`` and the
    HTML of each post is reduced to plain text.

    Args:
        url (str): Base URL of the site; ``/wp-json/wp/v2/posts`` is appended
            to it when requesting posts.
        password (str): Stored on the instance; not sent by any request made
            in this class.
        username (str): Stored on the instance; not sent by any request made
            in this class.
    """

    def __init__(self, url: str, password: str, username: str) -> None:
        """Store the site URL and the credentials on the instance."""
        self.url = url
        self.username = username
        self.password = password
        # NOTE(review): username/password are kept but never attached to a
        # request below -- confirm whether authenticated access was intended.

    @staticmethod
    def _rendered_value(field):
        """Return ``field["rendered"]`` for a dict field, else ``field`` itself."""
        if isinstance(field, dict):
            return field.get("rendered")
        return field

    def load_data(self) -> List[Document]:
        """Load every post of the site as a plain-text document.

        Returns:
            List[Document]: One document per post. The text is the post
            content with HTML markup stripped; ``extra_info`` carries the
            post's ``id``, ``title``, ``url`` (the post's ``link``) and
            ``updated_at`` (the post's ``modified`` value).

        Raises:
            KeyError: If a post lacks ``id``, ``link`` or ``modified``.
        """
        from bs4 import BeautifulSoup

        results = []

        for article in self.get_all_posts():
            # ``content`` and ``title`` may be ``{"rendered": ...}`` objects or
            # plain strings; calling ``.get`` on a string would raise, so the
            # shape is checked explicitly.
            body = self._rendered_value(article.get("content"))
            title = self._rendered_value(article.get("title"))

            # An empty or missing body becomes "" so the parser always
            # receives a string.
            text = BeautifulSoup(body or "", "html.parser").get_text()

            extra_info = {
                "id": article["id"],
                "title": title,
                "url": article["link"],
                "updated_at": article["modified"],
            }

            results.append(
                Document(
                    text=text,
                    extra_info=extra_info,
                )
            )
        return results

    def get_all_posts(self):
        """Collect the posts of every page, following ``next_page`` until it is None.

        Returns:
            list: The concatenated ``articles`` of all pages, in page order.
        """
        posts = []
        next_page = 1

        while next_page is not None:
            page = self.get_posts_page(next_page)
            posts.extend(page["articles"])
            next_page = page["next_page"]

        return posts

    def get_posts_page(self, current_page: int = 1):
        """Fetch one page (up to 100 posts) from the posts endpoint.

        Args:
            current_page (int): 1-based page number to request.

        Returns:
            dict: ``{"articles": <decoded JSON body>, "next_page": <int or None>}``
            where ``next_page`` is ``current_page + 1`` while the
            ``X-WP-TotalPages`` response header reports more pages, else None.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        import requests

        # Tolerate a trailing slash on the configured URL to avoid "//wp-json".
        base_url = self.url.rstrip("/")
        url = f"{base_url}/wp-json/wp/v2/posts?per_page=100&page={current_page}"

        response = requests.get(url)
        # Fail here on HTTP errors: an error payload is a JSON object, and
        # letting it through would corrupt the list built by get_all_posts.
        response.raise_for_status()

        # A missing header is treated as a single-page result.
        num_pages = int(response.headers.get("X-WP-TotalPages", 1))
        next_page = current_page + 1 if num_pages > current_page else None

        articles = json.loads(response.text)

        return {"articles": articles, "next_page": next_page}

load_data #

load_data() -> List[Document]

从 WordPress 站点加载所有文章。

返回: List[Document]: 文档列表。

Source code in llama_index/readers/wordpress/base.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
    def load_data(self) -> List[Document]:
        """Load every post returned by ``get_all_posts`` as a plain-text document.

        Returns:
            List[Document]: One document per post. The text is the post
            content with HTML markup stripped; ``extra_info`` carries the
            post's ``id``, ``title``, ``url`` (the post's ``link``) and
            ``updated_at`` (the post's ``modified`` value).

        Raises:
            KeyError: If a post lacks ``id``, ``link`` or ``modified``.
        """
        from bs4 import BeautifulSoup

        results = []

        for article in self.get_all_posts():
            # ``content`` may be a ``{"rendered": ...}`` object or a plain
            # string; calling ``.get`` on a string would raise, so the shape
            # is checked explicitly.
            body = article.get("content")
            if isinstance(body, dict):
                body = body.get("rendered")

            # An empty or missing body becomes "" so the parser always
            # receives a string.
            text = BeautifulSoup(body or "", "html.parser").get_text()

            # Same two possible shapes for ``title``.
            title = article.get("title")
            if isinstance(title, dict):
                title = title.get("rendered")

            extra_info = {
                "id": article["id"],
                "title": title,
                "url": article["link"],
                "updated_at": article["modified"],
            }

            results.append(
                Document(
                    text=text,
                    extra_info=extra_info,
                )
            )
        return results