跳至内容

BoardDocs

BoardDocsReader #

基类: BaseReader

BoardDocs文档阅读器。

读取BoardDocs网站上包含的公开议程。

参数:

名称 类型 描述 默认值
site str

您想要索引的BoardDocs站点,例如"ca/redwood"

required
committee_id str

您想要索引的网站上的委员会

required
Source code in llama-index-integrations/readers/llama-index-readers-boarddocs/llama_index/readers/boarddocs/base.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
class BoardDocsReader(BaseReader):
    """
    BoardDocs doc reader.

    Read public agendas included on a BoardDocs site.

    Args:
        site (str): The BoardDocs site you'd like to index, e.g. "ca/redwood"
        committee_id (str): The committee on the site you want to index

    """

    def __init__(
        self,
        site: str,
        committee_id: str,
    ) -> None:
        """Initialize with parameters."""
        self.site = site
        self.committee_id = committee_id
        self.base_url = "https://go.boarddocs.com/" + site + "/Board.nsf"

        # set up the headers required for the server to answer
        self.headers = {
            "accept": "application/json, text/javascript, */*; q=0.01",
            "accept-language": "en-US,en;q=0.9",
            "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
            "sec-ch-ua": (
                '"Google Chrome";v="113", "Chromium";v="113", "Not-A.Brand";v="24"'
            ),
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"macOS"',
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
            "x-requested-with": "XMLHttpRequest",
        }
        super().__init__()

    @staticmethod
    def _div_string(soup: Any, css_class: str) -> Optional[str]:
        """Return ``.string`` of the first ``<div>`` with *css_class*, or None if absent."""
        div = soup.find("div", {"class": css_class})
        return div.string if div is not None else None

    def get_meeting_list(self) -> List[dict]:
        """
        Returns a list of meetings for the committee.

        Args:
            None
        Returns:
            List[dict]: A list of meetings, each with a meetingID, date, and unid

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.

        """
        meeting_list_url = self.base_url + "/BD-GetMeetingsList?open"

        data = "current_committee_id=" + self.committee_id
        response = requests.post(meeting_list_url, headers=self.headers, data=data)
        # fail loudly on an HTTP error instead of trying to parse an error page
        response.raise_for_status()
        meetings_data = json.loads(response.text)

        return [
            {
                "meetingID": meeting.get("unique", None),
                "date": meeting.get("numberdate", None),
                "unid": meeting.get("unid", None),
            }
            for meeting in meetings_data
        ]

    def process_meeting(
        self, meeting_id: str, index_pdfs: bool = True
    ) -> List[Document]:
        """
        Returns documents from the given meeting.

        Args:
            meeting_id (str): The ID of the meeting whose agenda is fetched.
            index_pdfs (bool): Accepted for interface compatibility; currently
                unused because linked PDFs are not indexed yet.

        Returns:
            List[Document]: A single-element list holding the agenda document.

        Raises:
            requests.HTTPError: If the server answers with a 4xx/5xx status.

        """
        agenda_url = self.base_url + "/PRINT-AgendaDetailed"

        # set the meetingID & committee
        data = "id=" + meeting_id + "&" + "current_committee_id=" + self.committee_id

        # POST the request!
        response = requests.post(agenda_url, headers=self.headers, data=data)
        # fail loudly on an HTTP error instead of indexing an error page
        response.raise_for_status()

        # parse the returned HTML; the date/title divs may be missing, in which
        # case the corresponding metadata value is None rather than a crash
        soup = BeautifulSoup(response.content, "html.parser")
        agenda_date = self._div_string(soup, "print-meeting-date")
        agenda_title = self._div_string(soup, "print-meeting-name")
        agenda_data = html2text.html2text(response.text)

        # TODO: index the PDFs linked from the "public-file" divs

        agenda_doc = Document(
            text=agenda_data,
            doc_id=meeting_id,
            extra_info={
                "committee": self.committee_id,
                "title": agenda_title,
                "date": agenda_date,
                "url": agenda_url,
            },
        )
        return [agenda_doc]

    def load_data(
        self, meeting_ids: Optional[List[str]] = None, **load_kwargs: Any
    ) -> List[Document]:
        """
        Load all meetings of the committee.

        Args:
            meeting_ids (List[str]): A list of meeting IDs to load. If None
                (or empty), load all meetings.
            **load_kwargs: Accepted but not used by this reader.

        Returns:
            List[Document]: The documents of every processed meeting, in order.

        """
        # if a list of meetings wasn't provided, enumerate them all
        if not meeting_ids:
            meeting_ids = [
                meeting.get("meetingID") for meeting in self.get_meeting_list()
            ]

        # process all relevant meetings & return the documents
        docs = []
        for meeting_id in meeting_ids:
            docs.extend(self.process_meeting(meeting_id))
        return docs

get_meeting_list #

get_meeting_list() -> List[dict]

返回该委员会的会议列表。

返回: List[dict]: 会议列表,每个会议包含meetingID、日期和unid

Source code in llama-index-integrations/readers/llama-index-readers-boarddocs/llama_index/readers/boarddocs/base.py
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def get_meeting_list(self) -> List[dict]:
    """
    Returns a list of meetings for the committee.

    Args:
        None
    Returns:
        List[dict]: A list of meetings, each with a meetingID, date, and unid

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.

    """
    meeting_list_url = self.base_url + "/BD-GetMeetingsList?open"

    data = "current_committee_id=" + self.committee_id
    response = requests.post(meeting_list_url, headers=self.headers, data=data)
    # fail loudly on an HTTP error instead of trying to parse an error page
    response.raise_for_status()
    meetings_data = json.loads(response.text)

    return [
        {
            "meetingID": meeting.get("unique", None),
            "date": meeting.get("numberdate", None),
            "unid": meeting.get("unid", None),
        }
        for meeting in meetings_data
    ]

process_meeting #

process_meeting(meeting_id: str, index_pdfs: bool = True) -> List[Document]

返回给定会议中的文档。

Source code in llama-index-integrations/readers/llama-index-readers-boarddocs/llama_index/readers/boarddocs/base.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
def process_meeting(
    self, meeting_id: str, index_pdfs: bool = True
) -> List[Document]:
    """
    Returns documents from the given meeting.

    Args:
        meeting_id (str): The ID of the meeting whose agenda is fetched.
        index_pdfs (bool): Accepted for interface compatibility; currently
            unused because linked PDFs are not indexed yet.

    Returns:
        List[Document]: A single-element list holding the agenda document.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.

    """
    agenda_url = self.base_url + "/PRINT-AgendaDetailed"

    # set the meetingID & committee
    data = "id=" + meeting_id + "&" + "current_committee_id=" + self.committee_id

    # POST the request!
    response = requests.post(agenda_url, headers=self.headers, data=data)
    # fail loudly on an HTTP error instead of indexing an error page
    response.raise_for_status()

    # parse the returned HTML; the date/title divs may be missing, in which
    # case the corresponding metadata value is None rather than a crash
    soup = BeautifulSoup(response.content, "html.parser")
    date_div = soup.find("div", {"class": "print-meeting-date"})
    title_div = soup.find("div", {"class": "print-meeting-name"})
    agenda_date = date_div.string if date_div is not None else None
    agenda_title = title_div.string if title_div is not None else None
    agenda_data = html2text.html2text(response.text)

    # TODO: index the PDFs linked from the "public-file" divs

    agenda_doc = Document(
        text=agenda_data,
        doc_id=meeting_id,
        extra_info={
            "committee": self.committee_id,
            "title": agenda_title,
            "date": agenda_date,
            "url": agenda_url,
        },
    )
    return [agenda_doc]

load_data #

load_data(meeting_ids: Optional[List[str]] = None, **load_kwargs: Any) -> List[Document]

加载委员会的所有会议。

参数:

名称 类型 描述 默认值
meeting_ids List[str]

要加载的会议ID列表。如果为None,则加载所有会议。

None
Source code in llama-index-integrations/readers/llama-index-readers-boarddocs/llama_index/readers/boarddocs/base.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def load_data(
    self, meeting_ids: Optional[List[str]] = None, **load_kwargs: Any
) -> List[Document]:
    """
    Collect the documents of the requested meetings.

    Args:
        meeting_ids (List[str]): IDs of the meetings to load. When nothing is
            given (None or an empty list), every meeting reported by
            ``get_meeting_list`` is loaded.
        **load_kwargs: Accepted but not used.

    Returns:
        List[Document]: Documents of each meeting, concatenated in order.

    """
    if meeting_ids:
        targets = meeting_ids
    else:
        # nothing requested explicitly: fall back to the full meeting list
        targets = [entry.get("meetingID") for entry in self.get_meeting_list()]

    # flatten the per-meeting document lists into one result list
    return [
        document
        for target in targets
        for document in self.process_meeting(target)
    ]