Source code for langchain_community.document_loaders.github

import base64
from abc import ABC
from datetime import datetime
from typing import Callable, Dict, Iterator, List, Literal, Optional, Union
from urllib.parse import quote, urlencode

import requests
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator, validator
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders.base import BaseLoader


class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
    """Shared configuration for loaders that read from the GitHub REST API.

    Holds the repository name, the access token and the API base URL, and
    exposes the HTTP headers built from them.
    """

    repo: str
    """Name of the repository."""
    access_token: str
    """Personal access token - see https://github.com/settings/tokens?type=beta"""
    github_api_url: str = "https://api.github.com"
    """URL of the GitHub API."""

    @root_validator(pre=True, allow_reuse=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Populate ``access_token`` before field validation runs.

        The token is resolved with ``get_from_dict_or_env`` using the
        ``"access_token"`` key of ``values`` and the
        ``GITHUB_PERSONAL_ACCESS_TOKEN`` environment variable name.

        Args:
            values: Raw keyword arguments passed to the model.

        Returns:
            The same mapping with ``"access_token"`` set.
        """
        token = get_from_dict_or_env(
            values, "access_token", "GITHUB_PERSONAL_ACCESS_TOKEN"
        )
        values["access_token"] = token
        return values

    @property
    def headers(self) -> Dict[str, str]:
        """HTTP headers sent with every API request (media type and bearer auth)."""
        bearer = f"Bearer {self.access_token}"
        return {
            "Accept": "application/vnd.github+json",
            "Authorization": bearer,
        }
class GitHubIssuesLoader(BaseGitHubLoader):
    """Load the issues of a GitHub repository."""

    include_prs: bool = True
    """If True, pull requests are included in the results; otherwise they are
    skipped."""
    milestone: Union[int, Literal["*", "none"], None] = None
    """If an integer is passed, it should be a milestone's number field.
    If the string '*' is passed, issues with any milestone are accepted.
    If the string 'none' is passed, issues without milestones are returned."""
    state: Optional[Literal["open", "closed", "all"]] = None
    """Filter on issue state. Can be one of: 'open', 'closed', 'all'."""
    assignee: Optional[str] = None
    """Filter on assigned user. Pass 'none' for no user and '*' for any user."""
    creator: Optional[str] = None
    """Filter on the user that created the issue."""
    mentioned: Optional[str] = None
    """Filter on a user that is mentioned in the issue."""
    labels: Optional[List[str]] = None
    """Label names to filter on. Example: bug,ui,@high."""
    sort: Optional[Literal["created", "updated", "comments"]] = None
    """What to sort results by. Can be one of: 'created', 'updated', 'comments'.
    Default is 'created'."""
    direction: Optional[Literal["asc", "desc"]] = None
    """The direction to sort the results by. Can be one of: 'asc', 'desc'."""
    since: Optional[str] = None
    """Only show issues updated after the given time.
    This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ."""
    page: Optional[int] = None
    """The page number for paginated results. Defaults to 1 in the GitHub API."""
    per_page: Optional[int] = None
    """Number of items per page. Defaults to 30 in the GitHub API."""

    @validator("since", allow_reuse=True)
    def validate_since(cls, v: Optional[str]) -> Optional[str]:
        """Check that ``since`` matches the ``YYYY-MM-DDTHH:MM:SSZ`` format.

        Args:
            v: The value supplied for ``since``; falsy values are not checked.

        Returns:
            ``v`` unchanged.

        Raises:
            ValueError: If ``v`` is non-empty and does not match the format.
        """
        if v:
            try:
                datetime.strptime(v, "%Y-%m-%dT%H:%M:%SZ")
            except ValueError as err:
                raise ValueError(
                    "Invalid value for 'since'. Expected a date string in "
                    f"YYYY-MM-DDTHH:MM:SSZ format. Received: {v}"
                ) from err
        return v

    def lazy_load(self) -> Iterator[Document]:
        """Fetch the repository's issues and yield them one by one.

        Starts at ``self.url`` and follows the ``next`` link of each response
        only when neither ``page`` nor ``per_page`` is set; otherwise a single
        page is fetched. Pull requests are skipped when ``include_prs`` is
        False.

        Yields:
            One Document per issue with:
                - page_content: the issue body ("" when the body is null)
                - metadata:
                    - url
                    - title
                    - creator
                    - created_at
                    - comments
                    - state
                    - labels
                    - assignee
                    - milestone
                    - locked
                    - number
                    - is_pull_request

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url: Optional[str] = self.url
        while url:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()

            for issue in response.json():
                doc = self.parse_issue(issue)
                if not self.include_prs and doc.metadata["is_pull_request"]:
                    continue
                yield doc

            # An explicit page/per_page means the caller asked for one
            # specific page, so pagination links are not followed.
            follow_next = (
                response.links
                and response.links.get("next")
                and (not self.page and not self.per_page)
            )
            url = response.links["next"]["url"] if follow_next else None

    def parse_issue(self, issue: dict) -> Document:
        """Create a Document from one element of the GitHub issue list.

        Args:
            issue: A single issue object as returned by the issues endpoint.

        Returns:
            A Document whose page_content is the issue body ("" when null) and
            whose metadata holds the keys listed in ``lazy_load``.
        """
        metadata = {
            "url": issue["html_url"],
            "title": issue["title"],
            # ``user`` is nullable in the API schema, like assignee/milestone.
            "creator": issue["user"]["login"] if issue["user"] else None,
            "created_at": issue["created_at"],
            "comments": issue["comments"],
            "state": issue["state"],
            "labels": [label["name"] for label in issue["labels"]],
            "assignee": issue["assignee"]["login"] if issue["assignee"] else None,
            "milestone": issue["milestone"]["title"] if issue["milestone"] else None,
            "locked": issue["locked"],
            "number": issue["number"],
            "is_pull_request": "pull_request" in issue,
        }
        content = issue["body"] if issue["body"] is not None else ""
        return Document(page_content=content, metadata=metadata)

    @property
    def query_params(self) -> str:
        """Build the query string for the GitHub API request.

        Filters that are None (and an empty ``labels`` list) are omitted.
        Values are percent-encoded so characters such as ``&``, ``#``, ``=``
        or spaces cannot corrupt the query string; ``,``, ``@``, ``*`` and
        ``:`` are left as-is.
        """
        # An empty list must be dropped, not rendered as the text "[]".
        labels = ",".join(self.labels) if self.labels else None
        candidates = {
            "milestone": self.milestone,
            "state": self.state,
            "assignee": self.assignee,
            "creator": self.creator,
            "mentioned": self.mentioned,
            "labels": labels,
            "sort": self.sort,
            "direction": self.direction,
            "since": self.since,
            "page": self.page,
            "per_page": self.per_page,
        }
        active = {k: v for k, v in candidates.items() if v is not None}
        return urlencode(active, safe=",@*:", quote_via=quote)

    @property
    def url(self) -> str:
        """URL of the repository's issues endpoint, including the query string."""
        return f"{self.github_api_url}/repos/{self.repo}/issues?{self.query_params}"
class GithubFileLoader(BaseGitHubLoader, ABC):
    """Load the files of one branch of a GitHub repository as Documents."""

    # NOTE(review): not referenced by any method visible in this class.
    file_extension: str = ".md"
    # Branch used both for listing the tree and for fetching file contents.
    branch: str = "main"
    # Optional predicate on a file path; entries for which it returns a falsy
    # value are dropped. When unset, every entry is kept.
    file_filter: Optional[Callable[[str], bool]] = None

    def get_file_paths(self) -> List[Dict]:
        """List the entries of the branch's git tree, recursively.

        Returns:
            The tree entries (files and directories alike) that pass
            ``file_filter``. One element looks like::

                {
                    'path': '.github',
                    'mode': '040000',
                    'type': 'tree',
                    'sha': '5dc46e6b38b22707894ced126270b15e2f22f64e',
                    'url': 'https://api.github.com/repos/langchain-ai/langchain/git/blobs/5dc46e6b38b22707894ced126270b15e2f22f64e'
                }

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        tree_url = (
            f"{self.github_api_url}/repos/{self.repo}/git/trees/"
            f"{self.branch}?recursive=1"
        )
        response = requests.get(tree_url, headers=self.headers)
        response.raise_for_status()
        entries = response.json()["tree"]
        return [
            entry
            for entry in entries
            if not self.file_filter or self.file_filter(entry["path"])
        ]

    def get_file_content_by_path(self, path: str) -> str:
        """Fetch and decode the content of one file on ``self.branch``.

        Args:
            path: Repository-relative path of the file.

        Returns:
            The base64-decoded content as UTF-8 text, or "" when the API
            response is not a JSON object (e.g. a directory listing).

        Raises:
            requests.HTTPError: If the API answers with an error status.
            UnicodeDecodeError: If the decoded bytes are not valid UTF-8.
        """
        # Quote the path so characters such as '#' or '?' stay part of it.
        contents_url = (
            f"{self.github_api_url}/repos/{self.repo}/contents/{quote(path)}"
        )
        # Without ``ref`` the API serves the default branch, which may differ
        # from the branch the tree was listed from.
        response = requests.get(
            contents_url, headers=self.headers, params={"ref": self.branch}
        )
        response.raise_for_status()
        payload = response.json()
        if isinstance(payload, dict):
            return base64.b64decode(payload["content"]).decode("utf-8")
        return ""

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per non-empty file of the branch.

        Yields:
            Documents whose page_content is the file text and whose metadata
            holds ``path``, ``sha`` and ``source``.
        """
        for file in self.get_file_paths():
            # Only blobs carry file content; skipping directories and
            # submodules here avoids one pointless API request per entry.
            if file["type"] != "blob":
                continue
            content = self.get_file_content_by_path(file["path"])
            if content == "":
                continue
            metadata = {
                "path": file["path"],
                "sha": file["sha"],
                "source": f"{self.github_api_url}/{self.repo}/{file['type']}/"
                f"{self.branch}/{file['path']}",
            }
            yield Document(page_content=content, metadata=metadata)