import base64
from abc import ABC
from datetime import datetime
from typing import Callable, Dict, Iterator, List, Literal, Optional, Union
from urllib.parse import quote

import requests
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator, validator
from langchain_core.utils import get_from_dict_or_env

from langchain_community.document_loaders.base import BaseLoader
class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
    """Shared base for loaders that read from the GitHub REST API.

    Holds the target repository, the access token and the API base URL,
    and builds the HTTP headers sent with each request.
    """

    repo: str
    """Name of the repository."""
    access_token: str
    """Personal access token - see https://github.com/settings/tokens?type=beta"""
    github_api_url: str = "https://api.github.com"
    """URL of the GitHub API."""

    @root_validator(pre=True, allow_reuse=True)
    def validate_environment(cls, values: Dict) -> Dict:
        """Resolve the access token before field validation runs.

        The token is looked up with ``get_from_dict_or_env`` under the
        ``access_token`` key of ``values`` and the
        ``GITHUB_PERSONAL_ACCESS_TOKEN`` environment variable.

        Args:
            values: Raw field values passed to the model.

        Returns:
            The same mapping with ``access_token`` set.
        """
        token = get_from_dict_or_env(
            values, "access_token", "GITHUB_PERSONAL_ACCESS_TOKEN"
        )
        values["access_token"] = token
        return values

    @property
    def headers(self) -> Dict[str, str]:
        """HTTP headers for API requests: JSON media type plus bearer token."""
        bearer = f"Bearer {self.access_token}"
        return {
            "Accept": "application/vnd.github+json",
            "Authorization": bearer,
        }
class GitHubIssuesLoader(BaseGitHubLoader):
    """Load the issues of a GitHub repository as documents.

    Apart from ``include_prs``, every field is sent as the query parameter
    of the same name to ``/repos/{repo}/issues``; fields left as ``None``
    are omitted from the request.
    """

    include_prs: bool = True
    """If True include pull requests in the results, otherwise ignore them."""
    milestone: Union[int, Literal["*", "none"], None] = None
    """If an integer is passed, it should be a milestone's number field.
    If the string '*' is passed, issues with any milestone are accepted.
    If the string 'none' is passed, issues without milestones are returned."""
    state: Optional[Literal["open", "closed", "all"]] = None
    """Filter on issue state. Can be one of: 'open', 'closed', 'all'."""
    assignee: Optional[str] = None
    """Filter on assigned user. Pass 'none' for no user and '*' for any user."""
    creator: Optional[str] = None
    """Filter on the user that created the issue."""
    mentioned: Optional[str] = None
    """Filter on a user that's mentioned in the issue."""
    labels: Optional[List[str]] = None
    """Label names to filter on. Example: bug,ui,@high."""
    sort: Optional[Literal["created", "updated", "comments"]] = None
    """What to sort results by. Can be one of: 'created', 'updated', 'comments'.
    Default is 'created'."""
    direction: Optional[Literal["asc", "desc"]] = None
    """The direction to sort the results by. Can be one of: 'asc', 'desc'."""
    since: Optional[str] = None
    """Only show issues updated at or after the given time.
    This is a timestamp in ISO 8601 format: YYYY-MM-DDTHH:MM:SSZ."""
    page: Optional[int] = None
    """The page number for paginated results.
    Defaults to 1 in the GitHub API."""
    per_page: Optional[int] = None
    """Number of items per page. Defaults to 30 in the GitHub API."""

    @validator("since", allow_reuse=True)
    def validate_since(cls, v: Optional[str]) -> Optional[str]:
        """Check that ``since`` is a ``YYYY-MM-DDTHH:MM:SSZ`` timestamp.

        Args:
            v: The value supplied for ``since``; falsy values are passed
                through without being parsed.

        Returns:
            ``v`` unchanged.

        Raises:
            ValueError: If ``v`` is non-empty and does not match the format.
        """
        if v:
            try:
                datetime.strptime(v, "%Y-%m-%dT%H:%M:%SZ")
            except ValueError as err:
                # Chain the parser error so the root cause stays visible.
                raise ValueError(
                    "Invalid value for 'since'. Expected a date string in "
                    f"YYYY-MM-DDTHH:MM:SSZ format. Received: {v}"
                ) from err
        return v

    def lazy_load(self) -> Iterator[Document]:
        """Fetch the repository's issues page by page and yield them lazily.

        Yields:
            One ``Document`` per issue. ``page_content`` is the issue body
            and ``metadata`` holds the keys built by ``parse_issue``:
            url, title, creator, created_at, comments, state, labels,
            assignee, milestone, locked, number, is_pull_request.

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        url: Optional[str] = self.url
        while url:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            for issue in response.json():
                doc = self.parse_issue(issue)
                if not self.include_prs and doc.metadata["is_pull_request"]:
                    continue
                yield doc
            # When the caller pinned `page` or `per_page`, only that single
            # page is returned; otherwise keep following the "next" link.
            follow_pagination = not self.page and not self.per_page
            next_link = response.links.get("next") if response.links else None
            url = next_link["url"] if follow_pagination and next_link else None

    def parse_issue(self, issue: dict) -> Document:
        """Create a ``Document`` from one issue object of the API response.

        Args:
            issue: A single decoded issue from the issues listing.

        Returns:
            A document whose content is the issue body (empty string when
            the body is null) and whose metadata summarises the issue.
        """
        metadata = {
            "url": issue["html_url"],
            "title": issue["title"],
            "creator": issue["user"]["login"],
            "created_at": issue["created_at"],
            "comments": issue["comments"],
            "state": issue["state"],
            "labels": [label["name"] for label in issue["labels"]],
            "assignee": issue["assignee"]["login"] if issue["assignee"] else None,
            "milestone": issue["milestone"]["title"] if issue["milestone"] else None,
            "locked": issue["locked"],
            "number": issue["number"],
            # Pull requests carry a "pull_request" key in the issues listing.
            "is_pull_request": "pull_request" in issue,
        }
        content = issue["body"] if issue["body"] is not None else ""
        return Document(page_content=content, metadata=metadata)

    @property
    def query_params(self) -> str:
        """Build the query string for the issues request.

        Filters that are ``None`` are left out, and an empty ``labels``
        list is treated as "no label filter". Values are percent-encoded
        so that characters such as spaces, ``&``, ``#`` or ``+`` in a label
        or user name cannot corrupt the query; ``*``, ``,``, ``:`` and
        ``@`` are kept literal because they are valid in a query component
        and appear in ordinary filter values.
        """
        labels = ",".join(self.labels) if self.labels else None
        raw_params = {
            "milestone": self.milestone,
            "state": self.state,
            "assignee": self.assignee,
            "creator": self.creator,
            "mentioned": self.mentioned,
            "labels": labels,
            "sort": self.sort,
            "direction": self.direction,
            "since": self.since,
            "page": self.page,
            "per_page": self.per_page,
        }
        return "&".join(
            f"{key}={quote(str(value), safe='*,:@')}"
            for key, value in raw_params.items()
            if value is not None
        )

    @property
    def url(self) -> str:
        """Full URL of the issues request, including the query string."""
        return f"{self.github_api_url}/repos/{self.repo}/issues?{self.query_params}"
class GithubFileLoader(BaseGitHubLoader, ABC):
    """Load the files of a GitHub repository branch as documents."""

    # NOTE(review): `file_extension` is not read by any method of this
    # class; filtering is done solely through `file_filter`. Confirm whether
    # extension-based filtering was intended.
    file_extension: str = ".md"
    branch: str = "main"
    """Branch whose tree is listed and whose file contents are fetched."""
    file_filter: Optional[Callable[[str], bool]] = None
    """Optional predicate on the entry path; entries for which it returns
    a falsy value are dropped. When unset, every entry is kept."""

    def get_file_paths(self) -> List[Dict]:
        """List the entries of the branch's git tree, recursively.

        Returns:
            The tree entries that pass ``file_filter`` (all of them when no
            filter is set).

        Raises:
            requests.HTTPError: If the API answers with an error status.
        """
        base_url = (
            f"{self.github_api_url}/repos/{self.repo}/git/trees/"
            f"{self.branch}?recursive=1"
        )
        response = requests.get(base_url, headers=self.headers)
        response.raise_for_status()
        all_files = response.json()["tree"]
        # Shape of one element of `all_files`:
        # {
        #     'path': '.github',
        #     'mode': '040000',
        #     'type': 'tree',
        #     'sha': '5dc46e6b38b22707894ced126270b15e2f22f64e',
        #     'url': '<API URL of the object>'
        # }
        return [
            f
            for f in all_files
            if not (self.file_filter and not self.file_filter(f["path"]))
        ]

    def get_file_content_by_path(self, path: str) -> str:
        """Fetch and decode the content of one file on ``self.branch``.

        Args:
            path: Repository-relative path of the file (not URL-encoded).

        Returns:
            The file's text decoded as UTF-8, or ``""`` when the response is
            not a single-file object or carries no content.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            UnicodeDecodeError: If the file content is not valid UTF-8.
        """
        # Percent-encode the path so characters such as '#', '?' or '%' in
        # a file name stay part of the path ('/' is kept as a separator).
        base_url = f"{self.github_api_url}/repos/{self.repo}/contents/{quote(path)}"
        # Without `ref` the contents endpoint serves the default branch,
        # which may differ from the branch the tree listing came from.
        params = {"ref": self.branch} if self.branch else None
        response = requests.get(base_url, headers=self.headers, params=params)
        response.raise_for_status()
        payload = response.json()
        # Anything other than a JSON object (the contents endpoint returns
        # an array for a directory) has no file content to decode.
        if not isinstance(payload, dict):
            return ""
        content_encoded = payload.get("content")
        if not content_encoded:
            return ""
        return base64.b64decode(content_encoded).decode("utf-8")

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per non-empty file in the branch.

        Yields:
            Documents whose ``page_content`` is the file text and whose
            metadata holds ``path``, ``sha`` and ``source``.
        """
        for file in self.get_file_paths():
            # Only blobs are files; other tree entries never yield content,
            # so skip them without spending an API request.
            if file["type"] != "blob":
                continue
            content = self.get_file_content_by_path(file["path"])
            if content == "":
                continue
            metadata = {
                "path": file["path"],
                "sha": file["sha"],
                "source": f"{self.github_api_url}/{self.repo}/{file['type']}/"
                f"{self.branch}/{file['path']}",
            }
            yield Document(page_content=content, metadata=metadata)