Source code for langchain_community.document_loaders.hugging_face_model
from typing import Iterator, List, Optional
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
class HuggingFaceModelLoader(BaseLoader):
    """Load model information from the `Hugging Face Hub`, including README content.

    This loader talks to the Hugging Face Models API to fetch model metadata
    and then downloads each model's README file. The API lets you search and
    filter models by criteria such as tags, author, and more.

    API URL: https://huggingface.co/api/models
    DOC URL: https://huggingface.co/docs/hub/en/api

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import HuggingFaceModelLoader

            # Initialize the loader with search criteria
            loader = HuggingFaceModelLoader(search="bert", limit=10)

            # Load the models
            documents = loader.load()

            # Iterate over the fetched documents
            for doc in documents:
                print(doc.page_content)  # README content of the model
                print(doc.metadata)  # Metadata of the model
    """

    BASE_URL = "https://huggingface.co/api/models"
    README_BASE_URL = "https://huggingface.co/{model_id}/raw/main/README.md"
    # Seconds to wait on each HTTP request. Without a timeout, `requests`
    # blocks indefinitely on a stalled connection. Override on a subclass or
    # an instance to tune it.
    REQUEST_TIMEOUT = 30.0

    def __init__(
        self,
        *,
        search: Optional[str] = None,
        author: Optional[str] = None,
        filter: Optional[str] = None,
        sort: Optional[str] = None,
        direction: Optional[str] = None,
        limit: Optional[int] = 3,
        full: Optional[bool] = None,
        config: Optional[bool] = None,
    ):
        """Initialize the HuggingFaceModelLoader.

        All arguments are keyword-only and are stored as query parameters for
        the models API; arguments left as ``None`` are omitted from the
        request.

        Args:
            search: Filter based on substrings of repos and their usernames.
            author: Filter models by an author or organization.
            filter: Filter based on tags.
            sort: Property to use when sorting.
            direction: Direction in which to sort.
            limit: Limit the number of models fetched.
            full: Whether to fetch most model data.
            config: Whether to also fetch the repo config.
        """
        self.params = {
            "search": search,
            "author": author,
            "filter": filter,
            "sort": sort,
            "direction": direction,
            "limit": limit,
            "full": full,
            "config": config,
        }

    def fetch_models(self) -> List[dict]:
        """Fetch model information from the Hugging Face Hub.

        Returns:
            The decoded JSON body of the models API response.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            requests.RequestException: On connection failures or timeouts.
        """
        # Only forward the parameters the caller actually set.
        query = {key: value for key, value in self.params.items() if value is not None}
        response = requests.get(
            self.BASE_URL,
            params=query,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def fetch_readme_content(self, model_id: str) -> str:
        """Fetch the README content for a given model.

        This is best-effort: any request failure (including a timeout or an
        error status) yields a placeholder string instead of raising.

        Args:
            model_id: Repository identifier substituted into
                ``README_BASE_URL``.

        Returns:
            The README text, or a placeholder message when it cannot be
            retrieved.
        """
        unavailable = "README not available for this model."
        if not model_id:
            # An empty id would format into a malformed URL; skip the
            # pointless round trip and report the README as unavailable.
            return unavailable
        readme_url = self.README_BASE_URL.format(model_id=model_id)
        try:
            response = requests.get(readme_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException:
            return unavailable
        return response.text

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load model information, including README content.

        Yields:
            One ``Document`` per model returned by ``fetch_models``, with the
            README text as ``page_content`` and the raw model entry as
            ``metadata``.
        """
        for model in self.fetch_models():
            # Prefer "modelId"; fall back to "id" for entries that omit it.
            # NOTE(review): assumes the API exposes the repo name under "id"
            # as well - confirm against the Hub API documentation.
            model_id = model.get("modelId") or model.get("id") or ""
            yield Document(
                page_content=self.fetch_readme_content(model_id),
                metadata=model,
            )