Source code for langchain_community.document_loaders.hugging_face_model

from typing import Iterator, List, Optional

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class HuggingFaceModelLoader(BaseLoader):
    """Load model information from the `Hugging Face Hub`, including README content.

    This loader talks to the Hugging Face Models API to fetch model metadata
    and README files. The API lets you search and filter models by criteria
    such as model tags, authors, and more.

    API URL: https://huggingface.co/api/models
    DOC URL: https://huggingface.co/docs/hub/en/api

    Example:
        .. code-block:: python

            from langchain_community.document_loaders import HuggingFaceModelLoader

            # Initialize the loader with search criteria
            loader = HuggingFaceModelLoader(search="bert", limit=10)

            # Load the models
            documents = loader.load()

            # Iterate over the fetched documents
            for doc in documents:
                print(doc.page_content)  # README content of the model
                print(doc.metadata)  # Metadata of the model
    """

    BASE_URL = "https://huggingface.co/api/models"
    README_BASE_URL = "https://huggingface.co/{model_id}/raw/main/README.md"

    # Class-level default so instances created without running __init__
    # (e.g. via __new__) still issue requests with a bounded wait.
    timeout: Optional[float] = 10.0

    def __init__(
        self,
        *,
        search: Optional[str] = None,
        author: Optional[str] = None,
        filter: Optional[str] = None,
        sort: Optional[str] = None,
        direction: Optional[str] = None,
        limit: Optional[int] = 3,
        full: Optional[bool] = None,
        config: Optional[bool] = None,
        timeout: Optional[float] = 10.0,
    ):
        """Initialize the HuggingFaceModelLoader.

        Args:
            search: Filter based on substrings for repos and their usernames.
            author: Filter models by an author or organization.
            filter: Filter based on tags.
            sort: Property to use when sorting.
            direction: Direction in which to sort.
            limit: Limit the number of models fetched.
            full: Whether to fetch most model data.
            config: Whether to also fetch the repo config.
            timeout: Seconds to wait on each HTTP request before giving up.
                Pass ``None`` to wait indefinitely.
        """
        # Stored verbatim; entries whose value is None are dropped when the
        # query string is built in fetch_models().
        self.params = {
            "search": search,
            "author": author,
            "filter": filter,
            "sort": sort,
            "direction": direction,
            "limit": limit,
            "full": full,
            "config": config,
        }
        self.timeout = timeout

    def fetch_models(self) -> List[dict]:
        """Fetch model information from the Hugging Face Hub.

        Returns:
            The decoded JSON body of the models endpoint response.

        Raises:
            requests.HTTPError: If the API answers with an error status.
            requests.RequestException: On connection failures or timeouts.
        """
        # Only send the parameters the caller actually set.
        query = {key: value for key, value in self.params.items() if value is not None}
        response = requests.get(self.BASE_URL, params=query, timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def fetch_readme_content(self, model_id: str) -> str:
        """Fetch the README content for a given model.

        Args:
            model_id: Identifier substituted into ``README_BASE_URL``.

        Returns:
            The README text, or a fixed placeholder message when the request
            fails for any reason (missing file, network error, timeout).
        """
        readme_url = self.README_BASE_URL.format(model_id=model_id)
        try:
            response = requests.get(readme_url, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException:
            # Deliberately best-effort: a missing README must not abort the
            # whole load, so the failure is reported in-band.
            return "README not available for this model."
        return response.text

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load model information, including README content.

        Yields:
            One ``Document`` per model returned by ``fetch_models``: the
            README text as ``page_content`` and the raw model entry as
            ``metadata``.
        """
        for model in self.fetch_models():
            # Fall back to "id" when "modelId" is absent or empty.
            model_id = model.get("modelId") or model.get("id") or ""
            if model_id:
                readme_content = self.fetch_readme_content(model_id)
            else:
                # Without an identifier the README URL would be malformed;
                # skip the request and use the same placeholder text.
                readme_content = "README not available for this model."
            yield Document(
                page_content=readme_content,
                metadata=model,
            )