Source code for langchain_community.document_loaders.assemblyai

from __future__ import annotations

from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional, Union

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

if TYPE_CHECKING:
    import assemblyai


class TranscriptFormat(Enum):
    """Transcript format to use for the document loader."""

    TEXT = "text"
    """One document with the transcription text."""

    SENTENCES = "sentences"
    """Multiple documents, splitting the transcription by each sentence."""

    PARAGRAPHS = "paragraphs"
    """Multiple documents, splitting the transcription by each paragraph."""

    SUBTITLES_SRT = "subtitles_srt"
    """One document with the transcript exported in SRT subtitles format."""

    SUBTITLES_VTT = "subtitles_vtt"
    """One document with the transcript exported in VTT subtitles format."""
class AssemblyAIAudioTranscriptLoader(BaseLoader):
    """Load AssemblyAI audio transcripts.

    It uses the AssemblyAI API to transcribe audio files and loads the
    transcribed text into one or more Documents, depending on the specified
    format.

    To use, you should have the ``assemblyai`` python package installed, and
    the environment variable ``ASSEMBLYAI_API_KEY`` set with your API key.
    Alternatively, the API key can also be passed as an argument.

    Audio files can be specified via a URL or a local file path.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
        config: Optional[assemblyai.TranscriptionConfig] = None,
        api_key: Optional[str] = None,
    ):
        """Initialize the AssemblyAI AudioTranscriptLoader.

        Args:
            file_path: A URL or a local file path.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
            config: Transcription options and features. If ``None`` is given,
                the Transcriber's default configuration will be used.
            api_key: AssemblyAI API key.

        Raises:
            ImportError: If the ``assemblyai`` package is not installed.
        """
        try:
            import assemblyai
        except ImportError as e:
            # Chain the original error so the root cause stays visible in the
            # traceback instead of being replaced by the install hint.
            raise ImportError(
                "Could not import assemblyai python package. "
                "Please install it with `pip install assemblyai`."
            ) from e

        # Only override the package-wide API key when one was passed explicitly.
        if api_key is not None:
            assemblyai.settings.api_key = api_key

        # Stored as a string; a ``Path`` argument is converted here.
        self.file_path = str(file_path)
        self.transcript_format = transcript_format
        self.transcriber = assemblyai.Transcriber(config=config)

    def lazy_load(self) -> Iterator[Document]:
        """Transcribe the audio file and load the transcript into documents.

        It uses the AssemblyAI API to transcribe the audio file and blocks
        until the transcription is finished.

        Yields:
            One ``Document`` for the ``TEXT``, ``SUBTITLES_SRT`` and
            ``SUBTITLES_VTT`` formats; one ``Document`` per sentence or
            paragraph for ``SENTENCES`` and ``PARAGRAPHS``.

        Raises:
            ValueError: If the transcript reports an error, or if
                ``transcript_format`` is not a known ``TranscriptFormat``.
        """
        transcript = self.transcriber.transcribe(self.file_path)
        # This will raise a ValueError if no API key is set.
        if transcript.error:
            raise ValueError(f"Could not transcribe file: {transcript.error}")

        if self.transcript_format == TranscriptFormat.TEXT:
            yield Document(
                page_content=transcript.text, metadata=transcript.json_response
            )
        elif self.transcript_format == TranscriptFormat.SENTENCES:
            sentences = transcript.get_sentences()
            for s in sentences:
                # The text becomes the page content, so it is left out of the
                # metadata.
                yield Document(page_content=s.text, metadata=s.dict(exclude={"text"}))
        elif self.transcript_format == TranscriptFormat.PARAGRAPHS:
            paragraphs = transcript.get_paragraphs()
            for p in paragraphs:
                yield Document(page_content=p.text, metadata=p.dict(exclude={"text"}))
        elif self.transcript_format == TranscriptFormat.SUBTITLES_SRT:
            yield Document(page_content=transcript.export_subtitles_srt())
        elif self.transcript_format == TranscriptFormat.SUBTITLES_VTT:
            yield Document(page_content=transcript.export_subtitles_vtt())
        else:
            raise ValueError("Unknown transcript format.")
class AssemblyAIAudioLoaderById(BaseLoader):
    """Load AssemblyAI audio transcripts.

    It uses the AssemblyAI API to get an existing transcription and loads the
    transcribed text into one or more Documents, depending on the specified
    format.
    """

    def __init__(
        self, transcript_id: str, api_key: str, transcript_format: TranscriptFormat
    ):
        """Initialize the AssemblyAI AssemblyAIAudioLoaderById.

        Args:
            transcript_id: Id of an existing transcription.
            api_key: AssemblyAI API key.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
        """
        self.api_key = api_key
        self.transcript_id = transcript_id
        self.transcript_format = transcript_format

    def _get(self, suffix: str = "") -> requests.Response:
        """Send a GET request for this transcript and return the response.

        Args:
            suffix: Path appended after the transcript id, e.g. ``"/srt"``.
                An empty string requests the transcript itself.

        Returns:
            The response, after ``raise_for_status()`` has passed.

        Raises:
            Exception: Whatever ``requests.get`` or ``raise_for_status``
                raises; the error is printed and then re-raised unchanged.
        """
        url = f"https://api.assemblyai.com/v2/transcript/{self.transcript_id}{suffix}"
        try:
            response = requests.get(url, headers={"authorization": self.api_key})
            response.raise_for_status()
        except Exception as e:
            print(f"An error occurred: {e}")  # noqa: T201
            raise
        return response

    def lazy_load(self) -> Iterator[Document]:
        """Load data into Document objects.

        Yields:
            One ``Document`` for the ``TEXT``, ``SUBTITLES_SRT`` and
            ``SUBTITLES_VTT`` formats; one ``Document`` per paragraph or
            sentence for ``PARAGRAPHS`` and ``SENTENCES``.

        Raises:
            ValueError: If ``transcript_format`` is not a known
                ``TranscriptFormat``. No request is sent in that case.
        """
        if self.transcript_format == TranscriptFormat.TEXT:
            # Parse the body once and reuse it for both content and metadata.
            payload = self._get().json()
            yield Document(page_content=payload["text"], metadata=payload)
        elif self.transcript_format == TranscriptFormat.PARAGRAPHS:
            paragraphs = self._get("/paragraphs").json()["paragraphs"]
            for p in paragraphs:
                # The whole entry, including its text, is kept as metadata.
                yield Document(page_content=p["text"], metadata=p)
        elif self.transcript_format == TranscriptFormat.SENTENCES:
            sentences = self._get("/sentences").json()["sentences"]
            for s in sentences:
                yield Document(page_content=s["text"], metadata=s)
        elif self.transcript_format == TranscriptFormat.SUBTITLES_SRT:
            # Subtitle responses are used as raw text, not parsed as JSON.
            yield Document(page_content=self._get("/srt").text)
        elif self.transcript_format == TranscriptFormat.SUBTITLES_VTT:
            yield Document(page_content=self._get("/vtt").text)
        else:
            raise ValueError("Unknown transcript format.")