Source code for langchain_community.document_loaders.assemblyai
from __future__ import annotations
from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional, Union
import requests
from langchain_core.documents import Document
from langchain_community.document_loaders.base import BaseLoader
if TYPE_CHECKING:
import assemblyai
class AssemblyAIAudioTranscriptLoader(BaseLoader):
    """Load AssemblyAI audio transcripts.

    It uses the AssemblyAI API to transcribe audio files and loads the
    transcribed text into one or more Documents, depending on the specified
    format.

    To use, you should have the ``assemblyai`` python package installed, and
    the environment variable ``ASSEMBLYAI_API_KEY`` set with your API key.
    Alternatively, the API key can also be passed as an argument.

    Audio files can be specified via a URL or a local file path.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
        config: Optional[assemblyai.TranscriptionConfig] = None,
        api_key: Optional[str] = None,
    ):
        """Initialize the AssemblyAI AudioTranscriptLoader.

        Args:
            file_path: A URL or a local file path.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
            config: Transcription options and features. If ``None`` is given,
                the Transcriber's default configuration will be used.
            api_key: AssemblyAI API key.

        Raises:
            ImportError: If the ``assemblyai`` package cannot be imported.
        """
        try:
            import assemblyai
        except ImportError as e:
            # Chain the original error so the underlying import failure stays
            # visible as the explicit cause in the traceback.
            raise ImportError(
                "Could not import assemblyai python package. "
                "Please install it with `pip install assemblyai`."
            ) from e

        # Only override the SDK-wide key when one was passed explicitly.
        if api_key is not None:
            assemblyai.settings.api_key = api_key

        # Stored as a plain string whether a str or a Path was supplied.
        self.file_path = str(file_path)
        self.transcript_format = transcript_format
        self.transcriber = assemblyai.Transcriber(config=config)

    def lazy_load(self) -> Iterator[Document]:
        """Transcribe the audio file and load the transcript into documents.

        It uses the AssemblyAI API to transcribe the audio file and blocks
        until the transcription is finished. Because this is a generator, the
        transcription is only started once iteration begins.

        Yields:
            One Document for the ``TEXT``, ``SUBTITLES_SRT`` and
            ``SUBTITLES_VTT`` formats; one Document per sentence or paragraph
            for the ``SENTENCES`` and ``PARAGRAPHS`` formats.

        Raises:
            ValueError: If the transcript reports an error, or if the
                configured transcript format is not recognized.
        """
        # Per the original note: this will raise a ValueError if no API key
        # is set.
        transcript = self.transcriber.transcribe(self.file_path)

        if transcript.error:
            raise ValueError(f"Could not transcribe file: {transcript.error}")

        if self.transcript_format == TranscriptFormat.TEXT:
            # The whole transcript in one Document, with the raw JSON response
            # attached as metadata.
            yield Document(
                page_content=transcript.text, metadata=transcript.json_response
            )
        elif self.transcript_format == TranscriptFormat.SENTENCES:
            for s in transcript.get_sentences():
                # The text becomes the page content, so it is left out of the
                # metadata.
                yield Document(
                    page_content=s.text, metadata=s.dict(exclude={"text"})
                )
        elif self.transcript_format == TranscriptFormat.PARAGRAPHS:
            for p in transcript.get_paragraphs():
                yield Document(
                    page_content=p.text, metadata=p.dict(exclude={"text"})
                )
        elif self.transcript_format == TranscriptFormat.SUBTITLES_SRT:
            yield Document(page_content=transcript.export_subtitles_srt())
        elif self.transcript_format == TranscriptFormat.SUBTITLES_VTT:
            yield Document(page_content=transcript.export_subtitles_vtt())
        else:
            raise ValueError("Unknown transcript format.")
class AssemblyAIAudioLoaderById(BaseLoader):
    """Load AssemblyAI audio transcripts by transcript ID.

    It uses the AssemblyAI API to fetch an existing transcription and loads
    the transcribed text into one or more Documents, depending on the
    specified format.
    """

    def __init__(
        self, transcript_id: str, api_key: str, transcript_format: TranscriptFormat
    ):
        """Initialize the AssemblyAI AssemblyAIAudioLoaderById.

        Args:
            transcript_id: ID of an existing transcription.
            api_key: AssemblyAI API key.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
        """
        self.api_key = api_key
        self.transcript_id = transcript_id
        self.transcript_format = transcript_format

    def _get(self, suffix: str = "") -> requests.Response:
        """Fetch one resource of the stored transcript and return the response.

        Args:
            suffix: Path appended after the transcript ID, e.g. ``"/srt"``.
                Empty for the transcript itself.

        Returns:
            The response, after ``raise_for_status()`` has passed.

        Raises:
            Exception: Whatever the request or the status check raises; the
                error is printed and then re-raised unchanged.
        """
        url = (
            "https://api.assemblyai.com/v2/transcript/"
            f"{self.transcript_id}{suffix}"
        )
        try:
            # NOTE(review): no timeout is passed, so a stalled connection may
            # block indefinitely -- confirm whether a timeout is wanted here.
            response = requests.get(url, headers={"authorization": self.api_key})
            response.raise_for_status()
        except Exception as e:
            print(f"An error occurred: {e}")  # noqa: T201
            raise
        return response

    def lazy_load(self) -> Iterator[Document]:
        """Load data into Document objects.

        Because this is a generator, the HTTP request is only sent once
        iteration begins.

        Yields:
            One Document for the ``TEXT``, ``SUBTITLES_SRT`` and
            ``SUBTITLES_VTT`` formats; one Document per paragraph or sentence
            for the ``PARAGRAPHS`` and ``SENTENCES`` formats.

        Raises:
            ValueError: If the configured transcript format is not recognized.
            Exception: Any error raised while requesting the transcript.
        """
        fmt = self.transcript_format

        if fmt == TranscriptFormat.TEXT:
            # Parse the body once and reuse it for both content and metadata.
            payload = self._get().json()
            yield Document(page_content=payload["text"], metadata=payload)
        elif fmt == TranscriptFormat.PARAGRAPHS:
            for p in self._get("/paragraphs").json()["paragraphs"]:
                yield Document(page_content=p["text"], metadata=p)
        elif fmt == TranscriptFormat.SENTENCES:
            for s in self._get("/sentences").json()["sentences"]:
                yield Document(page_content=s["text"], metadata=s)
        elif fmt == TranscriptFormat.SUBTITLES_SRT:
            # Subtitle endpoints are read as plain text, not JSON.
            yield Document(page_content=self._get("/srt").text)
        elif fmt == TranscriptFormat.SUBTITLES_VTT:
            yield Document(page_content=self._get("/vtt").text)
        else:
            raise ValueError("Unknown transcript format.")