Source code for langchain_community.document_loaders.assemblyai

from __future__ import annotations

from enum import Enum
from pathlib import Path
from typing import TYPE_CHECKING, Iterator, Optional, Union

import requests
from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader

if TYPE_CHECKING:
    import assemblyai


class TranscriptFormat(Enum):
    """Transcript format to use for the document loader."""

    # One document containing the transcription text.
    TEXT = "text"

    # Multiple documents; the transcription is split by each sentence.
    SENTENCES = "sentences"

    # Multiple documents; the transcription is split by each paragraph.
    PARAGRAPHS = "paragraphs"

    # One document with the transcript exported in SRT subtitles format.
    SUBTITLES_SRT = "subtitles_srt"

    # One document with the transcript exported in VTT subtitles format.
    SUBTITLES_VTT = "subtitles_vtt"


class AssemblyAIAudioTranscriptLoader(BaseLoader):
    """Load AssemblyAI audio transcripts.

    It uses the AssemblyAI API to transcribe audio files and loads the
    transcribed text into one or more Documents, depending on the specified
    format.

    To use, you should have the ``assemblyai`` python package installed, and
    the environment variable ``ASSEMBLYAI_API_KEY`` set with your API key.
    Alternatively, the API key can also be passed as an argument.

    Audio files can be specified via a URL or a local file path.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        *,
        transcript_format: TranscriptFormat = TranscriptFormat.TEXT,
        config: Optional[assemblyai.TranscriptionConfig] = None,
        api_key: Optional[str] = None,
    ):
        """Initialize the AssemblyAI audio transcript loader.

        Args:
            file_path: A URL or a local file path. It is stored as a string.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
            config: Transcription options and features. If ``None`` is given,
                the Transcriber's default configuration will be used.
            api_key: AssemblyAI API key. When given, it is written to
                ``assemblyai.settings.api_key``.

        Raises:
            ImportError: If the ``assemblyai`` package cannot be imported.
        """
        # Imported here so the dependency is only needed when the loader is used.
        try:
            import assemblyai
        except ImportError:
            raise ImportError(
                "Could not import assemblyai python package. "
                "Please install it with `pip install assemblyai`."
            )

        # Set the key before building the Transcriber, as the original did.
        if api_key is not None:
            assemblyai.settings.api_key = api_key

        self.transcriber = assemblyai.Transcriber(config=config)
        self.transcript_format = transcript_format
        self.file_path = str(file_path)

    def lazy_load(self) -> Iterator[Document]:
        """Transcribe the audio file and load the transcript into documents.

        It uses the AssemblyAI API to transcribe the audio file and blocks
        until the transcription is finished.

        Yields:
            One Document for the ``TEXT``, ``SUBTITLES_SRT`` and
            ``SUBTITLES_VTT`` formats; one Document per sentence or paragraph
            for the ``SENTENCES`` and ``PARAGRAPHS`` formats.

        Raises:
            ValueError: If the transcript reports an error or the transcript
                format is not one of the known ``TranscriptFormat`` members.
        """
        # This will raise a ValueError if no API key is set.
        result = self.transcriber.transcribe(self.file_path)

        if result.error:
            raise ValueError(f"Could not transcribe file: {result.error}")

        fmt = self.transcript_format

        if fmt == TranscriptFormat.TEXT:
            # The full JSON response becomes the document metadata.
            yield Document(page_content=result.text, metadata=result.json_response)
            return

        if fmt == TranscriptFormat.SENTENCES:
            for sentence in result.get_sentences():
                # Everything except the text itself goes into the metadata.
                yield Document(
                    page_content=sentence.text,
                    metadata=sentence.dict(exclude={"text"}),
                )
            return

        if fmt == TranscriptFormat.PARAGRAPHS:
            for paragraph in result.get_paragraphs():
                yield Document(
                    page_content=paragraph.text,
                    metadata=paragraph.dict(exclude={"text"}),
                )
            return

        if fmt == TranscriptFormat.SUBTITLES_SRT:
            yield Document(page_content=result.export_subtitles_srt())
            return

        if fmt == TranscriptFormat.SUBTITLES_VTT:
            yield Document(page_content=result.export_subtitles_vtt())
            return

        raise ValueError("Unknown transcript format.")


class AssemblyAIAudioLoaderById(BaseLoader):
    """Load an existing AssemblyAI audio transcript by its ID.

    It uses the AssemblyAI API to fetch an existing transcript and loads the
    transcribed text into one or more Documents, depending on the specified
    format.
    """

    def __init__(
        self,
        transcript_id: str,
        api_key: str,
        transcript_format: TranscriptFormat,
        *,
        timeout: Optional[float] = None,
    ):
        """Initialize the AssemblyAI loader for an existing transcript.

        Args:
            transcript_id: ID of an existing transcription.
            api_key: AssemblyAI API key, sent as the ``authorization`` header.
            transcript_format: Transcript format to use.
                See class ``TranscriptFormat`` for more info.
            timeout: Optional timeout in seconds passed to ``requests.get``.
                The default ``None`` keeps the previous behaviour of waiting
                without a limit.
        """
        self.api_key = api_key
        self.transcript_id = transcript_id
        self.transcript_format = transcript_format
        self.timeout = timeout

    def _get(self, endpoint: str = "") -> requests.Response:
        """Send an authorized GET for this transcript and return the response.

        Args:
            endpoint: Path suffix appended after the transcript ID, e.g.
                ``"/paragraphs"``. Empty for the transcript itself.

        Returns:
            The response, after ``raise_for_status()`` has succeeded.

        Raises:
            Exception: Whatever ``requests.get`` or ``raise_for_status``
                raises; it is printed and then re-raised unchanged.
        """
        url = (
            f"https://api.assemblyai.com/v2/transcript/{self.transcript_id}"
            f"{endpoint}"
        )
        try:
            response = requests.get(
                url,
                headers={"authorization": self.api_key},
                timeout=self.timeout,
            )
            response.raise_for_status()
        except Exception as e:
            print(f"An error occurred: {e}")  # noqa: T201
            raise
        return response

    def lazy_load(self) -> Iterator[Document]:
        """Fetch the transcript and load it into Document objects.

        The HTTP request is made when the generator is first advanced.

        Yields:
            One Document for the ``TEXT``, ``SUBTITLES_SRT`` and
            ``SUBTITLES_VTT`` formats; one Document per entry of the
            ``"paragraphs"`` / ``"sentences"`` list in the response for the
            ``PARAGRAPHS`` and ``SENTENCES`` formats.

        Raises:
            ValueError: If the transcript format is not one of the known
                ``TranscriptFormat`` members.
        """
        fmt = self.transcript_format

        if fmt == TranscriptFormat.TEXT:
            # Parse the body once; it provides both the text and the metadata.
            payload = self._get().json()
            yield Document(page_content=payload["text"], metadata=payload)
        elif fmt == TranscriptFormat.PARAGRAPHS:
            for p in self._get("/paragraphs").json()["paragraphs"]:
                # Each entry is used as its own metadata, "text" included.
                yield Document(page_content=p["text"], metadata=p)
        elif fmt == TranscriptFormat.SENTENCES:
            for s in self._get("/sentences").json()["sentences"]:
                yield Document(page_content=s["text"], metadata=s)
        elif fmt == TranscriptFormat.SUBTITLES_SRT:
            # Subtitle endpoints are read as raw response text, not JSON.
            yield Document(page_content=self._get("/srt").text)
        elif fmt == TranscriptFormat.SUBTITLES_VTT:
            yield Document(page_content=self._get("/vtt").text)
        else:
            raise ValueError("Unknown transcript format.")