Skip to content

Youtube transcript

YoutubeTranscriptReader #

Bases: BasePydanticReader

Youtube字幕阅读器。

Source code in llama_index/readers/youtube_transcript/base.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class YoutubeTranscriptReader(BasePydanticReader):
    """Reader that turns YouTube video transcripts into ``Document`` objects."""

    is_remote: bool = True

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifier of this class."""
        return "YoutubeTranscriptReader"

    def load_data(
        self,
        ytlinks: List[str],
        languages: Optional[List[str]] = ["en"],
        **load_kwargs: Any,
    ) -> List[Document]:
        """Load the transcript of each YouTube link as a ``Document``.

        Args:
            ytlinks: YouTube links whose transcripts should be read.
            languages: Language codes passed to
                ``YouTubeTranscriptApi.get_transcript``. ``None`` falls back
                to ``["en"]``, the same value as the default.
            **load_kwargs: Accepted but not used by this method.

        Returns:
            One ``Document`` per link, in input order. Its text is the
            transcript chunks joined with newlines, and both its id and its
            ``video_id`` metadata entry are the extracted video id.

        Raises:
            ValueError: If no video id can be extracted from a link.
        """
        # The annotation admits None; substitute the default instead of
        # forwarding None to the transcript API.
        if languages is None:
            languages = ["en"]

        results = []
        for link in ytlinks:
            video_id = self._extract_video_id(link)
            if not video_id:
                # Only the first fragment is an f-string; in the others
                # "{video_id}" is a literal placeholder shown to the user.
                # NOTE(review): confirm the listed formats against
                # YOUTUBE_URL_PATTERNS, which is defined outside this class.
                raise ValueError(
                    f"Supplied url {link} is not a supported youtube URL.\n"
                    "Supported formats include:\n"
                    "  youtube.com/watch?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtube.com/embed?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtu.be/{video_id} (never includes www subdomain)"
                )
            transcript_chunks = YouTubeTranscriptApi.get_transcript(
                video_id, languages=languages
            )
            transcript = "\n".join(chunk["text"] for chunk in transcript_chunks)
            results.append(
                Document(
                    text=transcript, id_=video_id, extra_info={"video_id": video_id}
                )
            )
        return results

    @staticmethod
    def _extract_video_id(yt_link: str) -> Optional[str]:
        """Return the video id captured from ``yt_link``, or ``None``.

        Each pattern in ``YOUTUBE_URL_PATTERNS`` is tried in order; the first
        capture group of the first pattern that matches is returned.
        """
        for pattern in YOUTUBE_URL_PATTERNS:
            match = re.search(pattern, yt_link)
            if match:
                return match.group(1)

        # No pattern matched.
        return None

class_name classmethod #

class_name() -> str

获取类的名称标识符。

Source code in llama_index/readers/youtube_transcript/base.py
17
18
19
20
@classmethod
def class_name(cls) -> str:
    """Return the string identifier used as this class's name."""
    name = "YoutubeTranscriptReader"
    return name

load_data #

load_data(
    ytlinks: List[str],
    languages: Optional[List[str]] = ["en"],
    **load_kwargs: Any
) -> List[Document]

从给定的 YouTube 链接加载视频字幕，每个视频生成一个 Document。

Source code in llama_index/readers/youtube_transcript/base.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
    def load_data(
        self,
        ytlinks: List[str],
        languages: Optional[List[str]] = ["en"],
        **load_kwargs: Any,
    ) -> List[Document]:
        """Load the transcript of each YouTube link as a ``Document``.

        Args:
            ytlinks: YouTube links whose transcripts should be read.
            languages: Language codes passed to
                ``YouTubeTranscriptApi.get_transcript``. ``None`` falls back
                to ``["en"]``, the same value as the default.
            **load_kwargs: Accepted but not used by this method.

        Returns:
            One ``Document`` per link, in input order. Its text is the
            transcript chunks joined with newlines, and both its id and its
            ``video_id`` metadata entry are the extracted video id.

        Raises:
            ValueError: If no video id can be extracted from a link.
        """
        # The annotation admits None; substitute the default instead of
        # forwarding None to the transcript API.
        if languages is None:
            languages = ["en"]

        results = []
        for link in ytlinks:
            video_id = self._extract_video_id(link)
            if not video_id:
                # Only the first fragment is an f-string; in the others
                # "{video_id}" is a literal placeholder shown to the user.
                # NOTE(review): confirm the listed formats against the URL
                # patterns used by _extract_video_id (not visible here).
                raise ValueError(
                    f"Supplied url {link} is not a supported youtube URL.\n"
                    "Supported formats include:\n"
                    "  youtube.com/watch?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtube.com/embed?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtu.be/{video_id} (never includes www subdomain)"
                )
            transcript_chunks = YouTubeTranscriptApi.get_transcript(
                video_id, languages=languages
            )
            transcript = "\n".join(chunk["text"] for chunk in transcript_chunks)
            results.append(
                Document(
                    text=transcript, id_=video_id, extra_info={"video_id": video_id}
                )
            )
        return results