Skip to content

Bilibili

BilibiliTranscriptReader #

Bases: BaseReader

哔哩哔哩(Bilibili)字幕和视频信息阅读器。

Source code in llama_index/readers/bilibili/base.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
class BilibiliTranscriptReader(BaseReader):
    """Reader for Bilibili video subtitles and basic video information."""

    @staticmethod
    def get_bilibili_info_and_subs(bili_url, timeout=10.0):
        """Fetch the title, description and first subtitle track of a video.

        Args:
            bili_url: A Bilibili video URL (or any string) containing a
                ``BV...`` video id.
            timeout: Seconds to wait for the subtitle download before
                ``requests`` gives up. Defaults to 10 seconds.

        Returns:
            A string of the form
            ``"Video Title: <title>, description: <desc>\\nTranscript: <text>"``
            when the video has at least one subtitle track, otherwise an
            empty string (a warning is emitted in that case).

        Raises:
            ValueError: If no ``BV`` id can be found in ``bili_url``.
            requests.RequestException: If the subtitle download fails, times
                out, or returns an HTTP error status.
        """
        import json
        import re

        import requests
        from bilibili_api import sync, video

        bvid_match = re.search(r"BV\w+", bili_url)
        if bvid_match is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No BV id found in Bilibili URL: {bili_url}")

        # Build the video handle (no credential is supplied).
        v = video.Video(bvid=bvid_match.group())
        # get_info() is a coroutine; sync() runs it to completion.
        video_info = sync(v.get_info())
        title = video_info["title"]
        desc = video_info["desc"]

        sub_list = video_info["subtitle"]["list"]
        if not sub_list:
            # NOTE: `warnings` is expected to be imported at module level.
            warnings.warn(
                f"No subtitles found for video: {bili_url}. Return Empty transcript."
            )
            return ""

        # Only the first subtitle track is used.
        sub_url = sub_list[0]["subtitle_url"]
        if sub_url.startswith("//"):
            # requests cannot fetch a protocol-relative URL.
            # NOTE(review): assumes such URLs are reachable over HTTPS - confirm.
            sub_url = f"https:{sub_url}"

        # A timeout keeps one stalled download from hanging the whole load.
        result = requests.get(sub_url, timeout=timeout)
        # Surface HTTP errors here rather than as a confusing JSON/KeyError.
        result.raise_for_status()
        raw_sub_titles = json.loads(result.content)["body"]
        raw_transcript = " ".join(c["content"] for c in raw_sub_titles)
        # Prefix the transcript with the basic video info.
        return (
            f"Video Title: {title}, description: {desc}\nTranscript:"
            f" {raw_transcript}"
        )

    def load_data(self, video_urls: List[str], **load_kwargs: Any) -> List[Document]:
        """Load subtitle transcripts for a list of Bilibili videos.

        Args:
            video_urls (List[str]): Bilibili video links to read subtitles
                from.
            **load_kwargs: Accepted but not used by this method.

        Returns:
            List[Document]: One Document per video that loaded without an
            error; its text is the string returned by
            ``get_bilibili_info_and_subs``. Videos that raise are skipped
            with a warning.
        """
        results = []
        for bili_url in video_urls:
            try:
                transcript = self.get_bilibili_info_and_subs(bili_url)
                results.append(Document(text=transcript))
            except Exception as e:
                # Best-effort: one bad video must not abort the whole batch.
                warnings.warn(
                    f"Error loading transcript for video {bili_url}: {e!s}. Skipping"
                    " video."
                )
        return results

load_data #

load_data(
    video_urls: List[str], **load_kwargs: Any
) -> List[Document]

从Bilibili加载自动生成的视频字幕,包括附加的元数据。

Returns:

| Type | Description |
| --- | --- |
| List[Document] | 包含每个Bilibili视频字幕的Document对象列表。 |

Source code in llama_index/readers/bilibili/base.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
    def load_data(self, video_urls: List[str], **load_kwargs: Any) -> List[Document]:
        """Load subtitle transcripts for a list of Bilibili videos.

        Args:
            video_urls (List[str]): Bilibili video links to read subtitles
                from.
            **load_kwargs: Accepted but not used by this method.

        Returns:
            List[Document]: One Document per URL that was processed without
            raising; its text is whatever ``get_bilibili_info_and_subs``
            returned for that URL. URLs that raise are skipped with a
            warning.
        """
        documents = []
        for url in video_urls:
            try:
                text = self.get_bilibili_info_and_subs(url)
                documents.append(Document(text=text))
            except Exception as err:
                # Best-effort: report the failure and move on to the next URL.
                message = (
                    f"Error loading transcript for video {url}: {err!s}. "
                    "Skipping video."
                )
                warnings.warn(message)
        return documents