Bases: BasePydanticReader
Youtube字幕阅读器。
Source code in llama_index/readers/youtube_transcript/base.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
class YoutubeTranscriptReader(BasePydanticReader):
    """Reader that turns YouTube video transcripts into documents."""

    # NOTE(review): presumably marks this reader as fetching from a remote
    # source; the exact meaning is defined by BasePydanticReader - confirm.
    is_remote: bool = True

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifier of this class."""
        return "YoutubeTranscriptReader"

    def load_data(
        self,
        ytlinks: List[str],
        languages: Optional[List[str]] = ["en"],
        **load_kwargs: Any,
    ) -> List[Document]:
        """Load the transcripts of the given YouTube videos.

        Args:
            ytlinks (List[str]): YouTube links whose transcripts should be
                read.
            languages (Optional[List[str]]): Language codes forwarded to
                ``YouTubeTranscriptApi.get_transcript``. Defaults to
                ``["en"]``; ``None`` is treated like the default.
            **load_kwargs (Any): Accepted for interface compatibility; not
                used by this method.

        Returns:
            List[Document]: One document per link, in input order. The text
            is the transcript chunks joined with newlines, and the video id
            is used both as the document id and in ``extra_info``.

        Raises:
            ValueError: If no video id can be extracted from a link.
        """
        # Hand the API a private copy so the shared default list can never be
        # mutated through it, and fall back to English when None is passed.
        preferred_languages = ["en"] if languages is None else list(languages)

        results = []
        for link in ytlinks:
            video_id = self._extract_video_id(link)
            if not video_id:
                # Only the first piece is an f-string; "{video_id}" in the
                # remaining pieces is a literal placeholder shown to the user.
                raise ValueError(
                    f"Supplied url {link} is not a supported youtube URL. "
                    "Supported formats include:\n"
                    "  youtube.com/watch?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtube.com/embed?v={video_id} "
                    "(with or without 'www.')\n"
                    "  youtu.be/{video_id} (never includes www subdomain)"
                )
            transcript_chunks = YouTubeTranscriptApi.get_transcript(
                video_id, languages=preferred_languages
            )
            transcript = "\n".join(chunk["text"] for chunk in transcript_chunks)
            results.append(
                Document(
                    text=transcript,
                    id_=video_id,
                    extra_info={"video_id": video_id},
                )
            )
        return results

    @staticmethod
    def _extract_video_id(yt_link: str) -> Optional[str]:
        """Return the video id captured from ``yt_link``, or None.

        Each pattern in ``YOUTUBE_URL_PATTERNS`` is tried in order; the first
        capture group of the first matching pattern is returned.
        """
        for pattern in YOUTUBE_URL_PATTERNS:
            match = re.search(pattern, yt_link)
            if match:
                return match.group(1)

        # No pattern matched: the link is not in a supported format.
        return None
|
class_name
classmethod
获取类的名称标识符。
Source code in llama_index/readers/youtube_transcript/base.py
@classmethod
def class_name(cls) -> str:
    """Return the string identifier under which this reader class is known."""
    identifier = "YoutubeTranscriptReader"
    return identifier
|
load_data
load_data(
ytlinks: List[str],
languages: Optional[List[str]] = ["en"],
**load_kwargs: Any
) -> List[Document]
从给定的 YouTube 链接加载字幕数据。
Source code in llama_index/readers/youtube_transcript/base.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def load_data(
    self,
    ytlinks: List[str],
    languages: Optional[List[str]] = ["en"],
    **load_kwargs: Any,
) -> List[Document]:
    """Load the transcripts of the given YouTube videos.

    Args:
        ytlinks (List[str]): YouTube links whose transcripts should be read.
        languages (Optional[List[str]]): Language codes forwarded to
            ``YouTubeTranscriptApi.get_transcript``. Defaults to ``["en"]``;
            ``None`` is treated like the default.
        **load_kwargs (Any): Accepted for interface compatibility; not used
            by this method.

    Returns:
        List[Document]: One document per link, in input order. The text is
        the transcript chunks joined with newlines, and the video id is used
        both as the document id and in ``extra_info``.

    Raises:
        ValueError: If no video id can be extracted from a link.
    """
    # Hand the API a private copy so the shared default list can never be
    # mutated through it, and fall back to English when None is passed.
    preferred_languages = ["en"] if languages is None else list(languages)

    results = []
    for link in ytlinks:
        video_id = self._extract_video_id(link)
        if not video_id:
            # Only the first piece is an f-string; "{video_id}" in the
            # remaining pieces is a literal placeholder shown to the user.
            raise ValueError(
                f"Supplied url {link} is not a supported youtube URL. "
                "Supported formats include:\n"
                "  youtube.com/watch?v={video_id} "
                "(with or without 'www.')\n"
                "  youtube.com/embed?v={video_id} "
                "(with or without 'www.')\n"
                "  youtu.be/{video_id} (never includes www subdomain)"
            )
        transcript_chunks = YouTubeTranscriptApi.get_transcript(
            video_id, languages=preferred_languages
        )
        transcript = "\n".join(chunk["text"] for chunk in transcript_chunks)
        results.append(
            Document(
                text=transcript,
                id_=video_id,
                extra_info={"video_id": video_id},
            )
        )
    return results
|