llama-index
45 строк · 1.4 Кб
1"""Simple Reader that reads transcript of youtube video."""
2
3from typing import Any, List4
5from llama_index.legacy.readers.base import BasePydanticReader6from llama_index.legacy.schema import Document7
8
def _extract_video_id(link: str) -> str:
    """Return the YouTube video id referenced by ``link``.

    Recognised forms:

    * watch URLs carrying the id in the ``v`` query parameter, regardless of
      parameter order or trailing parameters (``...watch?v=ID&t=10s``);
    * short links (``https://youtu.be/ID``);
    * ``/embed/ID``, ``/shorts/ID``, ``/live/ID`` and ``/v/ID`` paths on
      youtube.com / youtube-nocookie.com hosts.

    Anything else (for example a bare video id) falls back to taking the
    text after the last ``?v=``, so inputs without that marker are returned
    unchanged apart from surrounding whitespace being stripped.
    """
    # Function-scope import keeps this helper self-contained.
    from urllib.parse import parse_qs, urlparse

    link = link.strip()
    parsed = urlparse(link)

    # Standard watch URLs: take the first ``v`` query parameter.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0]

    host = parsed.hostname or ""
    segments = [segment for segment in parsed.path.split("/") if segment]

    if host == "youtu.be" or host.endswith(".youtu.be"):
        # Short links put the id in the first path segment.
        if segments:
            return segments[0]
    elif host.endswith(("youtube.com", "youtube-nocookie.com")):
        # Player-style paths put the id right after the path prefix.
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

    # Fallback: text after the last "?v=" (covers bare ids).
    return link.split("?v=")[-1]


class YoutubeTranscriptReader(BasePydanticReader):
    """Youtube Transcript reader.

    Fetches the transcript of each given YouTube video through the
    ``youtube_transcript_api`` package and wraps it in a ``Document``.
    """

    is_remote: bool = True
    # Forwarded as the ``languages`` argument of
    # ``YouTubeTranscriptApi.get_transcript``.
    languages: tuple = ("en",)

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "YoutubeTranscriptReader"

    def load_data(self, ytlinks: List[str], **load_kwargs: Any) -> List[Document]:
        """Load data from the input links.

        Args:
            ytlinks (List[str]): List of youtube links
                for which transcripts are to be read.
            **load_kwargs (Any): Accepted for interface compatibility;
                not used by this reader.

        Returns:
            List[Document]: One document per link, in input order. Its text
            is the transcript with one chunk per line and its ``id_`` is the
            video id extracted from the link.

        Raises:
            ImportError: If ``youtube_transcript_api`` is not installed.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError as err:
            # Adjacent literals instead of a backslash continuation, so no
            # source indentation leaks into the message.
            raise ImportError(
                "`youtube_transcript_api` package not found, "
                "please run `pip install youtube-transcript-api`"
            ) from err

        results = []
        for link in ytlinks:
            video_id = _extract_video_id(link)
            srt = YouTubeTranscriptApi.get_transcript(
                video_id, languages=self.languages
            )
            # Each chunk contributes its text followed by a newline.
            transcript = "".join(chunk["text"] + "\n" for chunk in srt)
            results.append(Document(text=transcript, id_=video_id))
        return results