llama-index

youtube_transcript.py
45 строк · 1.4 Кб
Перенос по словам
1
"""Simple Reader that reads transcript of youtube video."""
2

3
from typing import Any, List
4

5
from llama_index.legacy.readers.base import BasePydanticReader
6
from llama_index.legacy.schema import Document
7

8

9
class YoutubeTranscriptReader(BasePydanticReader):
    """Reader that loads the transcripts of YouTube videos as documents."""

    # Declares this reader as a remote one.
    is_remote: bool = True
    # Preferred transcript languages; forwarded to the transcript API as
    # its ``languages`` argument.
    languages: tuple = ("en",)

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "YoutubeTranscriptReader"

    @staticmethod
    def _extract_video_id(link: str) -> str:
        """Return the video id contained in a YouTube link.

        Recognised forms are a ``v`` query parameter (in any position, with
        or without further parameters), ``youtu.be/<id>`` short links and
        ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>`` paths.
        Anything else falls back to the text after the last ``?v=``, so a
        bare video id is returned unchanged.

        Args:
            link (str): A YouTube link or a bare video id.

        Returns:
            str: The extracted video id.
        """
        # Imported locally so the helper does not rely on module-level imports.
        from urllib.parse import parse_qs, urlparse

        # Without a scheme, urlparse would treat the host as part of the path.
        parsed = urlparse(link if "://" in link else "//" + link)

        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            return query_ids[0]

        segments = [part for part in parsed.path.split("/") if part]
        host = (parsed.hostname or "").lower()
        if host in ("youtu.be", "www.youtu.be") and segments:
            return segments[0]
        if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            return segments[1]

        # Historical behaviour: keeps bare ids and unrecognised input working.
        return link.split("?v=")[-1]

    def load_data(self, ytlinks: List[str], **load_kwargs: Any) -> List[Document]:
        """Load transcripts for the given YouTube links.

        Args:
            ytlinks (List[str]): List of youtube links
                for which transcripts are to be read.
            **load_kwargs (Any): Accepted but not used by this method.

        Returns:
            List[Document]: One document per link, in input order. The text
                is the transcript chunks joined with newlines and the
                document id is the video id.

        Raises:
            ImportError: If ``youtube_transcript_api`` is not installed.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError:
            raise ImportError(
                "`youtube_transcript_api` package not found, "
                "please run `pip install youtube-transcript-api`"
            )

        results = []
        for link in ytlinks:
            video_id = self._extract_video_id(link)
            srt = YouTubeTranscriptApi.get_transcript(
                video_id, languages=self.languages
            )
            # Each chunk contributes its text followed by a newline.
            transcript = "".join(chunk["text"] + "\n" for chunk in srt)
            results.append(Document(text=transcript, id_=video_id))
        return results
46
llama-index

Использование cookies