llama-index

markdown_reader.py
114 строк · 3.6 Кб
Перенос по словам
"""Markdown parser.

Contains parser for md files.

"""

7
import re
8
from pathlib import Path
9
from typing import Any, Dict, List, Optional, Tuple, cast
10

11
from llama_index.legacy.readers.base import BaseReader
12
from llama_index.legacy.schema import Document
13

14

15
class MarkdownReader(BaseReader):
    """Markdown parser.

    Extract text from markdown files.
    The file is split at its headers into ``(header, text)`` tuples, and each
    tuple is wrapped in a ``Document``.

    """

    def __init__(
        self,
        *args: Any,
        remove_hyperlinks: bool = True,
        remove_images: bool = True,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            *args: Positional arguments forwarded to the base class.
            remove_hyperlinks: If true, ``parse_tups`` replaces inline links
                ``[text](url)`` with just ``text``.
            remove_images: If true, ``parse_tups`` removes image embeds.
            **kwargs: Keyword arguments forwarded to the base class.

        """
        super().__init__(*args, **kwargs)
        self._remove_hyperlinks = remove_hyperlinks
        self._remove_images = remove_images

    def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
        """Split markdown text into ``(header, text)`` tuples.

        A header is a line starting with one or more ``#`` followed by
        whitespace. Lines inside fenced code blocks are never treated as
        headers, so ``# comment`` lines in code samples stay in the body.

        Args:
            markdown_text: Raw markdown content.

        Returns:
            One tuple per section, in document order. The header has its
            leading ``#`` marker (and an optional closing ``#`` run) removed;
            it is ``None`` for text that is not under any header. The text
            has ``<...>`` tags stripped. A header immediately followed by
            another header (no body lines at all) is skipped; the last
            section is always returned, even when empty.

        """
        sections: List[Tuple[Optional[str], str]] = []
        current_header: Optional[str] = None
        current_lines: List[str] = []
        open_fence: Optional[str] = None  # "`" or "~" while inside a code fence

        for line in markdown_text.split("\n"):
            fence_match = re.match(r"^\s*(`{3,}|~{3,})", line)
            if fence_match:
                fence_char = fence_match.group(1)[0]
                if open_fence is None:
                    open_fence = fence_char
                elif open_fence == fence_char:
                    open_fence = None

            if open_fence is None and re.match(r"^#+\s", line):
                body = "".join(current_lines)
                if current_header is None:
                    # Text before the first header: keep it unless it is blank.
                    if body.strip():
                        sections.append((None, body))
                elif body:
                    sections.append((current_header, body))
                # Always switch to the new header, even when the previous
                # section was empty and therefore skipped.
                current_header = line
                current_lines = []
            else:
                current_lines.append(line + "\n")

        sections.append((current_header, "".join(current_lines)))

        cleaned: List[Tuple[Optional[str], str]] = []
        for header, body in sections:
            if header is not None:
                # Strip only the heading markers, not '#' inside the title.
                header = re.sub(r"^#+\s*|\s+#+\s*$", "", header).strip()
            cleaned.append((header, re.sub(r"<.*?>", "", body)))
        return cleaned

    def remove_images(self, content: str) -> str:
        """Remove image embeds from markdown content.

        Handles wiki-style embeds ``![[target]]`` and standard inline images
        ``![alt](url)``. Matching is non-greedy so that text between two
        embeds on the same line is preserved.

        Args:
            content: Markdown text.

        Returns:
            The text with image embeds removed.

        """
        pattern = r"!\[\[.*?\]\]|!\[[^\]]*\]\([^)]*\)"
        return re.sub(pattern, "", content)

    def remove_hyperlinks(self, content: str) -> str:
        """Replace inline links ``[text](url)`` with their link text.

        Args:
            content: Markdown text.

        Returns:
            The text with every ``[text](url)`` replaced by ``text``.

        """
        pattern = r"\[(.*?)\]\((.*?)\)"
        return re.sub(pattern, r"\1", content)

    def _init_parser(self) -> Dict:
        """Initialize the parser with the config."""
        return {}

    def parse_tups(
        self, filepath: Path, errors: str = "ignore"
    ) -> List[Tuple[Optional[str], str]]:
        """Parse file into tuples.

        Args:
            filepath: Path of the markdown file, read as UTF-8.
            errors: Decoding error handler passed to ``open``.

        Returns:
            The ``(header, text)`` tuples produced by ``markdown_to_tups``.

        """
        with open(filepath, encoding="utf-8", errors=errors) as f:
            content = f.read()
        # Images go first: the hyperlink pattern would otherwise rewrite
        # ``![alt](url)`` into ``!alt`` and leave residue behind.
        if self._remove_images:
            content = self.remove_images(content)
        if self._remove_hyperlinks:
            content = self.remove_hyperlinks(content)
        return self.markdown_to_tups(content)

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a markdown file into one ``Document`` per section.

        Args:
            file: Path of the markdown file.
            extra_info: Optional metadata attached to every document.

        Returns:
            Documents in file order. Sections with a header have the header
            prepended to their text.

        """
        tups = self.parse_tups(file)
        results = []
        # TODO: don't include headers right now
        for header, value in tups:
            if header is None:
                results.append(Document(text=value, metadata=extra_info or {}))
            else:
                results.append(
                    Document(text=f"\n\n{header}\n{value}", metadata=extra_info or {})
                )
        return results
115
llama-index

Использование cookies