llama-index

Форк
0
107 строк · 3.4 Кб
1
"""Mbox parser.
2

3
Contains a simple parser for mbox files.
4

5
"""
6

7
import logging
8
from pathlib import Path
9
from typing import Any, Dict, List, Optional
10

11
from llama_index.legacy.readers.base import BaseReader
12
from llama_index.legacy.schema import Document
13

14
# Module-level logger; MboxReader.load_data uses it to report messages that
# fail to parse.
logger = logging.getLogger(__name__)
15

16

17
class MboxReader(BaseReader):
    """Mbox parser.

    Extract messages from mailbox files.
    Returns string including date, subject, sender, receiver and
    content for each message.

    Args:
        max_count: Maximum number of messages to read from the mailbox.
            ``0`` (the default) means no limit. Messages that fail to parse
            still count towards the limit.
        message_format: ``str.format`` template used to render each message.
            It is filled with the keyword fields ``_date``, ``_from``,
            ``_to``, ``_subject`` and ``_content``.

    """

    DEFAULT_MESSAGE_FORMAT: str = (
        "Date: {_date}\n"
        "From: {_from}\n"
        "To: {_to}\n"
        "Subject: {_subject}\n"
        "Content: {_content}"
    )

    def __init__(
        self,
        *args: Any,
        max_count: int = 0,
        message_format: str = DEFAULT_MESSAGE_FORMAT,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
        """
        # Fail early, at construction time, if the HTML parser dependency
        # needed by load_data is missing.
        try:
            from bs4 import BeautifulSoup  # noqa
        except ImportError as err:
            raise ImportError(
                "`beautifulsoup4` package not found: `pip install beautifulsoup4`"
            ) from err

        super().__init__(*args, **kwargs)
        self.max_count = max_count
        self.message_format = message_format

    @staticmethod
    def _extract_body(msg: Any) -> Optional[bytes]:
        """Return the decoded body of ``msg``, or ``None`` if none is usable.

        For multipart messages the first ``text/plain`` part that is not an
        attachment is used; for other messages the payload itself is used.
        """
        if not msg.is_multipart():
            return msg.get_payload(decode=True)

        for part in msg.walk():
            disposition = str(part.get("Content-Disposition"))
            if (
                part.get_content_type() == "text/plain"
                and "attachment" not in disposition
            ):
                return part.get_payload(decode=True)
        return None

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse an mbox file into one document per message.

        Args:
            file: Path of the mbox file to read.
            extra_info: Optional metadata attached to every returned document.

        Returns:
            One ``Document`` per successfully parsed message, whose text is
            ``message_format`` filled in with the message headers and its
            whitespace-normalised body. Messages that cannot be parsed are
            logged with a warning and skipped.
        """
        # Import required libraries
        import mailbox
        from email.parser import BytesParser
        from email.policy import default

        from bs4 import BeautifulSoup

        results: List[str] = []
        # Load file using mailbox; the factory makes iteration yield messages
        # parsed by BytesParser with the modern `default` email policy.
        # NOTE(review): mailbox.mbox defaults to create=True, so a missing
        # path yields an empty mailbox instead of an error; kept as-is for
        # backward compatibility.
        bytes_parser = BytesParser(policy=default).parse
        mbox = mailbox.mbox(file, factory=bytes_parser)  # type: ignore

        try:
            # Iterate through all messages, counting from 1 so the counter can
            # be compared against max_count directly.
            for seen, msg in enumerate(mbox, start=1):
                try:
                    # Resolve the body afresh for every message so a message
                    # without a usable part never inherits the previous
                    # message's content.
                    content = self._extract_body(msg)
                    if content is None:
                        raise ValueError("no text/plain body found")

                    # Parse message HTML content and remove unneeded
                    # whitespace. The parser is named explicitly so the result
                    # does not depend on which optional parsers are installed.
                    soup = BeautifulSoup(content, "html.parser")
                    stripped_content = " ".join(soup.get_text().split())
                    # Format message to include date, sender, receiver and
                    # subject
                    results.append(
                        self.message_format.format(
                            _date=msg["date"],
                            _from=msg["from"],
                            _to=msg["to"],
                            _subject=msg["subject"],
                            _content=stripped_content,
                        )
                    )
                except Exception as e:
                    # Best effort: one malformed message must not abort the
                    # whole mailbox, so log it and move on.
                    logger.warning(
                        "Failed to parse message:\n%s\n with exception %s", msg, e
                    )

                # Stop once max_count messages have been read (0 = no limit)
                if self.max_count > 0 and seen >= self.max_count:
                    break
        finally:
            # Release the file handle opened by mailbox.mbox.
            mbox.close()

        return [Document(text=result, metadata=extra_info or {}) for result in results]
108

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.