llama-index

Форк
0
103 строки · 3.4 Кб
1
"""Mongo client."""
2

3
from typing import Dict, Iterable, List, Optional, Union
4

5
from llama_index.legacy.readers.base import BaseReader
6
from llama_index.legacy.schema import Document
7

8

9
class SimpleMongoReader(BaseReader):
    """Simple mongo reader.

    Concatenates selected fields of each Mongo doc into a Document used by
    LlamaIndex.

    Args:
        host (Optional[str]): Mongo host. Only used together with ``port``
            and only when ``uri`` is not given.
        port (Optional[int]): Mongo port. Only used together with ``host``
            and only when ``uri`` is not given.
        uri (Optional[str]): Mongo connection URI. Takes precedence over
            ``host`` and ``port`` when provided.

    Raises:
        ImportError: If the ``pymongo`` package is not installed.
        ValueError: If neither ``uri`` nor both ``host`` and ``port`` are
            provided.
    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        uri: Optional[str] = None,
    ) -> None:
        """Initialize with parameters."""
        # Imported lazily so that pymongo stays an optional dependency.
        try:
            from pymongo import MongoClient
        except ImportError as err:
            raise ImportError(
                "`pymongo` package not found, please run `pip install pymongo`"
            ) from err

        client: MongoClient
        if uri:
            # A URI wins over host/port when both are supplied.
            client = MongoClient(uri)
        elif host and port:
            client = MongoClient(host, port)
        else:
            raise ValueError("Either `host` and `port` or `uri` must be provided.")

        self.client = client

    def _flatten(self, texts: List[Union[str, List[str]]]) -> List[str]:
        """Flatten one level of nesting.

        Entries that are lists are spliced into the result in order; every
        other entry is kept as a single element.
        """
        result: List[str] = []
        for text in texts:
            if isinstance(text, list):
                result.extend(text)
            else:
                result.append(text)
        return result

    def lazy_load_data(
        self,
        db_name: str,
        collection_name: str,
        field_names: Optional[List[str]] = None,
        separator: str = "",
        query_dict: Optional[Dict] = None,
        max_docs: int = 0,
        metadata_names: Optional[List[str]] = None,
    ) -> Iterable[Document]:
        """Lazily load documents from a Mongo collection.

        This is a generator: the query is issued and fields are validated
        only once the result is iterated.

        Args:
            db_name (str): name of the database.
            collection_name (str): name of the collection.
            field_names (Optional[List[str]]): names of the fields to be
                concatenated. List-valued fields are flattened one level
                before joining. Defaults to ["text"]
            separator (str): separator to be used between fields.
                Defaults to ""
            query_dict (Optional[Dict]): query to filter documents. Read more
                at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query)
                Defaults to None (match every document)
            max_docs (int): maximum number of documents to load.
                Defaults to 0 (no limit)
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document. Defaults to None

        Yields:
            Document: one Document per matching Mongo document, in cursor
                order.

        Raises:
            ValueError: If a name in ``field_names`` or ``metadata_names`` is
                missing from a Mongo document.

        """
        # Resolve the default here rather than in the signature so that no
        # mutable list object is shared between calls.
        if field_names is None:
            field_names = ["text"]

        db = self.client[db_name]
        cursor = db[collection_name].find(filter=query_dict or {}, limit=max_docs)

        for item in cursor:
            try:
                texts = [item[name] for name in field_names]
            except KeyError as err:
                raise ValueError(
                    f"{err.args[0]} field not found in Mongo document."
                ) from err

            text = separator.join(self._flatten(texts))

            if metadata_names is None:
                yield Document(text=text)
                continue

            try:
                metadata = {name: item[name] for name in metadata_names}
            except KeyError as err:
                raise ValueError(
                    f"{err.args[0]} field not found in Mongo document."
                ) from err
            yield Document(text=text, metadata=metadata)
104

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.