llama-index

Форк
0
154 строки · 5.1 Кб
1
"""Google sheets reader."""
2

3
import logging
4
import os
5
from typing import Any, List
6

7
from llama_index.legacy.readers.base import BasePydanticReader
8
from llama_index.legacy.schema import Document
9

10
# OAuth scopes passed to the credential helpers below; this single scope
# grants read-only access to spreadsheets.
SCOPES = ["https://www.googleapis.com/auth/spreadsheets.readonly"]
11

12
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
13

14
# Copyright 2019 Google LLC
15
#
16
# Licensed under the Apache License, Version 2.0 (the "License");
17
# you may not use this file except in compliance with the License.
18
# You may obtain a copy of the License at
19
#
20
#     http://www.apache.org/licenses/LICENSE-2.0
21
#
22
# Unless required by applicable law or agreed to in writing, software
23
# distributed under the License is distributed on an "AS IS" BASIS,
24
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
# See the License for the specific language governing permissions and
26
# limitations under the License.
27

28

29
class GoogleSheetsReader(BasePydanticReader):
    """Google Sheets reader.

    Reads every sheet (tab) of a Google Sheets spreadsheet as TSV text.

    Credentials come from the installed-app OAuth flow: a cached
    ``token.json`` in the current working directory is reused when present,
    otherwise ``credentials.json`` is used to run an interactive login.
    """

    is_remote: bool = True

    def __init__(self) -> None:
        """Initialize the reader and check that the Google libraries exist.

        Raises:
            ImportError: if `google`, `google_auth_oauthlib` or
                `googleapiclient` cannot be imported.
        """
        try:
            import google  # noqa
            import google_auth_oauthlib  # noqa
            import googleapiclient  # noqa
        except ImportError as err:
            raise ImportError(
                "`google_auth_oauthlib`, `googleapiclient` and `google` "
                "must be installed to use the GoogleSheetsReader.\n"
                "Please run `pip install --upgrade google-api-python-client "
                "google-auth-httplib2 google-auth-oauthlib`."
            ) from err

        # The base class is (per its name) a pydantic model; its own
        # initializer has to run or the declared fields are never set up.
        super().__init__()

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "GoogleSheetsReader"

    def load_data(self, spreadsheet_ids: List[str]) -> List[Document]:
        """Load each spreadsheet as one Document.

        Args:
            spreadsheet_ids (List[str]): a list of spreadsheet ids.

        Returns:
            One Document per id, in input order. The document id and the
            ``spreadsheet_id`` metadata entry are the spreadsheet id; the
            text is the output of ``_load_sheet``.

        Raises:
            ValueError: if `spreadsheet_ids` is None.
        """
        if spreadsheet_ids is None:
            raise ValueError('Must specify a "spreadsheet_ids" in `load_kwargs`.')

        return [
            Document(
                id_=spreadsheet_id,
                text=self._load_sheet(spreadsheet_id),
                metadata={"spreadsheet_id": spreadsheet_id},
            )
            for spreadsheet_id in spreadsheet_ids
        ]

    def _load_sheet(self, spreadsheet_id: str) -> str:
        """Load all sheets of one spreadsheet from Google Sheets.

        Args:
            spreadsheet_id: the spreadsheet id.

        Returns:
            For each sheet, its title on one line followed by its rows, one
            row per line with cells separated by tabs.
        """
        import googleapiclient.discovery as discovery

        credentials = self._get_credentials()
        sheets_service = discovery.build("sheets", "v4", credentials=credentials)
        spreadsheet_data = (
            sheets_service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
        )

        parts = []
        for sheet in spreadsheet_data.get("sheets") or []:
            properties = sheet.get("properties")
            title = properties.get("title")
            parts.append(title + "\n")

            grid_props = properties.get("gridProperties")
            if grid_props is None:
                # Non-grid sheets (e.g. a chart on its own tab) carry no
                # grid dimensions and have no cell values to fetch.
                continue

            rows = grid_props.get("rowCount")
            cols = grid_props.get("columnCount")
            # Qualify the range with the sheet title: a bare R1C1 range is
            # resolved against the first visible sheet, which would return
            # the first tab's data for every tab. The title is single-quoted
            # (embedded quotes doubled) so names with spaces or punctuation
            # are accepted.
            quoted_title = title.replace("'", "''")
            range_pattern = f"'{quoted_title}'!R1C1:R{rows}C{cols}"
            response = (
                sheets_service.spreadsheets()
                .values()
                .get(spreadsheetId=spreadsheet_id, range=range_pattern)
                .execute()
            )
            # str() guards against non-string cell values in the response.
            parts.append(
                "\n".join(
                    "\t".join(str(cell) for cell in row)
                    for row in response.get("values", [])
                )
                + "\n"
            )
        return "".join(parts)

    def _get_credentials(self) -> Any:
        """Get valid user credentials from storage.

        The file token.json stores the user's access and refresh tokens, and is
        created automatically when the authorization flow completes for the first
        time. Both token.json and credentials.json are resolved relative to the
        current working directory.

        Returns:
            Credentials, the obtained credential.
        """
        from google.auth.transport.requests import Request
        from google.oauth2.credentials import Credentials
        from google_auth_oauthlib.flow import InstalledAppFlow

        creds = None
        if os.path.exists("token.json"):
            creds = Credentials.from_authorized_user_file("token.json", SCOPES)
        # If there are no (valid) credentials available, let the user log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Save the credentials for the next run
            with open("token.json", "w", encoding="utf-8") as token:
                token.write(creds.to_json())

        return creds
146

147

148
if __name__ == "__main__":
    # Manual smoke test: load one spreadsheet and log the resulting documents.
    # Logging's default threshold is WARNING, so INFO output must be enabled
    # explicitly or the result logged below is silently dropped.
    logging.basicConfig(level=logging.INFO)
    reader = GoogleSheetsReader()
    logger.info(
        "%s",
        reader.load_data(
            spreadsheet_ids=["1VkuitKIyNmkoCJJDmEUmkS_VupSkDcztpRhbUzAU5L8"]
        ),
    )
155

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.