# llama-index
#
# gdocs.py
# 168 lines · 5.7 KB
# Word wrap
1
"""Google docs reader."""
2

3
import logging
4
import os
5
from typing import Any, List
6

7
from llama_index.legacy.readers.base import BasePydanticReader
8
from llama_index.legacy.schema import Document
9

10
# OAuth scopes requested when authorizing: read-only access to Google Docs documents.
SCOPES = ["https://www.googleapis.com/auth/documents.readonly"]
11

12
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
13

14
# Copyright 2019 Google LLC
15
#
16
# Licensed under the Apache License, Version 2.0 (the "License");
17
# you may not use this file except in compliance with the License.
18
# You may obtain a copy of the License at
19
#
20
#     http://www.apache.org/licenses/LICENSE-2.0
21
#
22
# Unless required by applicable law or agreed to in writing, software
23
# distributed under the License is distributed on an "AS IS" BASIS,
24
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25
# See the License for the specific language governing permissions and
26
# limitations under the License.
27

28

29
class GoogleDocsReader(BasePydanticReader):
    """Google Docs reader.

    Fetches documents through the Google Docs API (``docs`` v1) and flattens
    the body of each document into plain text.

    The OAuth token is cached in ``token.json`` and the client secrets are
    read from ``credentials.json``; both paths are relative to the current
    working directory.
    """

    is_remote: bool = True

    def __init__(self) -> None:
        """Check that the Google client libraries are installed.

        Raises:
            ImportError: if `google`, `google_auth_oauthlib` or
                `googleapiclient` cannot be imported.
        """
        try:
            import google  # noqa
            import google_auth_oauthlib  # noqa
            import googleapiclient  # noqa
        except ImportError as err:
            # Chain the original error so the missing package stays visible.
            raise ImportError(
                "`google_auth_oauthlib`, `googleapiclient` and `google` "
                "must be installed to use the GoogleDocsReader.\n"
                "Please run `pip install --upgrade google-api-python-client "
                "google-auth-httplib2 google-auth-oauthlib`."
            ) from err

        # Run the base-class initializer so that declared fields such as
        # `is_remote` are actually set up on the instance.
        super().__init__()

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this reader class."""
        return "GoogleDocsReader"

    def load_data(self, document_ids: List[str]) -> List[Document]:
        """Load the given Google Docs documents.

        Args:
            document_ids (List[str]): a list of document ids.

        Returns:
            One `Document` per id, in input order. The document id is used
            both as the `Document` id and as the `document_id` metadata entry.

        Raises:
            ValueError: if `document_ids` is None.
        """
        if document_ids is None:
            raise ValueError('Must specify a "document_ids" in `load_kwargs`.')

        return [
            Document(
                text=self._load_doc(document_id),
                id_=document_id,
                metadata={"document_id": document_id},
            )
            for document_id in document_ids
        ]

    def _load_doc(self, document_id: str) -> str:
        """Load a document from Google Docs.

        Args:
            document_id: the document id.

        Returns:
            The document text (empty if the response carries no body content).
        """
        import googleapiclient.discovery as discovery

        credentials = self._get_credentials()
        docs_service = discovery.build("docs", "v1", credentials=credentials)
        doc = docs_service.documents().get(documentId=document_id).execute()
        # Tolerate a response without "body"/"content" instead of failing
        # with an AttributeError on None.
        doc_content = (doc.get("body") or {}).get("content") or []
        return self._read_structural_elements(doc_content)

    def _get_credentials(self) -> Any:
        """Get valid user credentials from storage.

        The file token.json stores the user's access and refresh tokens, and is
        created automatically when the authorization flow completes for the first
        time. When no valid credentials are stored, an expired credential with
        a refresh token is refreshed; otherwise the installed-app OAuth flow is
        run using the client secrets in credentials.json.

        Returns:
            Credentials, the obtained credential.
        """
        from google.auth.transport.requests import Request
        from google.oauth2.credentials import Credentials
        from google_auth_oauthlib.flow import InstalledAppFlow

        creds = None
        if os.path.exists("token.json"):
            creds = Credentials.from_authorized_user_file("token.json", SCOPES)
        # If there are no (valid) credentials available, let the user log in.
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    "credentials.json", SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Save the credentials for the next run
            with open("token.json", "w", encoding="utf-8") as token:
                token.write(creds.to_json())

        return creds

    def _read_paragraph_element(self, element: Any) -> str:
        """Return the text in the given ParagraphElement.

        Args:
            element: a ParagraphElement from a Google Doc.

        Returns:
            The content of the element's text run, or an empty string when the
            element has no text run or the text run has no content.
        """
        text_run = element.get("textRun")
        if not text_run:
            return ""
        # Never return None: callers concatenate the result into a string.
        return text_run.get("content") or ""

    def _read_structural_elements(self, elements: List[Any]) -> str:
        """Recurse through a list of Structural Elements.

        Read a document's text where text may be in nested elements
        (paragraphs, table cells, tables of contents). Elements of any other
        kind contribute no text.

        Args:
            elements: a list of Structural Elements. A missing (None) list is
                treated as empty.

        Returns:
            The concatenated text of all elements, in document order.
        """
        parts: List[str] = []
        for value in elements or []:
            if "paragraph" in value:
                paragraph_elements = value["paragraph"].get("elements") or []
                parts.extend(
                    self._read_paragraph_element(elem) for elem in paragraph_elements
                )
            elif "table" in value:
                # The text in table cells are in nested Structural Elements
                # and tables may be nested.
                for row in value["table"].get("tableRows") or []:
                    for cell in row.get("tableCells") or []:
                        parts.append(
                            self._read_structural_elements(cell.get("content"))
                        )
            elif "tableOfContents" in value:
                # The text in the TOC is also in a Structural Element.
                toc = value["tableOfContents"]
                parts.append(self._read_structural_elements(toc.get("content")))
        return "".join(parts)
162

163

164
if __name__ == "__main__":
    # Manual run: fetch one sample document and log the resulting list.
    sample_ids = ["11ctUj_tEf5S8vs_dk8_BNi-Zk8wW5YFhXkKqtmU_4B8"]
    documents = GoogleDocsReader().load_data(document_ids=sample_ids)
    logger.info(documents)
169
# llama-index

# Cookie usage