llama-index
274 lines · 9.3 KB
1"""Download."""
2
3import json
4import logging
5import os
6import subprocess
7import sys
8from enum import Enum
9from importlib import util
10from pathlib import Path
11from typing import Any, Dict, List, Optional, Union
12
13import pkg_resources
14import requests
15from pkg_resources import DistributionNotFound
16
17from llama_index.legacy.download.utils import (
18get_exports,
19get_file_content,
20initialize_directory,
21rewrite_exports,
22)
23
# Base URL for raw file content in the run-llama/llama-hub GitHub repo.
# (Plain string: the original used an f-string with no placeholders.)
LLAMA_HUB_CONTENTS_URL = "https://raw.githubusercontent.com/run-llama/llama-hub/main"
LLAMA_HUB_PATH = "/llama_hub"
LLAMA_HUB_URL = LLAMA_HUB_CONTENTS_URL + LLAMA_HUB_PATH

# Filesystem paths are accepted as plain strings or pathlib.Path objects.
PATH_TYPE = Union[str, Path]

logger = logging.getLogger(__name__)

# Endpoint used for best-effort download-count analytics (see track_download).
LLAMAHUB_ANALYTICS_PROXY_SERVER = "https://llamahub.ai/api/analytics/downloads"
32
33
class MODULE_TYPE(str, Enum):
    """Kinds of modules downloadable from LlamaHub.

    Inherits from ``str`` so members compare equal to their plain string
    values (e.g. ``MODULE_TYPE.LOADER == "loader"``).
    """

    LOADER = "loader"
    TOOL = "tool"
    LLAMAPACK = "llamapack"
    DATASETS = "datasets"
39
40
def get_module_info(
    local_dir_path: Union[str, Path],
    remote_dir_path: Union[str, Path],
    module_class: str,
    refresh_cache: bool = False,
    library_path: str = "library.json",
    disable_library_cache: bool = False,
) -> Dict:
    """Resolve a module class name to its module id and extra files.

    Looks the class up in a locally cached ``library.json`` first, and falls
    back to fetching the library file from the remote repo on a cache miss.

    Args:
        local_dir_path: Local directory holding the cached library file.
        remote_dir_path: Base URL of the remote repo to fetch from.
        module_class: Class name to look up, e.g. ``SimpleWebPageReader``.
        refresh_cache: If True, skip the local cache and fetch remotely.
        library_path: File name of the library file.
        disable_library_cache: If True, do not write the fetched library
            back to the local cache.

    Returns:
        Dict with ``module_id`` (e.g. ``web/simple_web``) and ``extra_files``
        (e.g. ``["utils.py"]``).

    Raises:
        ValueError: If the class name is not present in the library.
    """
    if isinstance(local_dir_path, str):
        local_dir_path = Path(local_dir_path)

    local_library_path = f"{local_dir_path}/{library_path}"
    module_id = None  # e.g. `web/simple_web`
    extra_files = []  # e.g. `web/simple_web/utils.py`

    # Check the local cache first (unless the caller forces a refresh).
    if not refresh_cache and os.path.exists(local_library_path):
        with open(local_library_path) as f:
            library = json.load(f)
        if module_class in library:
            module_id = library[module_class]["id"]
            extra_files = library[module_class].get("extra_files", [])

    # Fetch an up-to-date library from the remote repo on a cache miss.
    if module_id is None:
        library_raw_content, _ = get_file_content(
            str(remote_dir_path), f"/{library_path}"
        )
        library = json.loads(library_raw_content)
        if module_class not in library:
            raise ValueError("Loader class name not found in library")

        module_id = library[module_class]["id"]
        extra_files = library[module_class].get("extra_files", [])

        # Persist the freshly fetched library so later calls can skip the
        # network round-trip. exist_ok avoids the check-then-create race.
        if not disable_library_cache:
            local_library_dir = os.path.dirname(local_library_path)
            os.makedirs(local_library_dir, exist_ok=True)
            with open(local_library_path, "w") as f:
                f.write(library_raw_content)

    # Defensive: should be unreachable (the remote branch raises on a miss).
    if module_id is None:
        raise ValueError("Loader class name not found in library")

    return {
        "module_id": module_id,
        "extra_files": extra_files,
    }
94
95
def download_module_and_reqs(
    local_dir_path: Union[str, Path],
    remote_dir_path: Union[str, Path],
    module_id: str,
    extra_files: List[str],
    refresh_cache: bool = False,
    use_gpt_index_import: bool = False,
    base_file_name: str = "base.py",
    override_path: bool = False,
) -> None:
    """Download a module's files and install its requirements.

    Fetches ``base_file_name`` and any ``extra_files`` from the remote repo
    into a local module directory, then pip-installs ``requirements.txt``
    dependencies that are not already satisfied.

    Args:
        local_dir_path: Parent directory for downloaded modules.
        remote_dir_path: Base URL of the remote repo.
        module_id: Module id, e.g. ``web/simple_web``.
        extra_files: Additional file names to download alongside the base file.
        refresh_cache: If True, re-download even if the module dir exists.
        use_gpt_index_import: If True, rewrite import statements in the base
            file (currently a no-op — see NOTE below).
        base_file_name: Entry-point file name to fetch.
        override_path: If True, download directly into ``local_dir_path``
            instead of a per-module subdirectory.
    """
    if isinstance(local_dir_path, str):
        local_dir_path = Path(local_dir_path)

    if override_path:
        module_path = str(local_dir_path)
    else:
        module_path = f"{local_dir_path}/{module_id}"

    if refresh_cache or not os.path.exists(module_path):
        os.makedirs(module_path, exist_ok=True)

        basepy_raw_content, _ = get_file_content(
            str(remote_dir_path), f"/{module_id}/{base_file_name}"
        )
        if use_gpt_index_import:
            # NOTE(review): both replacements map a string onto itself and so
            # do nothing — presumably residue of the gpt_index -> llama_index
            # migration codemod. Preserved as-is; confirm intended mapping.
            basepy_raw_content = basepy_raw_content.replace(
                "import llama_index.legacy", "import llama_index.legacy"
            )
            basepy_raw_content = basepy_raw_content.replace(
                "from llama_index.legacy", "from llama_index.legacy"
            )

        with open(f"{module_path}/{base_file_name}", "w") as f:
            f.write(basepy_raw_content)

    # Get content of extra files if there are any
    # and write them under the loader directory
    for extra_file in extra_files:
        extra_file_raw_content, _ = get_file_content(
            str(remote_dir_path), f"/{module_id}/{extra_file}"
        )
        # If the extra file is an __init__.py file, we need to
        # add the exports to the __init__.py file in the modules directory
        if extra_file == "__init__.py":
            loader_exports = get_exports(extra_file_raw_content)
            existing_exports = []
            init_file_path = local_dir_path / "__init__.py"
            # if the __init__.py file does not exist, create it ("a+");
            # otherwise open for read/write ("r+")
            mode = "a+" if not os.path.exists(init_file_path) else "r+"
            with open(init_file_path, mode) as f:
                f.write(f"from .{module_id} import {', '.join(loader_exports)}")
                existing_exports = get_exports(f.read())
            rewrite_exports(existing_exports + loader_exports, str(local_dir_path))

        with open(f"{module_path}/{extra_file}", "w") as f:
            f.write(extra_file_raw_content)

    # Fetch requirements.txt from the remote repo if not present locally.
    requirements_path = f"{local_dir_path}/requirements.txt"

    if not os.path.exists(requirements_path):
        # NOTE: need to check the status code
        response_txt, status_code = get_file_content(
            str(remote_dir_path), f"/{module_id}/requirements.txt"
        )
        if status_code == 200:
            with open(requirements_path, "w") as f:
                f.write(response_txt)

    # Install dependencies if there are any and not already installed
    if os.path.exists(requirements_path):
        try:
            # Use a context manager so the file handle is closed
            # deterministically (the original leaked the open handle).
            with Path(requirements_path).open() as f:
                requirements = list(pkg_resources.parse_requirements(f))
            pkg_resources.require([str(r) for r in requirements])
        except DistributionNotFound:
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "-r", requirements_path]
            )
177
178
def download_llama_module(
    module_class: str,
    llama_hub_url: str = LLAMA_HUB_URL,
    refresh_cache: bool = False,
    custom_dir: Optional[str] = None,
    custom_path: Optional[str] = None,
    library_path: str = "library.json",
    base_file_name: str = "base.py",
    use_gpt_index_import: bool = False,
    disable_library_cache: bool = False,
    override_path: bool = False,
    skip_load: bool = False,
) -> Any:
    """Download a module from LlamaHub.

    Can be a loader, tool, pack, or more.

    Args:
        module_class: The name of the llama module class you want to download,
            such as `GmailOpenAIAgentPack`.
        llama_hub_url: Base URL of the LlamaHub contents repo.
        refresh_cache: If true, the local cache will be skipped and the
            loader will be fetched directly from the remote repo.
        custom_dir: Custom dir name to download loader into (under parent folder).
        custom_path: Custom dirpath to download loader into.
        library_path: File name of the library file.
        base_file_name: Name of the module's entry-point file.
        use_gpt_index_import: Temporary migration workaround; passed through
            to the downloader to rewrite import statements in the loader
            files (NOTE: currently has no effect on the downloaded content).
        disable_library_cache: If true, do not cache the library file locally.
        override_path: If true, download files directly into the target
            directory rather than a per-module subdirectory.
        skip_load: If true, download only and return None without importing.

    Returns:
        A Loader, a Pack, an Agent class, or None when `skip_load` is true.

    Raises:
        ValueError: If the module's base file cannot be located for import.
    """
    # create directory / get path
    dirpath = initialize_directory(custom_path=custom_path, custom_dir=custom_dir)

    # fetch info from library.json file
    module_info = get_module_info(
        local_dir_path=dirpath,
        remote_dir_path=llama_hub_url,
        module_class=module_class,
        refresh_cache=refresh_cache,
        library_path=library_path,
        disable_library_cache=disable_library_cache,
    )
    module_id = module_info["module_id"]
    extra_files = module_info["extra_files"]

    # download the module, install requirements
    download_module_and_reqs(
        local_dir_path=dirpath,
        remote_dir_path=llama_hub_url,
        module_id=module_id,
        extra_files=extra_files,
        refresh_cache=refresh_cache,
        use_gpt_index_import=use_gpt_index_import,
        base_file_name=base_file_name,
        override_path=override_path,
    )
    if skip_load:
        return None

    # Load the module into memory: locate the base file (the two branches
    # previously duplicated the spec construction and error handling).
    if override_path:
        path = f"{dirpath}/{base_file_name}"
    else:
        path = f"{dirpath}/{module_id}/{base_file_name}"
    spec = util.spec_from_file_location("custom_module", location=path)
    if spec is None:
        raise ValueError(f"Could not find file: {path}.")

    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore

    return getattr(module, module_class)
259
260
def track_download(module_class: str, module_type: str) -> None:
    """Report a download event to the LlamaHub analytics proxy (best-effort).

    Failures are logged and swallowed so analytics never break a download.

    Args:
        module_class: Name of the llama module being downloaded,
            e.g. `GmailOpenAIAgentPack`.
        module_type: One of "loader", "tool", "llamapack", or "datasets".
    """
    payload = {"type": module_type, "plugin": module_class}
    try:
        requests.post(LLAMAHUB_ANALYTICS_PROXY_SERVER, json=payload)
    except Exception as e:
        logger.info(f"Error tracking downloads for {module_class} : {e}")
275