# llama-index
# 118 lines · 4.2 KB
import json
import os
from typing import Any, List, Literal

from llama_index.legacy.vector_stores.docarray.base import DocArrayVectorStore


class DocArrayHnswVectorStore(DocArrayVectorStore):
    """Class representing a DocArray HNSW vector store.

    This class is a lightweight Document Index implementation provided by Docarray.
    It stores vectors on disk in hnswlib, and stores all other data in SQLite.

    In addition, a ``ref_docs.json`` file inside the working directory keeps a
    mapping from each reference document ID to the list of IDs of the documents
    that were derived from it.
    """

    def __init__(
        self,
        work_dir: str,
        dim: int = 1536,
        dist_metric: Literal["cosine", "ip", "l2"] = "cosine",
        max_elements: int = 1024,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
    ):
        """Initializes the DocArrayHnswVectorStore.

        Args:
            work_dir (str): The working directory.
            dim (int, optional): Dimensionality of the vectors. Default is 1536.
            dist_metric (Literal["cosine", "ip", "l2"], optional): The distance
                metric to use. Default is "cosine".
            max_elements (int, optional): defines the maximum number of elements
                that can be stored in the structure(can be increased/shrunk).
                Default is 1024.
            ef_construction (int, optional): defines a construction time/accuracy
                trade-off. Default is 200.
            ef (int, optional): The size of the dynamic candidate list. Default is 10.
            M (int, optional): defines the maximum number of outgoing connections
                in the graph. Default is 16.
            allow_replace_deleted (bool, optional): Whether to allow replacing
                deleted elements. Default is True.
            num_threads (int, optional): Number of threads for index construction.
                Default is 1.

        Raises:
            ImportError: If the `docarray` package is not installed.
        """
        import_err_msg = (
            "\n"
            "`docarray` package not found. Install the package via pip:\n"
            "`pip install docarray[hnswlib]`\n"
        )
        try:
            import docarray  # noqa
        except ImportError as err:
            # Chain the original error so the underlying import failure
            # stays visible in the traceback.
            raise ImportError(import_err_msg) from err

        self._work_dir = work_dir

        # Restore the reference-document mapping persisted by a previous run,
        # if there is one; otherwise start with an empty mapping.
        ref_docs_path = os.path.join(self._work_dir, "ref_docs.json")
        if os.path.exists(ref_docs_path):
            with open(ref_docs_path, encoding="utf-8") as f:
                self._ref_docs = json.load(f)
        else:
            self._ref_docs = {}

        self._index, self._schema = self._init_index(
            dim=dim,
            dist_metric=dist_metric,
            max_elements=max_elements,
            ef_construction=ef_construction,
            ef=ef,
            M=M,
            allow_replace_deleted=allow_replace_deleted,
            num_threads=num_threads,
        )

    def _init_index(self, **kwargs: Any):  # type: ignore[no-untyped-def]
        """Initializes the HNSW document index.

        Args:
            **kwargs: Keyword arguments for the HNSW index; they are forwarded
                unchanged to ``self._get_schema``.

        Returns:
            tuple: The HNSW document index and its schema.
        """
        # Imported lazily so that the module can be imported without docarray.
        from docarray.index import HnswDocumentIndex

        # NOTE(review): `_get_schema` is not defined in this class; presumably
        # it is provided by DocArrayVectorStore - confirm.
        schema = self._get_schema(**kwargs)
        index = HnswDocumentIndex[schema]  # type: ignore[valid-type]
        return index(work_dir=self._work_dir), schema

    def _find_docs_to_be_removed(self, doc_id: str) -> List[str]:
        """Finds the documents to be removed from the vector store.

        The entry for ``doc_id`` is dropped from the reference-document mapping
        and the mapping is persisted. An unknown ``doc_id`` is not an error: an
        empty list is returned and nothing is written.

        Args:
            doc_id (str): Reference document ID that should be removed.

        Returns:
            List[str]: List of document IDs to be removed.
        """
        # `pop` with a default avoids the KeyError that a bare `del` raises
        # when the reference document was never registered.
        docs = self._ref_docs.pop(doc_id, None)
        if docs is None:
            return []
        self._save_ref_docs()
        return docs

    def _save_ref_docs(self) -> None:
        """Saves reference documents to ``ref_docs.json`` in the working directory."""
        ref_docs_path = os.path.join(self._work_dir, "ref_docs.json")
        with open(ref_docs_path, "w", encoding="utf-8") as f:
            json.dump(self._ref_docs, f)

    def _update_ref_docs(self, docs):  # type: ignore[no-untyped-def]
        """Updates reference documents.

        Each document's ``id`` is appended to the list stored under its
        ``metadata["doc_id"]``, and the mapping is persisted afterwards.

        Args:
            docs (List): List of documents to update.
        """
        for doc in docs:
            self._ref_docs.setdefault(doc.metadata["doc_id"], []).append(doc.id)
        self._save_ref_docs()