llama-index

hnsw.py
118 строк · 4.2 Кб
Перенос по словам
1
import json
2
import os
3
from typing import Any, List, Literal
4

5
from llama_index.legacy.vector_stores.docarray.base import DocArrayVectorStore
6

7

8
class DocArrayHnswVectorStore(DocArrayVectorStore):
    """Class representing a DocArray HNSW vector store.

    This class is a lightweight Document Index implementation provided by Docarray.
    It stores vectors on disk in hnswlib, and stores all other data in SQLite.

    In addition to the index itself, the store keeps a ``ref_docs.json`` file in
    the working directory. It maps each reference document ID to the list of
    IDs of the stored documents that were registered under it.
    """

    def __init__(
        self,
        work_dir: str,
        dim: int = 1536,
        dist_metric: Literal["cosine", "ip", "l2"] = "cosine",
        max_elements: int = 1024,
        ef_construction: int = 200,
        ef: int = 10,
        M: int = 16,
        allow_replace_deleted: bool = True,
        num_threads: int = 1,
    ):
        """Initializes the DocArrayHnswVectorStore.

        Args:
            work_dir (str): The working directory.
            dim (int, optional): Dimensionality of the vectors. Default is 1536.
            dist_metric (Literal["cosine", "ip", "l2"], optional): The distance
                metric to use. Default is "cosine".
            max_elements (int, optional): defines the maximum number of elements
                that can be stored in the structure(can be increased/shrunk).
                Default is 1024.
            ef_construction (int, optional): defines a construction time/accuracy
                trade-off. Default is 200.
            ef (int, optional): The size of the dynamic candidate list. Default is 10.
            M (int, optional): defines the maximum number of outgoing connections
                in the graph. Default is 16.
            allow_replace_deleted (bool, optional): Whether to allow replacing
                deleted elements. Default is True.
            num_threads (int, optional): Number of threads for index construction.
                Default is 1.

        Raises:
            ImportError: If the `docarray` package cannot be imported.
        """
        import_err_msg = """
                `docarray` package not found. Install the package via pip:
                `pip install docarray[hnswlib]`
        """
        try:
            import docarray  # noqa
        except ImportError as err:
            # Chain explicitly so the underlying import failure stays visible.
            raise ImportError(import_err_msg) from err

        self._work_dir = work_dir

        # Restore the reference-document mapping from a previous session, if any.
        ref_docs_path = os.path.join(self._work_dir, "ref_docs.json")
        if os.path.exists(ref_docs_path):
            with open(ref_docs_path, encoding="utf-8") as f:
                self._ref_docs = json.load(f)
        else:
            self._ref_docs = {}

        self._index, self._schema = self._init_index(
            dim=dim,
            dist_metric=dist_metric,
            max_elements=max_elements,
            ef_construction=ef_construction,
            ef=ef,
            M=M,
            allow_replace_deleted=allow_replace_deleted,
            num_threads=num_threads,
        )

    def _init_index(self, **kwargs: Any):  # type: ignore[no-untyped-def]
        """Initializes the HNSW document index.

        Args:
            **kwargs: Keyword arguments for the HNSW index. They are forwarded
                unchanged to ``self._get_schema`` to build the document schema.

        Returns:
            tuple: The HNSW document index and its schema.
        """
        from docarray.index import HnswDocumentIndex

        schema = self._get_schema(**kwargs)
        index = HnswDocumentIndex[schema]  # type: ignore[valid-type]
        return index(work_dir=self._work_dir), schema

    def _find_docs_to_be_removed(self, doc_id: str) -> List[str]:
        """Finds the documents to be removed from the vector store.

        The entry for ``doc_id`` is dropped from the reference-document mapping
        and the mapping is saved. An unknown ``doc_id`` is not an error: an
        empty list is returned and nothing is written to disk.

        Args:
            doc_id (str): Reference document ID that should be removed.

        Returns:
            List[str]: List of document IDs to be removed.
        """
        # pop() with a default avoids the KeyError that an unconditional
        # `del` raises when the reference document was never registered.
        docs = self._ref_docs.pop(doc_id, None)
        if docs is None:
            return []
        self._save_ref_docs()
        return docs

    def _save_ref_docs(self) -> None:
        """Saves reference documents to ``ref_docs.json`` in the working directory."""
        ref_docs_path = os.path.join(self._work_dir, "ref_docs.json")
        with open(ref_docs_path, "w", encoding="utf-8") as f:
            json.dump(self._ref_docs, f)

    def _update_ref_docs(self, docs):  # type: ignore[no-untyped-def]
        """Updates reference documents.

        Each document's ``id`` is appended to the list stored under its
        ``metadata["doc_id"]`` key, and the mapping is saved once at the end.

        Args:
            docs (List): List of documents to update. Each item must expose
                ``metadata["doc_id"]`` and ``id``.
        """
        for doc in docs:
            self._ref_docs.setdefault(doc.metadata["doc_id"], []).append(doc.id)
        self._save_ref_docs()
119
llama-index

Использование cookies