llama-index

Форк
0
204 строки · 6.2 Кб
1
"""Faiss Vector store index.
2

3
An index that is built on top of an existing vector store.
4

5
"""
6

7
import logging
8
import os
9
from typing import Any, List, Optional, cast
10

11
import fsspec
12
import numpy as np
13
from fsspec.implementations.local import LocalFileSystem
14

15
from llama_index.legacy.bridge.pydantic import PrivateAttr
16
from llama_index.legacy.schema import BaseNode
17
from llama_index.legacy.vector_stores.simple import DEFAULT_VECTOR_STORE, NAMESPACE_SEP
18
from llama_index.legacy.vector_stores.types import (
19
    DEFAULT_PERSIST_DIR,
20
    DEFAULT_PERSIST_FNAME,
21
    BasePydanticVectorStore,
22
    VectorStoreQuery,
23
    VectorStoreQueryResult,
24
)
25

26
# Module-scoped logger: records are attributed to this module's name instead
# of being emitted directly on the root logger.
logger = logging.getLogger(__name__)
27

28
# Default target of `FaissVectorStore.persist`: the namespaced vector-store
# file name placed inside the default persist directory.
DEFAULT_PERSIST_PATH = os.path.join(
    DEFAULT_PERSIST_DIR,
    "{}{}{}".format(DEFAULT_VECTOR_STORE, NAMESPACE_SEP, DEFAULT_PERSIST_FNAME),
)
31

32

33
class FaissVectorStore(BasePydanticVectorStore):
    """Faiss Vector Store.

    Embeddings are stored within a Faiss index.

    During query time, the index uses Faiss to query for the top
    k embeddings, and returns the corresponding indices.

    Only vectors are handed to Faiss; node text is not stored
    (``stores_text`` is ``False``). The ids returned by ``add`` and ``query``
    are stringified integer positions within the Faiss index.

    Args:
        faiss_index (faiss.Index): Faiss index instance

    """

    # Node text is not kept in this store.
    stores_text: bool = False

    # Wrapped Faiss index, held as a private (non-field) attribute.
    _faiss_index = PrivateAttr()

    def __init__(
        self,
        faiss_index: Any,
    ) -> None:
        """Initialize params.

        Args:
            faiss_index (faiss.Index): Faiss index instance to wrap.

        Raises:
            ImportError: If the ``faiss`` package cannot be imported.

        """
        import_err_msg = """
            `faiss` package not found. For instructions on
            how to install `faiss` please visit
            https://github.com/facebookresearch/faiss/wiki/Installing-Faiss
        """
        try:
            import faiss
        except ImportError as err:
            # Keep the original failure attached as the cause.
            raise ImportError(import_err_msg) from err

        # Initialise the pydantic base before touching private attributes so
        # the assignment does not depend on the base model's init internals.
        super().__init__()

        self._faiss_index = cast(faiss.Index, faiss_index)

    @classmethod
    def from_persist_dir(
        cls,
        persist_dir: str = DEFAULT_PERSIST_DIR,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "FaissVectorStore":
        """Load a store from the default file name inside ``persist_dir``.

        Args:
            persist_dir (str): Directory containing the persisted index.
            fs (Optional[fsspec.AbstractFileSystem]): Must be ``None`` or a
                local filesystem.

        Raises:
            NotImplementedError: If ``fs`` is a non-local filesystem.
            ValueError: If no persisted index exists at the resolved path.

        """
        persist_path = os.path.join(
            persist_dir,
            f"{DEFAULT_VECTOR_STORE}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}",
        )
        # only support local storage for now
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("FAISS only supports local storage for now.")
        # Forward the (already validated) filesystem instead of dropping it.
        return cls.from_persist_path(persist_path=persist_path, fs=fs)

    @classmethod
    def from_persist_path(
        cls,
        persist_path: str,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "FaissVectorStore":
        """Load a store from a Faiss index file on local disk.

        Args:
            persist_path (str): Path of the index file to read.
            fs (Optional[fsspec.AbstractFileSystem]): Must be ``None`` or a
                local filesystem.

        Raises:
            NotImplementedError: If ``fs`` is a non-local filesystem.
            ValueError: If ``persist_path`` does not exist.

        """
        import faiss

        # I don't think FAISS supports fsspec, it requires a path in the SWIG interface
        # TODO: copy to a temp file and load into memory from there
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("FAISS only supports local storage for now.")

        if not os.path.exists(persist_path):
            raise ValueError(f"No existing {__name__} found at {persist_path}.")

        # Lazy %-formatting: the message is only built if the record is emitted.
        logger.info("Loading %s from %s.", __name__, persist_path)
        faiss_index = faiss.read_index(persist_path)
        return cls(faiss_index=faiss_index)

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """Add nodes to index.

        NOTE: in the Faiss vector store, we do not store text in Faiss.

        All embeddings are collected first and handed to Faiss in a single
        call, so nothing is added if any node fails to yield an embedding.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            List[str]: One id per node, in input order. Ids are consecutive
            integers (as strings) starting at the index's ``ntotal`` before
            the insertion.

        """
        if not nodes:
            return []

        # Shape (len(nodes), dim): one row per node.
        embeddings_np = np.array(
            [node.get_embedding() for node in nodes], dtype="float32"
        )
        first_id = self._faiss_index.ntotal
        self._faiss_index.add(embeddings_np)
        return [str(first_id + offset) for offset in range(len(nodes))]

    @property
    def client(self) -> Any:
        """Return the faiss index."""
        return self._faiss_index

    def persist(
        self,
        persist_path: str = DEFAULT_PERSIST_PATH,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """Save to file.

        This method saves the vector store to disk, creating the parent
        directory if needed.

        Args:
            persist_path (str): The save_path of the file.
            fs (Optional[fsspec.AbstractFileSystem]): Must be ``None`` or a
                local filesystem.

        Raises:
            NotImplementedError: If ``fs`` is a non-local filesystem.

        """
        # I don't think FAISS supports fsspec, it requires a path in the SWIG interface
        # TODO: write to a temporary file and then copy to the final destination
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("FAISS only supports local storage for now.")
        import faiss

        dirpath = os.path.dirname(persist_path)
        # A bare file name has an empty dirname, and os.makedirs("") raises;
        # exist_ok avoids a failure if the directory appears concurrently.
        if dirpath:
            os.makedirs(dirpath, exist_ok=True)

        faiss.write_index(self._faiss_index, persist_path)

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Not supported by this store; always raises.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        Raises:
            NotImplementedError: Always.

        """
        raise NotImplementedError("Delete not yet implemented for Faiss index.")

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): The query. ``query.query_embedding`` is
                the vector to search for and ``query.similarity_top_k`` the
                number of neighbours requested; ``query.filters`` must be
                ``None``.

        Returns:
            VectorStoreQueryResult: ``similarities`` holds the values Faiss
            returned for each hit and ``ids`` the matching index positions
            as strings. Entries with a negative index are dropped.

        Raises:
            ValueError: If metadata filters are supplied or the query has no
                embedding.

        """
        if query.filters is not None:
            raise ValueError("Metadata filters not implemented for Faiss yet.")
        if query.query_embedding is None:
            # Fail with a clear message instead of an opaque numpy IndexError.
            raise ValueError("A query embedding is required to query Faiss.")

        query_embedding = cast(List[float], query.query_embedding)
        query_embedding_np = np.array(query_embedding, dtype="float32")[np.newaxis, :]
        dists, indices = self._faiss_index.search(
            query_embedding_np, query.similarity_top_k
        )
        # if empty, then return an empty response (checked before indexing
        # into the first row so an empty result cannot raise)
        if len(indices) == 0 or len(dists) == 0:
            return VectorStoreQueryResult(similarities=[], ids=[])

        # returned dimension is 1 x k; negative indices mark slots for which
        # Faiss found no neighbour and are skipped
        hits = [
            (float(dist), str(idx))
            for dist, idx in zip(dists[0], indices[0])
            if idx >= 0
        ]

        return VectorStoreQueryResult(
            similarities=[dist for dist, _ in hits],
            ids=[node_id for _, node_id in hits],
        )
205

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.