# fastembed: Supported_Models.ipynb
```python
%load_ext autoreload
%autoreload 2
```

```python
import pandas as pd

from fastembed import SparseTextEmbedding, TextEmbedding
```
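Both `TextEmbedding` and `SparseTextEmbedding` expose the `list_supported_models()` classmethod used below. As a quick sketch (not part of the original notebook), the listing is a plain list of dicts, so it can be inspected or filtered without downloading any model weights; the dict keys referenced here (`model`, `dim`, `size_in_GB`) are the column names that appear in the tables that follow.

```python
# Sketch only: inspect the model registry without instantiating a model.
# Key names ("model", "dim", "size_in_GB") match the tables below.
from fastembed import TextEmbedding

models = TextEmbedding.list_supported_models()
print(models[0]["model"], models[0]["dim"])

# Filter to the smallest checkpoints, e.g. anything under ~0.1 GB.
small = [m["model"] for m in models if m["size_in_GB"] < 0.1]
print(small)
```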
## Supported Text Embedding Models
```python
supported_models = (
    pd.DataFrame(TextEmbedding.list_supported_models())
    .sort_values("size_in_GB")
    .drop(columns="sources")
    .reset_index(drop=True)
)
supported_models
```

|    | model | dim | description | size_in_GB |
|----|-------|-----|-------------|------------|
| 0  | BAAI/bge-small-en-v1.5 | 384 | Fast and Default English model | 0.067 |
| 1  | BAAI/bge-small-zh-v1.5 | 512 | Fast and recommended Chinese model | 0.090 |
| 2  | sentence-transformers/all-MiniLM-L6-v2 | 384 | Sentence Transformer model, MiniLM-L6-v2 | 0.090 |
| 3  | jinaai/jina-embeddings-v2-small-en | 512 | English embedding model supporting 8192 sequen... | 0.120 |
| 4  | BAAI/bge-small-en | 384 | Fast English model | 0.130 |
| 5  | BAAI/bge-base-en-v1.5 | 768 | Base English model, v1.5 | 0.210 |
| 6  | sentence-transformers/paraphrase-multilingual-... | 384 | Sentence Transformer model, paraphrase-multili... | 0.220 |
| 7  | BAAI/bge-base-en | 768 | Base English model | 0.420 |
| 8  | nomic-ai/nomic-embed-text-v1 | 768 | 8192 context length english model | 0.520 |
| 9  | nomic-ai/nomic-embed-text-v1.5 | 768 | 8192 context length english model | 0.520 |
| 10 | jinaai/jina-embeddings-v2-base-en | 768 | English embedding model supporting 8192 sequen... | 0.520 |
| 11 | mixedbread-ai/mxbai-embed-large-v1 | 1024 | MixedBread Base sentence embedding model, does... | 0.640 |
| 12 | sentence-transformers/paraphrase-multilingual-... | 768 | Sentence-transformers model for tasks like clu... | 1.000 |
| 13 | BAAI/bge-large-en-v1.5 | 1024 | Large English model, v1.5 | 1.200 |
| 14 | thenlper/gte-large | 1024 | Large general text embeddings model | 1.200 |
| 15 | intfloat/multilingual-e5-large | 1024 | Multilingual model, e5-large. Recommend using ... | 2.240 |
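Any entry from the `model` column can be passed to `TextEmbedding`. The sketch below (not part of the original notebook) uses the default `BAAI/bge-small-en-v1.5` model from the first row; it assumes fastembed's `TextEmbedding(model_name=...)` constructor and its `.embed()` generator, and the model weights are downloaded on first use.

```python
# Minimal usage sketch, assuming TextEmbedding(model_name=...) and the
# .embed() generator API; weights are fetched on first use.
from fastembed import TextEmbedding

documents = [
    "FastEmbed is a lightweight embedding library.",
    "The table above lists its supported dense models.",
]

dense_model = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
embeddings = list(dense_model.embed(documents))  # one numpy vector per document

print(len(embeddings), embeddings[0].shape)  # expected: 2 (384,)
```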
## Supported Sparse Text Embedding Models
```python
pd.DataFrame(SparseTextEmbedding.list_supported_models())
```

|   | model | vocab_size | description | size_in_GB | sources |
|---|-------|------------|-------------|------------|---------|
| 0 | prithvida/Splade_PP_en_v1 | 30522 | Misspelled version of the model. Retained for ... | 0.532 | {'hf': 'Qdrant/SPLADE_PP_en_v1'} |
| 1 | prithivida/Splade_PP_en_v1 | 30522 | Independent Implementation of SPLADE++ Model f... | 0.532 | {'hf': 'Qdrant/SPLADE_PP_en_v1'} |
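Both rows resolve to the same `Qdrant/SPLADE_PP_en_v1` source; `prithvida/Splade_PP_en_v1` is the misspelled variant kept around, so the correctly spelled `prithivida/Splade_PP_en_v1` is the one to prefer. A usage sketch (not part of the original notebook), assuming `SparseTextEmbedding(model_name=...)` and that `.embed()` yields sparse embeddings exposing `indices` and `values` arrays:

```python
# Usage sketch, assuming SparseTextEmbedding(model_name=...) and that
# .embed() yields sparse embeddings with `indices` and `values` arrays.
from fastembed import SparseTextEmbedding

sparse_model = SparseTextEmbedding(model_name="prithivida/Splade_PP_en_v1")

(sparse_embedding,) = list(
    sparse_model.embed(["Sparse vectors keep only the non-zero token weights."])
)
print(sparse_embedding.indices[:5])  # token ids with non-zero weight
print(sparse_embedding.values[:5])   # the corresponding weights
```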
