8
"end_time": "2024-03-30T11:18:52.052764Z",
9
"start_time": "2024-03-30T11:18:52.039616Z"
14
"%load_ext autoreload\n",
24
"import pandas as pd\n",
26
"from fastembed import SparseTextEmbedding, TextEmbedding"
30
"cell_type": "markdown",
33
"## Supported Text Embedding Models"
46
" .dataframe tbody tr th:only-of-type {\n",
47
" vertical-align: middle;\n",
50
" .dataframe tbody tr th {\n",
51
" vertical-align: top;\n",
54
" .dataframe thead th {\n",
55
" text-align: right;\n",
58
"<table border=\"1\" class=\"dataframe\">\n",
60
" <tr style=\"text-align: right;\">\n",
64
" <th>description</th>\n",
65
" <th>size_in_GB</th>\n",
71
" <td>BAAI/bge-small-en-v1.5</td>\n",
73
" <td>Fast and Default English model</td>\n",
78
" <td>BAAI/bge-small-zh-v1.5</td>\n",
80
" <td>Fast and recommended Chinese model</td>\n",
85
" <td>sentence-transformers/all-MiniLM-L6-v2</td>\n",
87
" <td>Sentence Transformer model, MiniLM-L6-v2</td>\n",
92
" <td>jinaai/jina-embeddings-v2-small-en</td>\n",
94
" <td>English embedding model supporting 8192 sequen...</td>\n",
99
" <td>BAAI/bge-small-en</td>\n",
101
" <td>Fast English model</td>\n",
106
" <td>BAAI/bge-base-en-v1.5</td>\n",
108
" <td>Base English model, v1.5</td>\n",
113
" <td>sentence-transformers/paraphrase-multilingual-...</td>\n",
115
" <td>Sentence Transformer model, paraphrase-multili...</td>\n",
120
" <td>BAAI/bge-base-en</td>\n",
122
" <td>Base English model</td>\n",
127
" <td>nomic-ai/nomic-embed-text-v1</td>\n",
129
" <td>8192 context length english model</td>\n",
134
" <td>nomic-ai/nomic-embed-text-v1.5</td>\n",
136
" <td>8192 context length english model</td>\n",
141
" <td>jinaai/jina-embeddings-v2-base-en</td>\n",
143
" <td>English embedding model supporting 8192 sequen...</td>\n",
148
" <td>mixedbread-ai/mxbai-embed-large-v1</td>\n",
150
" <td>MixedBread Base sentence embedding model, does...</td>\n",
155
" <td>sentence-transformers/paraphrase-multilingual-...</td>\n",
157
" <td>Sentence-transformers model for tasks like clu...</td>\n",
162
" <td>BAAI/bge-large-en-v1.5</td>\n",
164
" <td>Large English model, v1.5</td>\n",
169
" <td>thenlper/gte-large</td>\n",
171
" <td>Large general text embeddings model</td>\n",
176
" <td>intfloat/multilingual-e5-large</td>\n",
178
" <td>Multilingual model, e5-large. Recommend using ...</td>\n",
187
"0 BAAI/bge-small-en-v1.5 384 \n",
188
"1 BAAI/bge-small-zh-v1.5 512 \n",
189
"2 sentence-transformers/all-MiniLM-L6-v2 384 \n",
190
"3 jinaai/jina-embeddings-v2-small-en 512 \n",
191
"4 BAAI/bge-small-en 384 \n",
192
"5 BAAI/bge-base-en-v1.5 768 \n",
193
"6 sentence-transformers/paraphrase-multilingual-... 384 \n",
194
"7 BAAI/bge-base-en 768 \n",
195
"8 nomic-ai/nomic-embed-text-v1 768 \n",
196
"9 nomic-ai/nomic-embed-text-v1.5 768 \n",
197
"10 jinaai/jina-embeddings-v2-base-en 768 \n",
198
"11 mixedbread-ai/mxbai-embed-large-v1 1024 \n",
199
"12 sentence-transformers/paraphrase-multilingual-... 768 \n",
200
"13 BAAI/bge-large-en-v1.5 1024 \n",
201
"14 thenlper/gte-large 1024 \n",
202
"15 intfloat/multilingual-e5-large 1024 \n",
204
" description size_in_GB \n",
205
"0 Fast and Default English model 0.067 \n",
206
"1 Fast and recommended Chinese model 0.090 \n",
207
"2 Sentence Transformer model, MiniLM-L6-v2 0.090 \n",
208
"3 English embedding model supporting 8192 sequen... 0.120 \n",
209
"4 Fast English model 0.130 \n",
210
"5 Base English model, v1.5 0.210 \n",
211
"6 Sentence Transformer model, paraphrase-multili... 0.220 \n",
212
"7 Base English model 0.420 \n",
213
"8 8192 context length english model 0.520 \n",
214
"9 8192 context length english model 0.520 \n",
215
"10 English embedding model supporting 8192 sequen... 0.520 \n",
216
"11 MixedBread Base sentence embedding model, does... 0.640 \n",
217
"12 Sentence-transformers model for tasks like clu... 1.000 \n",
218
"13 Large English model, v1.5 1.200 \n",
219
"14 Large general text embeddings model 1.200 \n",
220
"15 Multilingual model, e5-large. Recommend using ... 2.240 "
223
"execution_count": 6,
225
"output_type": "execute_result"
229
"supported_models = (\n",
230
" pd.DataFrame(TextEmbedding.list_supported_models())\n",
231
" .sort_values(\"size_in_GB\")\n",
232
" .drop(columns=\"sources\")\n",
233
" .reset_index(drop=True)\n",
239
"cell_type": "markdown",
242
"## Supported Sparse Text Embedding Models"
247
"execution_count": 7,
250
"end_time": "2024-03-30T11:19:01.564291Z",
251
"start_time": "2024-03-30T11:19:01.538768Z"
260
" .dataframe tbody tr th:only-of-type {\n",
261
" vertical-align: middle;\n",
264
" .dataframe tbody tr th {\n",
265
" vertical-align: top;\n",
268
" .dataframe thead th {\n",
269
" text-align: right;\n",
272
"<table border=\"1\" class=\"dataframe\">\n",
274
" <tr style=\"text-align: right;\">\n",
277
" <th>vocab_size</th>\n",
278
" <th>description</th>\n",
279
" <th>size_in_GB</th>\n",
280
" <th>sources</th>\n",
286
" <td>prithvida/Splade_PP_en_v1</td>\n",
288
" <td>Misspelled version of the model. Retained for ...</td>\n",
290
" <td>{'hf': 'Qdrant/SPLADE_PP_en_v1'}</td>\n",
294
" <td>prithivida/Splade_PP_en_v1</td>\n",
296
" <td>Independent Implementation of SPLADE++ Model f...</td>\n",
298
" <td>{'hf': 'Qdrant/SPLADE_PP_en_v1'}</td>\n",
305
" model vocab_size \\\n",
306
"0 prithvida/Splade_PP_en_v1 30522 \n",
307
"1 prithivida/Splade_PP_en_v1 30522 \n",
309
" description size_in_GB \\\n",
310
"0 Misspelled version of the model. Retained for ... 0.532 \n",
311
"1 Independent Implementation of SPLADE++ Model f... 0.532 \n",
314
"0 {'hf': 'Qdrant/SPLADE_PP_en_v1'} \n",
315
"1 {'hf': 'Qdrant/SPLADE_PP_en_v1'} "
318
"execution_count": 7,
320
"output_type": "execute_result"
324
"pd.DataFrame(SparseTextEmbedding.list_supported_models())"
330
"display_name": "Python 3.8.18 ('base')",
331
"language": "python",
339
"file_extension": ".py",
340
"mimetype": "text/x-python",
342
"nbconvert_exporter": "python",
343
"pygments_lexer": "ipython3",
349
"hash": "c4a27af61e455bc18dcf16f5867a2ff0402fa12b01dd0f6ce3a79ae73ad15e91"