"""This example shows how to easily create multiple embeddings over the same library with llmware.

This recipe can be especially useful when trying to compare the effectiveness of a particular
embedding model for a specific domain or library corpus, and to run other comparative experiments
without being 'locked-in' to a particular model.

Note: the example uses four different embedding models:

    1. mini-lm-sbert - a favorite small, fast Sentence Transformer included in the llmware model catalog by default
    2. text-embedding-ada-002 - the popular OpenAI embedding model
    3. industry-bert-sec - an industry fine-tuned embedding model, in the llmware model catalog
    4. all-mpnet-base-v2 - one of the most popular Sentence Transformers (which we will register and add to the
       model catalog on the fly)

To use OpenAI Ada will require an OpenAI API key - if you do not have one, feel free to comment out or
select a different model. Any Sentence Transformer or Huggingface embedding model can be used.
"""
import os

from llmware.library import Library
from llmware.models import ModelCatalog
from llmware.retrieval import Query
from llmware.setup import Setup
# Insert your OpenAI API key here - required only for the text-embedding-ada-002
# embedding (Embedding #2 below); comment it out if not using OpenAI
os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<INSERT YOUR OPEN API KEY HERE>"

# Silence the HuggingFace tokenizers fork/parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"
def build_lib(library_name, folder="Agreements"):
    """Create a new library, pull the llmware sample files, and parse/index the
    selected folder of documents into the library.

    Args:
        library_name: name of the new library to create.
        folder: sub-folder of the llmware sample files to ingest
            (default "Agreements").

    Returns:
        The populated Library object, so callers can install embeddings and
        run queries against it.
    """

    print("\nupdate: creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # pull down the standard llmware sample files (cached locally after the
    # first download - over_write=False re-uses the local copy)
    print("update: downloading sample files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # parse the documents into text chunks and build the text index
    print("update: parsing and text indexing files")

    library.add_files(input_folder_path=os.path.join(sample_files_path, folder))

    # BUG FIX: the original did not return the library, so the caller's
    # `lib = build_lib(...)` received None and crashed on the first method call
    return library
def multiple_embeddings_same_db_same_lib(document_folder=None, sample_query=None, vector_db=None, base_library_name=None):
    """Build one library, install four different embedding models over it in the
    same vector database, then run the same semantic query against two of the
    embedding spaces so their results can be compared side-by-side.

    Args:
        document_folder: sample-file sub-folder to parse into the library.
        sample_query: natural-language query used to compare the embeddings.
        vector_db: name of the vector database that stores all four embeddings.
        base_library_name: name for the newly-created library.
    """

    print("\nupdate: Step 1- starting here- building library- parsing PDFs into text chunks")

    lib = build_lib(base_library_name, folder=document_folder)

    lib_card = lib.get_library_card()
    print("update: library card - ", lib_card)

    print("\nupdate: Step 2 - starting to install embeddings")

    # Embedding #1 - small, fast Sentence Transformer in the catalog by default
    print(f"\nupdate: Embedding #1 - mini-lm-sbert - {vector_db}")
    lib.install_new_embedding(embedding_model_name="mini-lm-sbert", vector_db=vector_db, batch_size=200)

    # Embedding #2 - OpenAI model; requires USER_MANAGED_OPENAI_API_KEY to be set
    print(f"\nupdate: Embedding #2 - text-embedding-ada-002 - {vector_db}")
    lib.install_new_embedding(embedding_model_name="text-embedding-ada-002", vector_db=vector_db, batch_size=500)

    # Embedding #3 - industry fine-tuned BERT model from the llmware catalog
    print(f"\nupdate: Embedding #3 - industry-bert-sec - {vector_db}")
    lib.install_new_embedding(embedding_model_name="industry-bert-sec", vector_db=vector_db, batch_size=100)

    # Embedding #4 - register a new Sentence Transformer in the catalog on the fly,
    # then install it just like the built-in models
    ModelCatalog().register_sentence_transformer_model(model_name="all-mpnet-base-v2",
                                                       embedding_dims=768, context_window=384)

    print(f"\nupdate: Embedding #4 - all-mpnet-base-v2 - {vector_db}")
    lib.install_new_embedding(embedding_model_name="all-mpnet-base-v2", vector_db=vector_db, batch_size=300)

    # the embedding record keeps one entry per installed embedding on the library
    print("\nupdate: Embedding record of the Library")

    emb_record = lib.get_embedding_status()
    for j, entries in enumerate(emb_record):
        print("update: embeddings on library: ", j, entries)

    # run the same semantic query against two different embedding spaces
    query1 = Query(lib, embedding_model_name="mini-lm-sbert")
    query2 = Query(lib, embedding_model_name="text-embedding-ada-002")

    minilm_results = query1.semantic_query(sample_query, result_count=12)
    ada_results = query2.semantic_query(sample_query, result_count=12)

    print("\n\nupdate: Sample Query using Embeddings")

    print("\nupdate: Embedding Model # 1 - MiniLM SBERT Results")
    for i, qr1 in enumerate(minilm_results):
        print("update: minilm semantic query results: ", i, qr1["distance"], qr1)

    print("\nupdate: Embedding Model # 2- Ada Results")
    for j, qr2 in enumerate(ada_results):
        print("update: ada semantic query results: ", j, qr2["distance"], qr2)
if __name__ == "__main__":

    # BUG FIX: the original never passed vector_db, so None propagated into
    # every install_new_embedding call. Select any vector db supported by
    # llmware (e.g. "milvus", "pg_vector", "faiss") that is running locally.
    db = "milvus"

    multiple_embeddings_same_db_same_lib(document_folder="Agreements",
                                         sample_query="what is the sale bonus?",
                                         vector_db=db,
                                         base_library_name="multi_embeddings_test_lib_0")