llmware / using_multiple_embeddings.py
"""This example shows how to easily create multiple embeddings over the same library with llmware.

    This recipe can be especially useful when comparing the effectiveness of a particular
    embedding model for a specific domain or library corpus, and when running other comparative experiments
    without being 'locked in' to a particular model.

    Note: the example uses four different embedding models:

        1.  mini-lm-sbert - a favorite small, fast Sentence Transformer included in the llmware model catalog by default
        2.  text-embedding-ada-002 - the popular OpenAI embedding model
        3.  industry-bert-sec - an industry fine-tuned embedding model, included in the llmware model catalog
        4.  all-mpnet-base-v2 - one of the most popular Sentence Transformers (which we will register and add to the
            model catalog on the fly)

        Using the OpenAI Ada model requires an OpenAI API key - if you do not have one, feel free to comment it out or
        select a different model.  Any Sentence Transformer or Hugging Face embedding model can be used.

"""

import os

from llmware.setup import Setup
from llmware.library import Library
from llmware.retrieval import Query
from llmware.models import ModelCatalog

os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<INSERT YOUR OPENAI API KEY HERE>"

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid a HuggingFace tokenizer warning


#   Note:  this will build a small library that will be used in the embedding examples
def build_lib(library_name, folder="Agreements"):

    # Step 1 - create the library, which is the main 'organizing construct' in llmware
    print("\nupdate: creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - pull down the sample files from S3 through the .load_sample_files() command
    #   --note: if you need to refresh the sample files, set 'over_write=True'
    print("update: downloading sample files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point the ".add_files" method at the folder of documents that was just downloaded
    #   this method parses the documents, chunks the text, and captures the text chunks in MongoDB
    print("update: parsing and text indexing files")

    #   options:   Agreements | UN-Resolutions-500
    library.add_files(input_folder_path=os.path.join(sample_files_path, folder))

    return library
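
#   Hypothetical usage sketch (not executed in this script): the same builder can point at the other
#   sample folder, e.g.
#       un_lib = build_lib("un_resolutions_lib_0", folder="UN-Resolutions-500")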


#   use multiple embedding models on the same library and the same vector db

def multiple_embeddings_same_db_same_lib(document_folder=None, sample_query=None, vector_db=None, base_library_name=None):

    print("\nupdate: Step 1 - starting here - building library - parsing PDFs into text chunks")

    lib = build_lib(base_library_name, folder=document_folder)

    # optional - check the status of the library card and embedding
    lib_card = lib.get_library_card()
    print("update: library card - ", lib_card)

    print("\nupdate: Step 2 - starting to install embeddings")

    #   alt embedding models - "mini-lm-sbert" | "industry-bert-contracts" | "text-embedding-ada-002"
    #   note: if you want to use text-embedding-ada-002, you will need an OpenAI key set in the os.environ variable
    #   e.g., os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<insert your key>"

    #   Note: batch size can be configured based on the memory of the machine and optimized for performance
    #   -- generally, between 100-500 is a safe range to balance performance and memory

    print(f"\nupdate: Embedding #1 - mini-lm-sbert - {vector_db}")
    lib.install_new_embedding(embedding_model_name="mini-lm-sbert", vector_db=vector_db, batch_size=200)

    print(f"\nupdate: Embedding #2 - text-embedding-ada-002 - {vector_db}")
    lib.install_new_embedding(embedding_model_name="text-embedding-ada-002", vector_db=vector_db, batch_size=500)

    print(f"\nupdate: Embedding #3 - industry-bert-sec - {vector_db}")
    lib.install_new_embedding(embedding_model_name="industry-bert-sec", vector_db=vector_db, batch_size=100)

    # for the last embedding, we will register a popular open source sentence transformer model to use
    #   -- see "using_sentence_transformer.py" for more details

    ModelCatalog().register_sentence_transformer_model(model_name="all-mpnet-base-v2",
                                                       embedding_dims=768, context_window=384)

    # use directly now as an embedding model
    print(f"\nupdate: Embedding #4 - all-mpnet-base-v2 - {vector_db}")
    lib.install_new_embedding(embedding_model_name="all-mpnet-base-v2", vector_db=vector_db, batch_size=300)

    #   optional - check the embeddings on the library
    print("\nupdate: Embedding record of the Library")

    emb_record = lib.get_embedding_status()
    for j, entries in enumerate(emb_record):
        print("update: embeddings on library: ", j, entries)
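
    #   Optional sketch (assumption, not from the original recipe: each entry in the embedding record is a
    #   dict carrying an "embedding_model" key that identifies the model) - filter the record for one model:
    ada_record = [e for e in emb_record if isinstance(e, dict) and e.get("embedding_model") == "text-embedding-ada-002"]
    print("update: embedding record entries for text-embedding-ada-002: ", ada_record)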

    #   Using the Embeddings to Execute Queries
    #
    #   create the query object:
    #   1.  if no embedding_model or vector_db is passed in the constructor, then it selects the LAST embedding
    #        record, which is the most recent embedding on the library, and uses that combination of model + vector db
    #
    #   2.  if only embedding_model_name is passed, then it looks up the first instance of that embedding model
    #       in the embedding record, and will use the associated vector db
    #
    #   3.  if both embedding_model_name and vector_db are passed in the constructor, then it looks up that combo in
    #        the embedding record
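    #
    #   as an illustrative sketch of option #3 (commented out - it assumes the same constructor arguments
    #   described above), you could pin both the model and the vector db explicitly:
    #   query3 = Query(lib, embedding_model_name="industry-bert-sec", vector_db=vector_db)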

    query1 = Query(lib, embedding_model_name="mini-lm-sbert")
    query2 = Query(lib, embedding_model_name="text-embedding-ada-002")

    #   execute the same semantic query against each of the query objects
    minilm_results = query1.semantic_query(sample_query, result_count=12)
    ada_results = query2.semantic_query(sample_query, result_count=12)

    print("\n\nupdate: Sample Query using Embeddings")

    print("\nupdate: Embedding Model # 1 - MiniLM SBERT Results")
    for i, qr1 in enumerate(minilm_results):
        print("update: minilm semantic query results: ", i, qr1["distance"], qr1)

    print("\nupdate: Embedding Model # 2 - Ada Results")
    for j, qr2 in enumerate(ada_results):
        print("update: ada semantic query results: ", j, qr2["distance"], qr2)
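
    #   Optional comparison sketch (not part of the original recipe): the result dicts above already expose
    #   a "distance" key, so a quick side-by-side look at the top hit from each model is straightforward
    if minilm_results and ada_results:
        print("\nupdate: top hit distance - mini-lm-sbert: ", minilm_results[0]["distance"],
              " | text-embedding-ada-002: ", ada_results[0]["distance"])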

    return 0


if __name__ == "__main__":

    #   document folder options:  Agreements | UN-Resolutions-500
    #   note: Agreements = ~15 contracts = ~1272 embeddings - takes ~5 minutes to run (without GPU)
    #   note: UN-Resolutions-500 = 500 documents = ~12500 embeddings - takes ~15-20 minutes to run (without GPU)
    #       -- good sample query for UN-Resolutions, e.g., "what are key initiatives to promote sustainability?"
    #
    #   try substituting a different vector db, e.g., "pg_vector" | "redis" | "faiss"

    multiple_embeddings_same_db_same_lib(document_folder="Agreements",
                                         sample_query="what is the sale bonus?",
                                         vector_db="milvus",
                                         base_library_name="multi_embeddings_test_lib_0")