"""This example shows how to use sentence transformers as a vector embedding model with llmware"""

"""Note: this example illustrates capability from llmware==0.1.13 - please update pip install, or pull from repo"""
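
#   note: as written, the script captures parsed text chunks in MongoDB and builds vectors in Milvus
#   (vector_db="milvus" below), so both services need to be running and reachable before executing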


import os

from llmware.setup import Setup
from llmware.library import Library
from llmware.retrieval import Query
from llmware.models import ModelCatalog


def build_lib(library_name, folder="Agreements"):

    # Step 1 - Create library, which is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    #   --note: if you need to refresh the sample files, set 'over_write=True'
    print("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point the ".add_files" method to the folder of documents that was just created
    #   this method parses the documents, text-chunks them, and captures the results in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    #   options:   Agreements | UN-Resolutions-500
    library.add_files(input_folder_path=os.path.join(sample_files_path, folder))

    return library


# start script

print("update: Step 1 - starting here - building library - parsing PDFs into text chunks")

lib = build_lib("st_embedding_0_454")
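
#   optional - to build from the other sample folder listed above instead, e.g. (library name is illustrative):
#   lib = build_lib("st_embedding_un_500", folder="UN-Resolutions-500")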

#   register a model from the sentence transformers library/repository

#   note: "all-MiniLM-L6-v2" is from the SentenceTransformer catalog, e.g.,
#       -- https://www.sbert.net/docs/pretrained_models.html
#       -- key inputs to register:
#           -- "model_name" - should be an existing pre-trained model in the SentenceTransformer catalog
#           -- "embedding_dims" - the output dimensions of the model, included in the sbert model card info
#           -- "context_window" - included in the sbert model card info
#           -- *** "model_location" - "st_repo" is a reserved word that tells llmware to look in sentence transformers ***
#           -- *** "model_family" - "LLMWareSemanticModel" - knows how to load and embed with sentence transformers ***

#   another sentence transformer to try:  "all-mpnet-base-v2" - embedding_dims = 768 - context_window = 384

sentence_transformer_pretrained_model_name = "all-MiniLM-L6-v2"
embedding_dims = 384
context_window = 256

ModelCatalog().register_sentence_transformer_model(model_name=sentence_transformer_pretrained_model_name,
                                                   embedding_dims=embedding_dims, context_window=context_window)
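
#   optional - hypothetical registration of the alternative model noted above, using the values from
#   that comment (all-mpnet-base-v2 -> embedding_dims=768, context_window=384):
#   ModelCatalog().register_sentence_transformer_model(model_name="all-mpnet-base-v2",
#                                                      embedding_dims=768, context_window=384)

#   for reference, the commented-out block below shows the same model card fields passed directly
#   through the generic .add_model_list() interface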

"""
ModelCatalog().add_model_list({"model_name": sentence_transformer_pretrained_model_name,
                                "embedding_dims": embedding_dims,
                                "context_window": context_window,
                                "model_category": "embedding",
                                "model_family": "LLMWareSemanticModel",
                                "display_name": "MySentenceTransformer", "model_location": "st_repo"})
"""

# to confirm that the model has been added to the catalog
mc = ModelCatalog().list_all_models()
model_card = ModelCatalog().lookup_model_card(sentence_transformer_pretrained_model_name)
print("update: model card - ", model_card)
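
#   optional - sanity check that the catalog entry carries the dimensions registered above
#   (assumes the model card dict exposes the "embedding_dims" field, as in the block above)
#   assert model_card.get("embedding_dims") == embedding_dims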

# use directly now as an embedding model
lib.install_new_embedding(embedding_model_name=sentence_transformer_pretrained_model_name,
                          vector_db="milvus", batch_size=300)
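
#   note: "milvus" is used as the vector db here - depending on your llmware version and install,
#   other vector_db values (e.g., "faiss") may be available - check the llmware docs for what your
#   version supports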

#   optional - check the status of the library card and embedding
lib_card = lib.get_library_card()
print("update: -- after embedding process - check updated library card - ", lib_card)

#   create query object (note: including embedding_model_name is optional - only needed if there are multiple embeddings on the library)
query_st = Query(lib, embedding_model_name=sentence_transformer_pretrained_model_name)

#   run a semantic query using query_st
my_search_results = query_st.semantic_query("What is the sale bonus?", result_count=24)

for i, qr in enumerate(my_search_results):
    print("update: semantic query results: ", i, qr)
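
#   optional - hypothetical sketch of pulling selected fields out of each result dictionary
#   (the key names below are assumptions - print a full result above to confirm the exact keys)
#   for i, qr in enumerate(my_search_results):
#       print("update: result - ", i, qr.get("file_source"), qr.get("page_num"), qr.get("distance"))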

# if you want to delete the embedding - uncomment the line below, passing in the model_name and vector_db
# lib.delete_installed_embedding(sentence_transformer_pretrained_model_name, "milvus")

#   optional - check the embeddings on the library
emb_record = lib.get_embedding_status()
for j, entries in enumerate(emb_record):
    print("update: embeddings on library: ", j, entries)
