# *** FAST START to create vector embeddings from documents ***
# docs2vecs_with_milvus-contracts - parses, text chunks, and embeds legal contracts
# the sample documents (~80 legal template contracts) can be pulled down from a public S3 repo with the command:
#     sample_files_path = Setup().load_sample_files()
# note: this example requires Milvus + MongoDB - please see the README instructions to install
13
from llmware.library import Library
14
from llmware.retrieval import Query
15
from llmware.setup import Setup
16
from llmware.status import Status
19
def parse_and_generate_vector_embeddings(library_name):
    """Build a library from the sample contracts, embed it, and run a demo query.

    The function walks through five steps, printing progress along the way:
    create the library, download the sample files, parse the
    "AgreementsLarge" folder into the library, install vector embeddings,
    and finally run one semantic query and print each hit.

    Args:
        library_name: Name of the library to create; also used to look up
            the embedding status once embedding has finished.

    Returns:
        None. All results are written to stdout.
    """
    # Imported locally so the function does not depend on `os` being
    # imported at module level.
    import os

    # Step 0 - Configuration - we will use these in Step 4 to install the embeddings
    embedding_model = "industry-bert-contracts"
    # This example targets Milvus (see the file header); previously this name
    # was used below without ever being defined.
    vector_db = "milvus"

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    # --note: if you need to refresh the sample files, set 'over_write=True'
    print("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point ".add_files" method to the folder of documents that was just created
    # this method parses all of the documents, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    library.add_files(input_folder_path=os.path.join(sample_files_path, "AgreementsLarge"))

    # Step 4 - Install the embeddings
    print("\nupdate: Step 4 - Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model))

    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)

    # note: for using llmware as part of a larger application, you can check the real-time status by polling Status()
    # --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
    update = Status().get_embedding_status(library_name, embedding_model)
    print("update: Embeddings Complete - Status() check at end of embedding - ", update)

    # Step 5 - start using the new vector embeddings with Query
    sample_query = "incentive compensation"
    print("\n\nupdate: Step 5 - Query: {}".format(sample_query))

    query_results = Query(library).semantic_query(sample_query, result_count=20)

    for i, entries in enumerate(query_results):

        # each query result is a dictionary; these are the keys read for display
        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        # for display purposes only, show at most the first 125 characters of the text
        if len(text) > 125:
            text = text[0:125] + " ... "

        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
              .format(i, document_source, page_num, vector_distance))

        print("update: text sample - ", text)
76
if __name__ == "__main__":

    # the library name is arbitrary - choose whatever label suits your project
    chosen_library_name = "contracts"

    parse_and_generate_vector_embeddings(chosen_library_name)