# *** FAST START to create vector embeddings from documents ***

# docs2vecs_with_milvus-un_resolutions - parses, text-chunks, and embeds 500 United Nations (UN) Resolutions
# the sample documents (500 PDFs - 2-15 pages each) can be pulled down from a public S3 repo with the command:
#       sample_files_path = Setup().load_sample_files()

# note: the example assumes that you have installed Milvus and MongoDB per the separate instructions in the README
14
import os

from llmware.library import Library
from llmware.retrieval import Query
from llmware.setup import Setup
from llmware.status import Status
20
def parse_and_generate_vector_embeddings(library_name):
    """Create a library from the sample UN resolutions, embed it, and run a sample query.

    The function walks through the whole pipeline in order:
    create the library, download the sample files, parse the
    "UN-Resolutions-500" folder into the library, install embeddings with the
    configured model and vector database, print the embedding status, and
    finally print the results of one semantic query.

    Args:
        library_name: Name handed to ``Library().create_new_library`` and used
            again when looking up the embedding status.

    Returns:
        None. All results are reported through ``print``.
    """

    # Step 0 - Configuration - we will use these in Step 4 to install the embeddings
    embedding_model = "mini-lm-sbert"
    # the vector db must be defined here: Step 4 both prints it and passes it to
    # install_new_embedding (it was previously referenced without being assigned)
    vector_db = "milvus"

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print ("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    #   --note: if you need to refresh the sample files, set 'over_write=True'
    print ("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point ".add_files" method to the folder of documents that was just created
    #   this method parses all of the documents, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    library.add_files(input_folder_path=os.path.join(sample_files_path, "UN-Resolutions-500"))

    # Step 4 - Install the embeddings
    print("\nupdate: Step 4 - Generating Embeddings in {} db - with Model- {}".format(vector_db, embedding_model))

    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)

    # note: for using llmware as part of a larger application, you can check the real-time status by polling Status()
    #   --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
    update = Status().get_embedding_status(library_name, embedding_model)
    print("update: Embeddings Complete - Status() check at end of embedding - ", update)

    # Step 5 - start using the new vector embeddings with Query
    sample_query = "sustainability issues impacting women"
    print("\n\nupdate: Step 5 - Query: {}".format(sample_query))

    query_results = Query(library).semantic_query(sample_query, result_count=20)

    for i, entries in enumerate(query_results):

        # each query result is a dictionary with many useful keys
        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        # for display purposes only, we will only show the first 125 characters of the text
        if len(text) > 125:
            text = text[0:125] + " ... "

        print("\nupdate: query results - {} - document - {} - page num - {} - distance - {} "
              .format( i, document_source, page_num, vector_distance))

        print("update: text sample - ", text)
77
if __name__ == "__main__":

    # the library name is arbitrary - choose whatever label suits this run
    user_selected_name = "un_resolutions500"

    # run the full parse -> embed -> query walkthrough against that library
    parse_and_generate_vector_embeddings(library_name=user_selected_name)