"""This example shows how to use Qdrant as a vector embedding database with llmware"""

""" (A) Python Dependencies -

    As a first step, you should pip install dependencies not included in the llmware package:

    -- pip3 install qdrant-client

    (B) Installing Qdrant -

    -- Docker - https://qdrant.tech/documentation/guides/installation/

    -- set os.environ variables to 'automatically' pass in installing embedding

    -- os.environ["USER_MANAGED_QDRANT_HOST"] = "localhost"
    -- os.environ["USER_MANAGED_QDRANT_PORT"] = 6333
"""
import os

from llmware.setup import Setup
from llmware.library import Library
from llmware.retrieval import Query
def build_lib(library_name, folder="Agreements"):

    """ Create a new llmware library, pull down the sample files, and parse/index the
    selected folder of documents into it.

    Parameters:
        library_name - name of the new library to create
        folder - sample-files sub-folder to ingest; options: "Agreements" | "UN-Resolutions-500"

    Returns the populated Library object so callers can install embeddings / run queries on it.
    """

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    # --note: if you need to refresh the sample files, set 'over_write=True'
    print("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point ".add_files" method to the folder of documents that was just created
    # this method parses the documents, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    # options: Agreements | UN-Resolutions-500
    library.add_files(input_folder_path=os.path.join(sample_files_path, folder))

    # bug fix: return the library - the caller assigns `lib = build_lib(...)` and then
    # calls methods on it, which would fail on the implicit None return
    return library
print("update: Step 1- starting here- building library- parsing PDFs into text chunks")

lib = build_lib("qdrant_0")

# optional - check the status of the library card and embedding
lib_card = lib.get_library_card()
print("update: -- before embedding process - check library card - ", lib_card)

print("update: Step 2 - starting to install embeddings")

# alt embedding models - "mini-lm-sbert" | industry-bert-contracts | text-embedding-ada-002
# note: if you want to use text-embedding-ada-002, you will need an OpenAI key and enter into os.environ variable
# e.g., os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<insert your key>"

# batch sizes from 100-500 usually give good performance and work on most environments
lib.install_new_embedding(embedding_model_name="industry-bert-contracts", vector_db="qdrant", batch_size=300)

# optional - check the status of the library card and embedding
lib_card = lib.get_library_card()
print("update: -- after embedding process - check updated library card - ", lib_card)

# note: embedding_model_name is optional, but useful if you create multiple embeddings on the same library
# --see other example scripts for multiple embeddings
query_pgv = Query(lib, embedding_model_name="industry-bert-contracts")

# run multiple queries using query_pgv
my_search_results = query_pgv.semantic_query("What is the sale bonus?", result_count=24)

for i, qr in enumerate(my_search_results):
    print("update: semantic query results: ", i, qr)

# if you want to delete the embedding - uncomment the line below
# fix: second argument is the vector db used above - "qdrant", not "pg_vector" (stale copy/paste)
# lib.delete_installed_embedding("industry-bert-contracts", "qdrant")

# optional - check the embeddings on the library
emb_record = lib.get_embedding_status()
for j, entries in enumerate(emb_record):
    print("update: embeddings on library: ", j, entries)