"""This example shows how to use Qdrant as a vector embedding database with llmware"""

""" (A) Python Dependencies -

    As a first step, you should pip install dependencies not included in the llmware package:

    -- pip3 install qdrant-client

    (B) Installing Qdrant -

    -- Docker - https://qdrant.tech/documentation/guides/installation/

    -- set os.environ variables to 'automatically' pass in installing embedding

    -- os.environ["USER_MANAGED_QDRANT_HOST"] = "localhost"
    -- os.environ["USER_MANAGED_QDRANT_PORT"] = 6333
"""
import os

from llmware.setup import Setup
from llmware.library import Library
from llmware.retrieval import Query
def build_lib(library_name, folder="Agreements"):

    """ Create a new llmware library, pull down the sample files, and parse/index the
    selected folder of documents into it.

    Parameters:
        library_name - name of the new library to create
        folder - sample-files sub-folder to ingest; options: "Agreements" | "UN-Resolutions-500"

    Returns the populated Library object so callers can install embeddings / run queries on it.
    """

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    # --note: if you need to refresh the sample files, set 'over_write=True'
    print("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point ".add_files" method to the folder of documents that was just created
    # this method parses the documents, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    # options: Agreements | UN-Resolutions-500
    library.add_files(input_folder_path=os.path.join(sample_files_path, folder))

    # bug fix: return the library - the caller assigns `lib = build_lib(...)` and then
    # calls methods on it, which would fail on the implicit None return
    return library
print("update: Step 1- starting here- building library- parsing PDFs into text chunks")

lib = build_lib("qdrant_0")

# optional - check the status of the library card and embedding
lib_card = lib.get_library_card()
print("update: -- before embedding process - check library card - ", lib_card)

print("update: Step 2 - starting to install embeddings")

# alt embedding models - "mini-lm-sbert" | industry-bert-contracts | text-embedding-ada-002
# note: if you want to use text-embedding-ada-002, you will need an OpenAI key and enter into os.environ variable
# e.g., os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<insert your key>"

# batch sizes from 100-500 usually give good performance and work on most environments
lib.install_new_embedding(embedding_model_name="industry-bert-contracts", vector_db="qdrant", batch_size=300)

# optional - check the status of the library card and embedding
lib_card = lib.get_library_card()
print("update: -- after embedding process - check updated library card - ", lib_card)

# note: embedding_model_name is optional, but useful if you create multiple embeddings on the same library
# --see other example scripts for multiple embeddings
query_pgv = Query(lib, embedding_model_name="industry-bert-contracts")

# run multiple queries using query_pgv
my_search_results = query_pgv.semantic_query("What is the sale bonus?", result_count=24)

for i, qr in enumerate(my_search_results):
    print("update: semantic query results: ", i, qr)

# if you want to delete the embedding - uncomment the line below
# fix: second argument is the vector db used above - "qdrant", not "pg_vector" (stale copy/paste)
# lib.delete_installed_embedding("industry-bert-contracts", "qdrant")

# optional - check the embeddings on the library
emb_record = lib.get_embedding_status()
for j, entries in enumerate(emb_record):
    print("update: embeddings on library: ", j, entries)