# llmware
#
# using_chromadb.py
# 114 lines · 4.6 KB
# (word wrap)
1

2
"""This example shows how to use ChromaDB as a vector embedding database with llmware"""
3

4
""" (A) Python Dependencies - 
5

6
    As a first step, you should pip install ChromaDB, which is not included in the llmware package:
7
    1.  pip3 install chromadb
8
    
9
    (B) Using ChromaDB - 
10
    
11
    Installing ChromaDB via pip installs everything you need.
12
    However, if you need help, there are many great online sources and communities, e.g.,:
13
        -- ChromaDB documentation - https://docs.trychroma.com/
14
        -- Docker - https://hub.docker.com/u/chromadb
15
        -- please also see the docker-compose-chromadb.yaml script provided in the llmware script repository
16
        
17
    (C) Configurations - 
18

19
    You can configure ChromaDB with environment variables. Here is the list of variable names we currently
20
    support - for more information see ChromaDBConfig.
21
        -- CHROMADB_COLLECTION
22
        -- CHROMADB_PERSISTENT_PATH
23
        -- CHROMADB_HOST
24
        -- CHROMADB_PORT
25
        -- CHROMADB_SSL
26
        -- CHROMADB_HEADERS
27
        -- CHROMADB_SERVER_AUTH_PROVIDER
28
        -- CHROMADB_SERVER_AUTH_CREDENTIALS_PROVIDER
29
        -- CHROMADB_SERVER_AUTH_CREDENTIALS_PROVIDER
30
        -- CHROMADB_PASSWORD
31
        -- CHROMADB_SERVER_AUTH_CREDENTIALS_FILE
32
        -- CHROMADB_SERVER_AUTH_CREDENTIALS
33
        -- CHROMADB_SERVER_AUTH_TOKEN_TRANSPORT_HEADER
34
"""
35

36

37
import os
38

39
from llmware.setup import Setup
40
from llmware.library import Library
41
from llmware.retrieval import Query
42

43
#  Example: using ChromaDB as an in-memory database.
#  Sets the CHROMADB_COLLECTION environment variable before any llmware objects are created below.
os.environ["CHROMADB_COLLECTION"] = "llmware"

#  note: in default mode, Chroma will persist in memory only - to persist to disk, uncomment the
#  following line and add a local folder path. The variable name must be CHROMADB_PERSISTENT_PATH,
#  as listed in the configuration section of the module docstring.
#  os.environ["CHROMADB_PERSISTENT_PATH"] = "/local/folder/path/to/save/chromadb/"
48

49

50
def build_lib(library_name, folder="Agreements"):
    """Create a new llmware library and load one folder of sample documents into it.

    Args:
        library_name: Name passed to ``Library().create_new_library``.
        folder: Name of the sub-folder inside the downloaded sample files to
            add to the library. Options noted by the example:
            "Agreements" | "UN-Resolutions-500".

    Returns:
        The library object returned by ``create_new_library``, after
        ``add_files`` has been called on it.
    """

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files through the .load_sample_files() command
    #   --note: if you need to refresh the sample files, set 'over_write=True'
    print("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point the ".add_files" method to the chosen folder inside the sample files;
    #   per the original example, this parses the documents and text chunks them into the library
    print("update: Step 3 - Parsing and Text Indexing Files")

    #   options:   Agreements | UN-Resolutions-500
    library.add_files(input_folder_path=os.path.join(sample_files_path, folder))

    return library
71

72

73
# start script

print("update: Step 1- starting here- building library- parsing PDFs into text chunks")

# build the example library from the default sample folder of build_lib
lib = build_lib("chromadb_lib_0")

# optional - check the status of the library card before any embedding is installed
lib_card = lib.get_library_card()
print("update: -- before embedding process - check library card - ", lib_card)
82

83
print("update: Step 2 - starting to install embeddings")

#   alt embedding models - "mini-lm-sbert" | industry-bert-contracts |  text-embedding-ada-002
#   note: if you want to use text-embedding-ada-002, you will need an OpenAI key and enter into os.environ variable
#   e.g., os.environ["USER_MANAGED_OPENAI_API_KEY"] = "<insert your key>"

#   batch sizes from 100-500 usually give good performance and work on most environments
lib.install_new_embedding(
    embedding_model_name="industry-bert-contracts",
    vector_db="chromadb",
    batch_size=300,
)

#   optional - check the status of the library card again, now that the embedding call has returned
lib_card = lib.get_library_card()
print("update: -- after embedding process - check updated library card - ", lib_card)
95

96
#   run a query
#   note: embedding_model_name is optional, but useful if you create multiple embeddings on the same library
#   --see other example scripts for multiple embeddings

#   create query object
query_chromadb = Query(lib, embedding_model_name="industry-bert-contracts")

#   run multiple queries using query_chromadb
my_search_results = query_chromadb.semantic_query("What is the sale bonus?", result_count=24)

#   print each result together with its position in the returned results
for i, qr in enumerate(my_search_results):
    print("update: semantic query results: ", i, qr)

# if you want to delete the embedding  - uncomment the line below
# lib.delete_installed_embedding("industry-bert-contracts", "chromadb")

#   optional - check the embeddings on the library
emb_record = lib.get_embedding_status()
for j, entries in enumerate(emb_record):
    print("update: embeddings on library: ", j, entries)
116
# llmware
#
# Cookie usage