llmware

docs2vecs_with_milvus-contracts.py
79 строк · 3.3 Кб
Перенос по словам
1

2
"""
3
#                   *** FAST START to create vector embeddings from documents ***
4
#
5
#   docs2vecs_with_milvus-contracts - parses, text chunks and embeds legal contracts
6
#   the sample documents (~80 legal template contracts) can be pulled down from a public S3 repo with the command:
7
#           sample_files_path = Setup().load_sample_files()
8
#   note:  this example requires Milvus + MongoDB - please see the README instructions to install
9
"""
10

11

12
import os
13
from llmware.library import Library
14
from llmware.retrieval import Query
15
from llmware.setup import Setup
16
from llmware.status import Status
17

18

19
def parse_and_generate_vector_embeddings(library_name):
    """Create a library from the sample contracts, embed it, and run a demo query.

    The function walks through the full "fast start" flow and prints a progress
    line at each step:

      1. create a new library named ``library_name``
      2. fetch the sample files via ``Setup().load_sample_files``
      3. add the "AgreementsLarge" sample folder to the library
      4. install embeddings using the "industry-bert-contracts" model in the
         "milvus" vector db, then print the embedding status from ``Status()``
      5. run one semantic query and print a short preview of each result

    Args:
        library_name: name handed to ``Library().create_new_library`` and later
            to ``Status().get_embedding_status``.

    Returns:
        None - all results are written to stdout.
    """

    # Configuration consumed in Step 4 (embedding install) and the status check
    model_name = "industry-bert-contracts"
    db_name = "milvus"

    # Display-only cap on how much of each result's text is printed in Step 5
    preview_limit = 125

    # Step 1 - the library is the main 'organizing construct' in llmware
    print("\nupdate: Step 1 - Creating library: {}".format(library_name))
    contracts_library = Library().create_new_library(library_name)

    # Step 2 - pull down the sample files
    #   --note: to refresh the sample files, pass 'over_write=True' instead
    print("update: Step 2 - Downloading Sample Files")
    samples_root = Setup().load_sample_files(over_write=False)

    # Step 3 - hand the downloaded contracts folder to ".add_files", which
    #   (per the original notes) parses, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")
    agreements_folder = os.path.join(samples_root, "AgreementsLarge")
    contracts_library.add_files(input_folder_path=agreements_folder)

    # Step 4 - install the embeddings
    step_4_banner = "\nupdate: Step 4 - Generating Embeddings in {} db - with Model- {}"
    print(step_4_banner.format(db_name, model_name))
    contracts_library.install_new_embedding(embedding_model_name=model_name, vector_db=db_name)

    # note: inside a larger application, Status() can be polled for progress
    #   --here it is read once, after the embedding call has returned
    embedding_status = Status().get_embedding_status(library_name, model_name)
    print("update: Embeddings Complete - Status() check at end of embedding - ", embedding_status)

    # Step 5 - use the new vector embeddings through Query
    query_text = "incentive compensation"
    print("\n\nupdate: Step 5 - Query: {}".format(query_text))

    hits = Query(contracts_library).semantic_query(query_text, result_count=20)

    result_header = "\nupdate: query results - {} - document - {} - page num - {} - distance - {} "

    for rank, hit in enumerate(hits):

        # each hit is a dictionary; only these four keys are used for display
        snippet = hit["text"]
        source_file = hit["file_source"]
        page = hit["page_num"]
        distance = hit["distance"]

        # display only: long passages are cut to the first 125 characters
        if len(snippet) > preview_limit:
            snippet = snippet[:preview_limit] + " ... "

        print(result_header.format(rank, source_file, page, distance))
        print("update: text sample - ", snippet)
74

75

76
if __name__ == "__main__":

    # script entry point: any library name works here; "contracts" is just the demo default
    user_selected_name = "contracts"
    parse_and_generate_vector_embeddings(library_name=user_selected_name)
81

82
llmware

Использование cookies