# llmware
#
# docs2vecs_with_milvus-un_resolutions.py
# 80 lines · 3.3 KB
1

2
"""
#                   *** FAST START to create vector embeddings from documents ***
#
#   docs2vecs_with_milvus-un_resolutions - parses, text chunks, and embeds 500 United Nations (UN) Resolutions
#   the sample documents (500 PDFs - 2-15 pages each) can be pulled down from a public S3 repo with the command:
#           sample_files_path = Setup().load_sample_files()
#
#   note: the example assumes that you have installed Milvus and MongoDB per the separate instructions in the README
"""
11

12

13
import os
14
from llmware.library import Library
15
from llmware.retrieval import Query
16
from llmware.setup import Setup
17
from llmware.status import Status
18

19

20
def parse_and_generate_vector_embeddings(library_name, *, embedding_model="mini-lm-sbert", vector_db="milvus"):
    """Build a library from the sample UN Resolutions, embed it, and run one semantic query.

    The function walks through the full example end to end: it creates a new
    library, fetches the sample files, parses the "UN-Resolutions-500" folder
    into the library, installs embeddings, prints the embedding status, and
    finally prints the top results for a fixed sample query.

    Args:
        library_name: Name passed to ``Library().create_new_library``.
        embedding_model: Embedding model name passed to
            ``install_new_embedding`` and to the status check.
        vector_db: Vector database name passed to ``install_new_embedding``.

    Returns:
        None. All progress and results are written to stdout via ``print``.
    """

    # Step 1 - Create library which is the main 'organizing construct' in llmware
    print(f"\nupdate: Step 1 - Creating library: {library_name}")

    library = Library().create_new_library(library_name)

    # Step 2 - Pull down the sample files from S3 through the .load_sample_files() command
    #   --note: if you need to refresh the sample files, set 'over_write=True'
    print("update: Step 2 - Downloading Sample Files")

    sample_files_path = Setup().load_sample_files(over_write=False)

    # Step 3 - point ".add_files" method to the folder of documents that was just created
    #   this method parses all of the documents, text chunks, and captures in MongoDB
    print("update: Step 3 - Parsing and Text Indexing Files")

    library.add_files(input_folder_path=os.path.join(sample_files_path, "UN-Resolutions-500"))

    # Step 4 - Install the embeddings
    print(f"\nupdate: Step 4 - Generating Embeddings in {vector_db} db - with Model- {embedding_model}")

    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)

    # note: for using llmware as part of a larger application, you can check the real-time status by polling Status()
    #   --both the EmbeddingHandler and Parsers write to Status() at intervals while processing
    update = Status().get_embedding_status(library_name, embedding_model)
    print("update: Embeddings Complete - Status() check at end of embedding - ", update)

    # Step 5 - start using the new vector embeddings with Query
    sample_query = "sustainability issues impacting women"
    print(f"\n\nupdate: Step 5 - Query: {sample_query}")

    query_results = Query(library).semantic_query(sample_query, result_count=20)

    # for display purposes only, we will only show the first 125 characters of the text
    display_limit = 125

    for i, entries in enumerate(query_results):

        # each query result is a dictionary with many useful keys
        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        if len(text) > display_limit:
            text = text[:display_limit] + " ... "

        print(f"\nupdate: query results - {i} - document - {document_source} "
              f"- page num - {page_num} - distance - {vector_distance} ")

        print("update: text sample - ", text)
75

76

77
if __name__ == "__main__":

    # Script entry point: run the full example only when executed directly,
    # not when this module is imported.

    # pick any name for the library
    user_selected_name = "un_resolutions500"
    parse_and_generate_vector_embeddings(user_selected_name)
82

83

84
llmware

Использование cookies