# llmware

# embeddings_fast_start.py
# 60 lines · 2.2 KB
# Word wrap
1

2
"""
3
#   This example shows the general recipe for creating an embedding.  This scenario uses FAISS for local
4
#   laptop deployment.
5
"""
6

7

8
import os
9
from llmware.library import Library
10
from llmware.retrieval import Query
11
from llmware.setup import Setup
12

13

14
def embeddings_fast_start(library_name, vector_db="faiss", *, query="Salary", result_count=10):
    """Build a sample library, embed it, and run one semantic query against it.

    Args:
        library_name: Name passed to ``Library().create_new_library``.
        vector_db: Vector DB identifier passed to ``install_new_embedding``
            (defaults to "faiss").
        query: Text of the semantic query to run (defaults to "Salary").
        result_count: Number of hits requested from ``semantic_query``
            (defaults to 10).

    Returns:
        Whatever ``Query.semantic_query`` returns; each entry is read below
        as a dict with "text", "file_source", "page_num" and "distance" keys.
    """

    # For display purposes only: longest text sample printed per hit
    max_preview_chars = 125

    # Create and populate a library from the "AgreementsLarge" sample folder
    print(f"\nstep 1 - creating and populating library: {library_name}...")
    library = Library().create_new_library(library_name)
    sample_files_path = Setup().load_sample_files()
    library.add_files(input_folder_path=os.path.join(sample_files_path, "AgreementsLarge"))

    # To create vector embeddings you just need to specify the embedding model and the vector embedding DB
    # For examples of using HuggingFace and SentenceTransformer models, see those examples in this same folder
    embedding_model = "mini-lm-sbert"

    print(f"\n > Generating embedding vectors and storing in '{vector_db}'...")
    library.install_new_embedding(embedding_model_name=embedding_model, vector_db=vector_db)

    # Then when doing semantic queries, the most recent vector DB used for embeddings will be used.

    # Ask for the best `result_count` hits for `query`
    q = Query(library)
    print(f"\n > Running a query for '{query}'...")
    query_results = q.semantic_query(query=query, result_count=result_count, results_only=True)

    for i, entries in enumerate(query_results):

        # each query result is a dictionary with many useful keys
        text = entries["text"]
        document_source = entries["file_source"]
        page_num = entries["page_num"]
        vector_distance = entries["distance"]

        # Truncate long passages so the console output stays readable
        if len(text) > max_preview_chars:
            text = text[:max_preview_chars] + " ... "

        print(f"\nupdate: query results - {i} - document - {document_source} "
              f"- page num - {page_num} - distance - {vector_distance} ")

        print("update: text sample - ", text)

    return query_results
55

56

57
if __name__ == "__main__":

    # Script entry point: run the example end to end against a local vector DB.
    # set to 'faiss' by default -> switch to 'milvus' once installed and running
    db = "faiss"
    embeddings_fast_start("embedding_test_1", vector_db=db)
62

63

64
# llmware

# Cookie usage