""" This example shows how to use any GGUF model available on HuggingFace, and start using it in inferences
and workflows with llmware. In this scenario, we will take the following steps:

    1. Register new GGUF model
    2. Register new finetune wrapper, if needed
    3. Start running inferences
"""

import re
import time

from llmware.models import ModelCatalog
from llmware.prompts import Prompt

# Step 1 - register new gguf model - we will pick the popular Llama-2-13B-chat-GGUF

ModelCatalog().register_gguf_model(model_name="TheBloke/Llama-2-13B-chat-GGUF-Q2",
                                   gguf_model_repo="TheBloke/Llama-2-13B-chat-GGUF",
                                   gguf_model_file_name="llama-2-13b-chat.Q2_K.gguf",
                                   prompt_wrapper="my_version_inst")
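
# note: registering adds the model entry to the ModelCatalog - as a general rule, the .gguf
# file itself is pulled from the HuggingFace repo when the model is first loaded, not at
# registration time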

# Step 2 - if the prompt_wrapper is a standard, e.g., Meta's <INST>, then no need to do anything else
# -- however, if the model uses a custom prompt wrapper, then we need to define that too
# -- in this case, we are going to create our "own version" of the Meta <INST> wrapper

ModelCatalog().register_new_finetune_wrapper("my_version_inst", main_start="<INST>", llm_start="</INST>")
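
# with this wrapper registered, each prompt should be wrapped (approximately - the exact
# assembly happens inside llmware) as:  <INST> + prompt text + </INST>, with the model's
# completion following the </INST> tag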

# Once we have completed these two steps, we are done - and can begin to use the model like any other

prompter = Prompt().load_model("TheBloke/Llama-2-13B-chat-GGUF-Q2")
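
# note: if the gguf file is not already in the local model repository, it will generally be
# downloaded at load time - llama-2-13b-chat.Q2_K.gguf is a roughly 5 GB download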

question_list = ["I am interested in gaining an understanding of the banking industry. What topics should I research?",
                 "What are some tips for creating a successful business plan?",
                 "What are the best books to read for a class on American literature?"]

for i, entry in enumerate(question_list):

    start_time = time.time()

    print(f"query - {i + 1} - {entry}")

    response = prompter.prompt_main(entry)

    time_taken = round(time.time() - start_time, 2)

    # collapse runs of consecutive newlines into a single newline for cleaner printing
    llm_response = re.sub("\n{2,}", "\n", response["llm_response"])

    print(f"llm_response - {i + 1} - {llm_response}")
    print(f"time_taken - {i + 1} - {time_taken}")