import tiktoken
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

from llmtuner.extras.logging import get_logger

if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer
11
logger = get_logger(__name__)  # module-level logger for this file
17
@dataclass
class Template:
    r"""
    A chat prompt template: the static affix pieces plus the logic to encode
    (query, response) turns into pairs of token ids with a given tokenizer.

    NOTE(review): reconstructed from a mangled chunk — missing lines (method
    headers, ``system`` field, ``return`` statements) were restored from the
    surviving fragments and their original line numbering; confirm against the
    upstream file.
    """

    prefix: List[Union[str, Dict[str, str]]]   # emitted once before turn 0; may contain "{{system}}"
    prompt: List[Union[str, Dict[str, str]]]   # per-turn prompt; may contain "{{query}}" and "{{idx}}"
    system: str                                # default system prompt (used when caller passes None)
    sep: List[Union[str, Dict[str, str]]]      # separator inserted between turns
    stop_words: List[str]                      # extra special tokens to register on the tokenizer
    use_history: bool                          # if False, previous turns are discarded
    efficient_eos: bool                        # if True, no eos id is appended after each response

    def encode_oneturn(
        self,
        tokenizer: "PreTrainedTokenizer",
        query: str,
        resp: str,
        history: Optional[List[Tuple[str, str]]] = None,
        system: Optional[str] = None
    ) -> Tuple[List[int], List[int]]:
        r"""
        Returns a single pair of token ids representing prompt and response respectively.

        All turns except the last are folded (query ids + response ids) into the
        prompt; the final turn contributes its query to the prompt and its
        response as the answer.
        """
        system, history = self._format(query, resp, history, system)
        encoded_pairs = self._encode(tokenizer, system, history)
        prompt_ids = []
        for query_ids, resp_ids in encoded_pairs[:-1]:
            prompt_ids = prompt_ids + query_ids + resp_ids
        prompt_ids, answer_ids = prompt_ids + encoded_pairs[-1][0], encoded_pairs[-1][1]
        return prompt_ids, answer_ids

    def encode_multiturn(
        self,
        tokenizer: "PreTrainedTokenizer",
        query: str,
        resp: str,
        history: Optional[List[Tuple[str, str]]] = None,
        system: Optional[str] = None
    ) -> List[Tuple[List[int], List[int]]]:
        r"""
        Returns multiple pairs of token ids representing prompts and responses respectively.
        """
        system, history = self._format(query, resp, history, system)
        encoded_pairs = self._encode(tokenizer, system, history)
        return encoded_pairs

    def _format(
        self,
        query: str,
        resp: str,
        history: Optional[List[Tuple[str, str]]] = None,
        system: Optional[str] = None
    ) -> Tuple[str, List[Tuple[str, str]]]:
        r"""
        Aligns inputs to the standard format: the current (query, resp) turn is
        appended to the (possibly discarded) history.
        """
        system = system or self.system # use system if provided
        history = history if (history and self.use_history) else []
        history = history + [(query, resp)]
        return system, history

    def _get_special_ids(
        self,
        tokenizer: "PreTrainedTokenizer"
    ) -> Tuple[List[int], List[int]]:
        r"""
        Returns the (bos, eos) token-id lists used to wrap each turn.

        Raises:
            ValueError: if the tokenizer defines no eos token.
        """
        if tokenizer.bos_token_id is not None and getattr(tokenizer, "add_bos_token", True):
            bos_ids = [tokenizer.bos_token_id]
        else: # baichuan, qwen and gpt2 models have no bos token
            bos_ids = []

        if tokenizer.eos_token_id is None:
            raise ValueError("EOS token is required.")

        if self.efficient_eos: # used in baichuan, qwen, chatglm, etc.
            eos_ids = []
        else:
            eos_ids = [tokenizer.eos_token_id]

        return bos_ids, eos_ids

    def _encode(
        self,
        tokenizer: "PreTrainedTokenizer",
        system: str,
        history: List[Tuple[str, str]]
    ) -> List[Tuple[List[int], List[int]]]:
        r"""
        Encodes formatted inputs to pairs of token ids.
        Turn 0: bos + prefix + sep + query    resp + eos
        Turn t: sep + bos + query             resp + eos
        """
        bos_ids, eos_ids = self._get_special_ids(tokenizer)
        sep_ids = self._convert_inputs_to_ids(tokenizer, context=self.sep)
        encoded_pairs = []
        for turn_idx, (query, resp) in enumerate(history):
            if turn_idx == 0:
                prefix_ids = self._convert_inputs_to_ids(tokenizer, context=self.prefix, system=system)
                if len(prefix_ids) != 0: # has prefix
                    prefix_ids = bos_ids + prefix_ids + sep_ids
                else:
                    prefix_ids = bos_ids
            else:
                prefix_ids = sep_ids + bos_ids
            # "{{idx}}" is 1-based (e.g. chatglm's "[Round {{idx}}]" at L264)
            query_ids = self._convert_inputs_to_ids(tokenizer, context=self.prompt, query=query, idx=str(turn_idx+1))
            resp_ids = self._convert_inputs_to_ids(tokenizer, context=[resp])
            encoded_pairs.append((prefix_ids + query_ids, resp_ids + eos_ids))
        return encoded_pairs

    def _convert_inputs_to_ids(
        self,
        tokenizer: "PreTrainedTokenizer",
        context: List[Union[str, Dict[str, str]]],
        system: Optional[str] = None,
        query: Optional[str] = None,
        idx: Optional[str] = None
    ) -> List[int]:
        r"""
        Converts context to token ids: string elements get their placeholders
        substituted and are encoded; {"token": ...} dicts map to one special-token id.
        """
        if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding): # for tiktoken tokenizer (Qwen)
            kwargs = dict(allowed_special="all")
        else:
            kwargs = dict(add_special_tokens=False)

        token_ids = []
        for elem in context:
            if isinstance(elem, str):
                elem = elem.replace("{{system}}", system, 1) if system is not None else elem
                elem = elem.replace("{{query}}", query, 1) if query is not None else elem
                elem = elem.replace("{{idx}}", idx, 1) if idx is not None else elem
                if len(elem) != 0:
                    token_ids = token_ids + tokenizer.encode(elem, **kwargs)
            elif isinstance(elem, dict):
                token_ids = token_ids + [tokenizer.convert_tokens_to_ids(elem.get("token"))]
            else:
                raise ValueError("Input must be string or dict[str, str], got {}".format(type(elem)))

        return token_ids
155
class Llama2Template(Template):
    r"""
    Template variant for LLaMA-2 chat: the system prompt is inlined into the
    first user query and no separator ids are inserted between turns.

    NOTE(review): reconstructed from a mangled chunk — the ``def _encode``
    header, ``encoded_pairs = []`` and the final ``return`` were restored.
    """

    def _encode(
        self,
        tokenizer: "PreTrainedTokenizer",
        system: str,
        history: List[Tuple[str, str]]
    ) -> List[Tuple[List[int], List[int]]]:
        r"""
        Encodes formatted inputs to pairs of token ids.
        Turn 0: bos + prefix + query    resp + eos
        Turn t: bos + query             resp + eos
        """
        bos_ids, eos_ids = self._get_special_ids(tokenizer)
        encoded_pairs = []
        for turn_idx, (query, resp) in enumerate(history):
            if turn_idx == 0: # llama2 template has no sep_ids
                query = self.prefix[0].replace("{{system}}", system) + query
            query_ids = self._convert_inputs_to_ids(tokenizer, context=self.prompt, query=query)
            resp_ids = self._convert_inputs_to_ids(tokenizer, context=[resp])
            encoded_pairs.append((bos_ids + query_ids, resp_ids + eos_ids))
        return encoded_pairs
179
templates: Dict[str, Template] = {}  # global registry, populated via register_template() and looked up by name
182
def register_template(
    name: str,
    prefix: List[Union[str, Dict[str, str]]],
    prompt: List[Union[str, Dict[str, str]]],
    system: str,
    sep: List[Union[str, Dict[str, str]]],
    stop_words: Optional[List[str]] = None,
    use_history: Optional[bool] = True,
    efficient_eos: Optional[bool] = False
) -> None:
    r"""
    Builds a Template (or Llama2Template when ``name`` contains "llama2")
    and stores it in the module-level ``templates`` registry under ``name``.

    ``stop_words`` defaults to an empty list; ``None`` is accepted so the
    default is not a mutable object shared by every registered template.
    """
    template_class = Llama2Template if "llama2" in name else Template
    templates[name] = template_class(
        prefix=prefix,
        prompt=prompt,
        system=system,
        sep=sep,
        stop_words=stop_words if stop_words is not None else [],
        use_history=use_history,
        efficient_eos=efficient_eos
    )
204
def get_template_and_fix_tokenizer(
    name: str,
    tokenizer: "PreTrainedTokenizer"
) -> Optional[Template]:
    r"""
    Looks up a registered template by name and patches the tokenizer so it has
    eos/pad tokens and knows the template's stop words.

    Returns None when ``name`` is None (NOTE(review): this pre-training branch
    was reconstructed from missing lines — confirm against upstream).

    Raises:
        ValueError: if ``name`` does not match any registered template.
    """
    if tokenizer.eos_token_id is None:
        tokenizer.eos_token = "<|endoftext|>"
        logger.info("Add eos token: {}".format(tokenizer.eos_token))

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token # reuse eos as pad so batching works
        logger.info("Add pad token: {}".format(tokenizer.pad_token))

    if name is None: # no chat template is applied for pre-training
        return None

    template = templates.get(name, None)
    if template is None: # raise instead of assert: still enforced under `python -O`
        raise ValueError("Template {} does not exist.".format(name))
    tokenizer.add_special_tokens(
        dict(additional_special_tokens=template.stop_words),
        replace_additional_special_tokens=False
    )
    return template
234
"### Instruction:\n{{query}}\n\n### Response:\n"
237
"Below is an instruction that describes a task. "
238
"Write a response that appropriately completes the request."
252
"Human: {{query}}###Assistant:"
255
"A chat between a curious human and an artificial intelligence assistant. "
256
"The assistant gives helpful, detailed, and polite answers to the human's questions."
274
{"token": "<reserved_102>"}, # user token
276
{"token": "<reserved_103>"} # assistant token
290
{"token": "<reserved_106>"}, # user token
292
{"token": "<reserved_107>"} # assistant token
306
"Human: {{query}}\n\nBelle: "
321
{"token": "[|Human|]:"},
333
{"token": "[gMASK]"},
338
"[Round {{idx}}]\n\n问:{{query}}\n\n答:"
351
{"token": "[gMASK]"},
353
{"token": "<|system|>"},
358
{"token": "<|user|>"},
361
{"token": "<|assistant|>"},
362
"\n" # add an extra newline to avoid error in ChatGLM's process_response method
365
"You are ChatGLM3, a large language model trained by Zhipu.AI. "
366
"Follow the user's instructions carefully. Respond using markdown."
378
name="chatglm3_raw", # the raw template for tool tuning
380
{"token": "[gMASK]"},
382
{"token": "<|system|>"},
387
{"token": "<|user|>"},
390
{"token": "<|assistant|>"}
393
"You are ChatGLM3, a large language model trained by Zhipu.AI. "
394
"Follow the user's instructions carefully. Respond using markdown."
411
"User: {{query}}\n\nAssistant:"
419
name="deepseekcoder",
424
"### Instruction:\n{{query}}\n### Response:\n"
427
"You are an AI programming assistant, utilizing the Deepseek Coder model, "
428
"developed by Deepseek Company, and you only answer questions related to computer science. "
429
"For politically sensitive questions, security and privacy issues, "
430
"and other non-computer science questions, you will refuse to answer\n"
434
{"token": "<|EOT|>"},
450
"Human: {{query}}\nAssistant:"
453
"A chat between a curious user and an artificial intelligence assistant. "
454
"The assistant gives helpful, detailed, and polite answers to the user's questions."
468
"User: {{query}}\nFalcon:"
484
"<|User|>:{{query}}",
503
"<<SYS>>\n{{system}}\n<</SYS>>\n\n"
506
"[INST] {{query}} [/INST]"
509
"You are a helpful, respectful and honest assistant. "
510
"Always answer as helpfully as possible, while being safe. "
511
"Your answers should not include any harmful, unethical, "
512
"racist, sexist, toxic, dangerous, or illegal content. "
513
"Please ensure that your responses are socially unbiased and positive in nature.\n\n"
514
"If a question does not make any sense, or is not factually coherent, "
515
"explain why instead of answering something not correct. "
516
"If you don't know the answer to a question, please don't share false information."
525
"<<SYS>>\n{{system}}\n<</SYS>>\n\n"
528
"[INST] {{query}} [/INST]"
530
system="You are a helpful assistant. 你是一个乐于助人的助手。",
541
"[INST] {{query}} [/INST]"
554
"GPT4 Correct User: {{query}}",
555
{"token": "<|end_of_turn|>"},
556
"GPT4 Correct Assistant:"
560
{"token": "<|end_of_turn|>"}
572
{"token": "<|im_start|>"},
576
{"token": "<|im_start|>"},
578
{"token": "<|im_end|>"},
580
{"token": "<|im_start|>"},
583
system="You are a helpful assistant.",
585
{"token": "<|im_end|>"},
598
{"token": "<|system|>"},
602
{"token": "<|user|>"},
604
{"token": "<|end|>"},
606
{"token": "<|assistant|>"}
610
{"token": "<|end|>"},
621
Supports language model inference without histories.
641
"USER: {{query}} ASSISTANT:"
644
"A chat between a curious user and an artificial intelligence assistant. "
645
"The assistant gives helpful, detailed, and polite answers to the user's questions."
657
"Human: {{query}} Assistant:"
660
"以下是用户和人工智能助手之间的对话。用户以Human开头,人工智能助手以Assistant开头,"
661
"会对人类提出的问题给出有帮助、高质量、详细和礼貌的回答,并且总是拒绝参与与不道德、"
662
"不安全、有争议、政治敏感等相关的话题、问题和指示。\n"
674
"Human: {{query}}\n\nAssistant: "
684
{"token": "<|System|>"},
688
{"token": "<|Human|>"},
690
{"token": "<|YaYi|>"},
694
"You are a helpful, respectful and honest assistant named YaYi "
695
"developed by Beijing Wenge Technology Co.,Ltd. "
696
"Always answer as helpfully as possible, while being safe. "
697
"Your answers should not include any harmful, unethical, "
698
"racist, sexist, toxic, dangerous, or illegal content. "
699
"Please ensure that your responses are socially unbiased and positive in nature.\n\n"
700
"If a question does not make any sense, or is not factually coherent, "
701
"explain why instead of answering something not correct. "
702
"If you don't know the answer to a question, please don't share false information."
719
"<|im_start|>user\n{{query}}<|im_end|>\n<|im_start|>assistant\n"
735
{"token": "<|system|>"},
740
{"token": "<|user|>"},
743
{"token": "<|assistant|>"}
745
system="You are a friendly chatbot who always responds in the style of a pirate",
756
{"token": "<human>"},