# rulm — tests for src.util.dl.fix_tokenizer
# (source viewer metadata: 25 lines · 929.0 bytes)
from transformers import AutoTokenizer, AutoConfig

from src.util.dl import fix_tokenizer


def _assert_fixed_tokenizer(model_name, expected):
    """Fetch tokenizer+config for *model_name*, apply fix_tokenizer, check attrs.

    *expected* maps tokenizer attribute names (e.g. "pad_token_id") to the
    values they must have after fix_tokenizer has run. Requires network
    access to the Hugging Face Hub to download tokenizer/config files.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = fix_tokenizer(tokenizer, config)
    for attr, value in expected.items():
        # Include attr/model in the message so a failure pinpoints the field.
        actual = getattr(tokenizer, attr)
        assert actual == value, f"{model_name}: {attr}={actual!r}, expected {value!r}"


def test_fix_tokenizer():
    """fix_tokenizer must normalize special-token ids and model_max_length.

    Covers two architectures with different tokenizer conventions:
    a Llama-2 checkpoint and a ruGPT-3.5 checkpoint.
    """
    # Llama-2 convention: unk and pad share id 0; 4096-token context.
    _assert_fixed_tokenizer(
        "TheBloke/Llama-2-7B-fp16",
        {
            "bos_token_id": 1,
            "eos_token_id": 2,
            "pad_token_id": 0,
            "unk_token_id": 0,
            "model_max_length": 4096,
        },
    )

    # ruGPT-3.5 convention: different special-token layout; 2048-token context.
    _assert_fixed_tokenizer(
        "ai-forever/ruGPT-3.5-13B",
        {
            "pad_token_id": 0,
            "bos_token_id": 2,
            "eos_token_id": 3,
            "unk_token_id": 1,
            "model_max_length": 2048,
        },
    )