import ppfleetx.models.language_model.gpt as gpt
from paddle.static import InputSpec
from ppfleetx.core.module.basic_module import BasicModule
from ppfleetx.data.tokenizers import GPTTokenizer
from ppfleetx.distributed.apis import env
from ppfleetx.models.language_model.gpt.dygraph.sequence_parallel_utils import (
    register_sequence_parallel_allreduce_hooks,
)
from ppfleetx.utils.log import logger

from paddlenlp.transformers.gpt.tokenizer import GPTChineseTokenizer
from paddlenlp.transformers.segment_parallel_utils import split_inputs_sequence_dim

from .metrics import Accuracy, AccuracyAndF1, Mcc, PearsonAndSpearman
from .utils import process_configs
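
# Maps the model name used in configs to its tokenizer class and the name of
# the pretrained tokenizer passed to `from_pretrained`.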
MODEL_CLASSES = {
    "GPT": (GPTTokenizer, "gpt2"),
    "GPT-cn": (GPTChineseTokenizer, "gpt-cpm-large-cn"),
}
def get_model_size(l, h, v, s):
    P = 0
    # token and position embeddings
    P += (v + s) * h
    P += (4 * h * h + 4 * h) * l
    P += (2 * (2 * h)) * l
    P += (8 * h * h + 5 * h) * l
    logger.info("Model Size: {:.2f} B".format(P / 1000.0 / 1000.0 / 1000.0))
def vocab_size_with_padding(vocab_size, div_unit, mp_degree):
    padded_size = vocab_size
    multiple = div_unit * mp_degree
    while (padded_size % multiple) != 0:
        padded_size += 1
    logger.info(
        " > padded vocab (size: {}) with {} dummy tokens "
        "(new size: {})".format(vocab_size, padded_size - vocab_size, padded_size)
    )
    return padded_size
class LanguageModule(BasicModule):
    def __init__(self, configs):
        self.nranks = paddle.distributed.get_world_size()
        self.data_world_size = env.get_data_world_size()
        super(LanguageModule, self).__init__(configs)

        self.loss_fn = self.get_loss_fn()

    def process_configs(self, configs):
        configs = process_configs(configs)
        return configs

    def forward(self, tokens, ids):
        return self.model(tokens, ids)

    def training_step(self, batch):
        tokens, position_ids, labels, loss_mask = batch
        if self.nranks > 1 and paddle.distributed.fleet.get_hybrid_communicate_group().get_sep_parallel_world_size() > 1:
            tokens = split_inputs_sequence_dim(tokens)
            position_ids = split_inputs_sequence_dim(position_ids)
            labels = split_inputs_sequence_dim(labels)

        loss_mask.stop_gradient = True
        labels.stop_gradient = True
        position_ids.stop_gradient = True

        preds = self(tokens, position_ids)
        loss = self.loss_fn(preds, labels, loss_mask)
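
    # Logging hook: ips_total is tokens/s across the whole job, ips divides by
    # the data-parallel world size, and ips_per_device divides by the total
    # number of devices.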
    def training_step_end(self, log_dict):
        speed = 1.0 / log_dict["train_cost"]
        default_global_tokens_num = self.configs.Global.global_batch_size * self.configs.Data.Train.dataset.max_seq_len

            "loss_scale: %.9f," % (log_dict["loss_scale"]) if log_dict.get("loss_scale", None) is not None else ""

            ", max_memory_allocated: %.1f MB, max_memory_reserved: %.1f MB, " \
            "memory_allocated: %.1f MB, memory_reserved: %.1f MB" \
            % (log_dict["max_memory_allocated"], log_dict["max_memory_reserved"], log_dict["memory_allocated"], log_dict["memory_reserved"]) if "max_memory_allocated" in log_dict else ""

            "[train] epoch: [%d/%d], batch: [%d/%d], loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, "
            "ips_total: %.0f tokens/s, ips: %.0f tokens/s, ips_per_device: %.0f tokens/s/device, %s learning rate: %.5e, found_inf: %.0f %s"

                log_dict["total_epoch"],
                log_dict["total_step"],
                log_dict["train_cost"],
                speed * default_global_tokens_num,
                speed * default_global_tokens_num / self.data_world_size,
                speed * default_global_tokens_num / paddle.distributed.get_world_size(),
                log_dict["found_inf"],
    def validation_step(self, batch):
        tokens, position_ids, labels, loss_mask = batch
        preds = self(tokens, position_ids)
        preds = paddle.cast(preds, dtype="float32")
        loss = self.loss_fn(preds, labels, loss_mask)

    def validation_step_end(self, log_dict):
        speed = 1.0 / log_dict["eval_cost"]

            "[eval] epoch: %d, batch: %d/%d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s"

                log_dict["total_batch"],
                log_dict["eval_cost"],

    def test_step(self, batch):
        tokens, position_ids, labels, loss_mask = batch
        preds = self(tokens, position_ids)
        preds = paddle.cast(preds, dtype="float32")
        loss = self.loss_fn(preds, labels, loss_mask)

    def test_step_end(self, log_dict):
        speed = 1.0 / log_dict["test_cost"]

            "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s"
            % (log_dict["epoch"], log_dict["batch"], log_dict["loss"], log_dict["test_cost"], speed)

    def training_epoch_end(self, log_dict):
        logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict["epoch"], log_dict["train_cost"]))
class GPTModule(LanguageModule):
    def __init__(self, configs):
        super(GPTModule, self).__init__(configs)
        if configs.Model.sequence_parallel:
            register_sequence_parallel_allreduce_hooks(
                self, configs.Engine.accumulate_steps, configs.Distributed.fuse_sequence_parallel_allreduce
            )
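
        # The Model section of the config is copied and turned into GPTModel
        # keyword arguments; an optional Compress.Quantization section
        # contributes skip_tensor_map and freeze_embedding.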
        model_setting = copy.deepcopy(self.configs.Model)
        if "Compress" in self.configs and "Quantization" in self.configs.Compress:
            quant_setting = copy.deepcopy(self.configs.Compress.Quantization)
            skip_tensor_map = quant_setting.get("skip_tensor_map", {})
            freeze_embedding = quant_setting.get("freeze_embedding", False)
            model_setting["skip_tensor_map"] = skip_tensor_map
            model_setting["freeze_embedding"] = freeze_embedding
        model_setting.pop("module")

        model_name = model_setting.pop("name")
        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_name)

        model_setting["vocab_size"] = vocab_size_with_padding(
            model_setting.get("vocab_size", self.tokenizer.vocab_size),
            model_setting.pop("vocab_size_divisible_unit", 128),
            self.configs.Distributed.get("mp_degree", 1),
        )

        l = model_setting["num_layers"]
        h = model_setting["hidden_size"]
        v = model_setting["vocab_size"]
        s = self.configs.Data.Train.dataset.max_seq_len
        get_model_size(l, h, v, s)

        model_setting.pop("sequence_parallel")
        model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))

        model_setting["num_partitions"] = self.configs.Distributed.mp_degree
        if self.configs.Distributed.pp_degree == 1:
            model_setting.pop("virtual_pp_degree", None)
            model = gpt.GPTForPretrainingHybrid(gpt.GPTModelHybrid(**model_setting))

            model = gpt.GPTForPretrainingPipe(**model_setting)

    def get_loss_fn(self):
        loss_fn = gpt.GPTPretrainingCriterion()

        loss_fn = gpt.GPTPretrainingCriterionHybird(sequence_parallel=self.configs.Model.sequence_parallel)

    def pretreating_batch(self, batch):
        if self.configs.Distributed.pp_degree > 1:
            tokens, position_ids, labels, loss_mask = batch
            data = [(tokens, position_ids), (labels, loss_mask)]

    def input_spec(self):
        return [
            InputSpec(shape=[None, None], name="tokens", dtype="int64"),
            InputSpec(shape=[None, None], name="ids", dtype="int64"),
        ]

    def inference_end(self, outputs):
        for k, v in outputs.items():
            for i in range(v.shape[0]):
                out_ids = [int(x) for x in v[i]]
                ret_str = self.tokenizer.decode(out_ids)
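
# Fine-tuning module for sequence classification. The loss and metric classes
# are instantiated by name from the Model.loss / Model.metric config sections
# (paddle.nn.loss classes for the loss; Accuracy, AccuracyAndF1, Mcc or
# PearsonAndSpearman for the metric).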
class GPTFinetuneModule(BasicModule):
    def __init__(self, configs):
        self.nranks = paddle.distributed.get_world_size()
        self.data_world_size = env.get_data_world_size()
        super(GPTFinetuneModule, self).__init__(configs)

        assert self.loss_config is not None
        assert "train" in self.loss_config
        assert "eval" in self.loss_config

        train_loss = copy.deepcopy(self.loss_config.train)
        train_loss_cls = train_loss.pop("name")
        self.loss_fn = eval(f"paddle.nn.loss.{train_loss_cls}")(**train_loss)

        eval_loss = copy.deepcopy(self.loss_config.eval)
        eval_loss_cls = eval_loss.pop("name")
        self.eval_loss_fn = eval(f"paddle.nn.loss.{eval_loss_cls}")(**eval_loss)

        assert self.metric_config is not None
        assert "eval" in self.metric_config

        if "train" in self.metric_config:
            train_metric = copy.deepcopy(self.metric_config.train)
            train_metric_cls = train_metric.pop("name")
            self.train_metric = eval(f"{train_metric_cls}")(**train_metric)

        eval_metric = copy.deepcopy(self.metric_config.eval)
        eval_metric_cls = eval_metric.pop("name")
        self.eval_metric = eval(f"{eval_metric_cls}")(**eval_metric)

        self.best_metric = 0.0

    def process_configs(self, configs):

        model_setting = copy.deepcopy(self.configs.Model)
        model_setting.pop("module")

        self.metric_config = model_setting.pop("metric", None)
        self.loss_config = model_setting.pop("loss", None)

        pretrained = model_setting.pop("pretrained")
        num_classes = model_setting.pop("num_classes", 2)
        assert pretrained is not None

        model_name = model_setting.pop("name")
        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_name)

        model_setting["vocab_size"] = vocab_size_with_padding(
            model_setting.get("vocab_size", self.tokenizer.vocab_size),
            model_setting.pop("vocab_size_divisible_unit", 128),
            self.configs.Distributed.get("mp_degree", 1),
        )

        l = model_setting["num_layers"]
        h = model_setting["hidden_size"]
        v = model_setting["vocab_size"]
        num_heads = model_setting["num_attention_heads"]
        s = self.configs.Data.Train.dataset.max_length
        get_model_size(l, h, v, s)

        model = gpt.GPTForSequenceClassification(gpt.GPTModel(**model_setting), num_classes)

            raise NotImplementedError

        pretrained_path = pretrained + ".pdparams"
        assert os.path.exists(pretrained_path), f"{pretrained_path} does not exist!"
        model_dict = paddle.load(pretrained_path)
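
        # A checkpoint may store the attention projections either fused into a
        # single qkv_proj tensor or split into separate q/k/v_proj tensors. The
        # helpers below detect which layout a state dict uses and convert
        # between the two so the checkpoint matches the current model.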
        def is_fused(model_state):
            for key in model_state:
                if "qkv_proj" in key:

        def split_params(model_state, num_layers):
            for idx in range(num_layers):
                qkv_b = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias")
                qkv_w = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight")

                qkv_b = qkv_b.reshape((num_heads, 3, -1))
                qkv_w = qkv_w.reshape((h, num_heads, 3, -1))

                q_w, k_w, v_w = np.split(qkv_w, 3, axis=2)
                q_w = q_w.reshape((h, -1))
                k_w = k_w.reshape((h, -1))
                v_w = v_w.reshape((h, -1))

                q_b, k_b, v_b = np.split(qkv_b, 3, axis=1)
                q_b = q_b.reshape((-1))
                k_b = k_b.reshape((-1))
                v_b = v_b.reshape((-1))

                model_state[f"gpt.decoder.layers.{idx}.self_attn.q_proj.bias"] = q_b
                model_state[f"gpt.decoder.layers.{idx}.self_attn.q_proj.weight"] = q_w

                model_state[f"gpt.decoder.layers.{idx}.self_attn.k_proj.bias"] = k_b
                model_state[f"gpt.decoder.layers.{idx}.self_attn.k_proj.weight"] = k_w

                model_state[f"gpt.decoder.layers.{idx}.self_attn.v_proj.bias"] = v_b
                model_state[f"gpt.decoder.layers.{idx}.self_attn.v_proj.weight"] = v_w

        def fuse_params(model_state, num_layers):
            for idx in range(num_layers):
                q_b = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.q_proj.bias")
                q_w = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.q_proj.weight")

                k_b = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.k_proj.bias")
                k_w = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.k_proj.weight")

                v_b = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.v_proj.bias")
                v_w = model_state.pop(f"gpt.decoder.layers.{idx}.self_attn.v_proj.weight")

                q_w = q_w.reshape((h, num_heads, -1))
                k_w = k_w.reshape((h, num_heads, -1))
                v_w = v_w.reshape((h, num_heads, -1))

                qkv_w = np.stack([q_w, k_w, v_w], axis=2)
                qkv_w = qkv_w.reshape((h, -1))

                q_b = q_b.reshape((num_heads, -1))
                k_b = k_b.reshape((num_heads, -1))
                v_b = v_b.reshape((num_heads, -1))
                qkv_b = np.stack([q_b, k_b, v_b], axis=1)
                qkv_b = qkv_b.reshape((-1))

                model_state[f"gpt.decoder.layers.{idx}.self_attn.qkv_proj.weight"] = qkv_w
                model_state[f"gpt.decoder.layers.{idx}.self_attn.qkv_proj.bias"] = qkv_b

        fused = is_fused(model.state_dict())
        load_fused = is_fused(model_dict)

        if fused is True and load_fused is False:
            model_dict = fuse_params(model_dict, l)
        elif fused is False and load_fused is True:
            model_dict = split_params(model_dict, l)

        for name, param in model.state_dict().items():
            if name in model_dict and param.dtype != model_dict[name].dtype:
                model_dict[name] = model_dict[name].cast(param.dtype)

        model.set_state_dict(model_dict)
        logger.info(f"Loaded pretrained weights from {pretrained_path}")

    def forward(self, tokens):
        return self.model(tokens)

    def training_step(self, batch):
        input_ids, labels = batch

        input_ids.stop_gradient = True
        labels.stop_gradient = True

        logits = self(input_ids)
        loss = self.loss_fn(logits, labels)

    def training_step_end(self, log_dict):
        speed = 1.0 / log_dict["train_cost"]
        default_global_tokens_num = self.configs.Global.global_batch_size * self.configs.Data.Train.dataset.max_length

            "[train] epoch: [%d/%d], step: [%d/%d], learning rate: %.7f, loss: %.9f, avg_batch_cost: %.5f sec, speed: %.2f step/s, "
            "ips_total: %.0f tokens/s, ips: %.0f tokens/s"

                log_dict["total_epoch"],
                log_dict["total_batch"],
                log_dict["train_cost"],
                speed * default_global_tokens_num,
                speed * default_global_tokens_num / self.data_world_size,

    def validation_step(self, batch):
        input_ids, labels = batch

        input_ids.stop_gradient = True
        labels.stop_gradient = True

        logits = self(input_ids)
        loss = self.eval_loss_fn(logits, labels)
        correct = self.eval_metric.compute(logits, labels)
        self.eval_metric.update(correct)

    def validation_step_end(self, log_dict):
        speed = 1.0 / log_dict["eval_cost"]

            "[eval] epoch: %d, batch: %d, loss: %.9f, avg_eval_cost: %.5f sec, speed: %.2f step/s"
            % (log_dict["epoch"], log_dict["batch"], log_dict["loss"], log_dict["eval_cost"], speed)

    def test_step(self, batch):
        tokens, position_ids, labels, loss_mask = batch
        preds = self(tokens, position_ids)
        preds = paddle.cast(preds, dtype="float32")
        loss = self.eval_loss_fn(preds, labels, loss_mask)

    def test_step_end(self, log_dict):
        speed = 1.0 / log_dict["test_cost"]

            "[test] epoch: %d, batch: %d, loss: %.9f, avg_test_cost: %.5f sec, speed: %.2f step/s"
            % (log_dict["epoch"], log_dict["batch"], log_dict["loss"], log_dict["test_cost"], speed)

    def training_epoch_end(self, log_dict):
        logger.info("[Training] epoch: %d, total time: %.5f sec" % (log_dict["epoch"], log_dict["train_cost"]))
    def validation_epoch_end(self, log_dict):
        res = self.eval_metric.accumulate()
        self.eval_metric.reset()
        if isinstance(self.eval_metric, AccuracyAndF1):
            msg = "acc: %.5f, precision: %.5f, recall: %.5f, f1: %.5f, acc and f1: %.5f" % (

        elif isinstance(self.eval_metric, Mcc):
            msg = "mcc: %.5f" % (res[0])

        elif isinstance(self.eval_metric, PearsonAndSpearman):
            msg = "pearson: %.5f, spearman: %.5f, pearson and spearman: %.5f" % (res[0], res[1], res[2])

            msg = "acc: %.5f" % (res)

        if metric > self.best_metric:
            self.best_metric = metric

            "[Eval] epoch: %d, total time: %.5f sec, %s, best_metric: %.5f"
            % (log_dict["epoch"], log_dict["eval_cost"], msg, self.best_metric)
class GPTGenerationModule(BasicModule):
    def __init__(self, configs):
        self.configs = configs
        self.generation_cfgs = configs.Generation
        self.nranks = paddle.distributed.get_world_size()

        super().__init__(configs)

    def process_configs(self, configs):
        configs = process_configs(configs)

        model_setting = copy.deepcopy(self.configs.Model)
        if "Compress" in self.configs and "Quantization" in self.configs.Compress:
            quant_setting = copy.deepcopy(self.configs.Compress.Quantization)
            skip_tensor_map = quant_setting.get("skip_tensor_map", {})
            freeze_embedding = quant_setting.get("freeze_embedding", False)
            model_setting["skip_tensor_map"] = skip_tensor_map
            model_setting["freeze_embedding"] = freeze_embedding
        model_setting.pop("module")

        model_name = model_setting.pop("name")
        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_name)

        model_setting["vocab_size"] = vocab_size_with_padding(
            model_setting.get("vocab_size", self.tokenizer.vocab_size),
            model_setting.pop("vocab_size_divisible_unit", 128),
            self.configs.Distributed.get("mp_degree", 1),
        )

        model = gpt.GPTForGeneration(gpt.GPTModel(**model_setting), self.generation_cfgs)

            assert (
                self.nranks == self.configs.Distributed.dp_degree
            ), "Only single-card or data-parallel execution is supported for the generation task."
            model = gpt.GPTForGenerationHybrid(gpt.GPTModelHybrid(**model_setting), self.generation_cfgs)

        self.generation_cfgs["max_dec_len"] = self.adjust_length_to_model(self.generation_cfgs["max_dec_len"], 512)

        self.generation_cfgs["bos_token_id"] = self.tokenizer.eos_token_id
        self.generation_cfgs["eos_token_id"] = self.tokenizer.eos_token_id
        self.generation_cfgs["pad_token_id"] = self.tokenizer.eos_token_id

    def adjust_length_to_model(self, length, max_sequence_length):
        if length < 0 or length > max_sequence_length:
            length = max_sequence_length
        return length
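
    # Prompts are padded on the left to the longest prompt in the batch
    # (input_ids with pad_id, attention_mask and position_ids with 0), so that
    # every sequence ends at the same position before decoding starts.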
    def left_padding(self, inputs, pad_id, padding="longest"):
        assert "input_ids" in inputs, "input_ids should be in inputs!"
        max_length = 0
        for ids in inputs["input_ids"]:
            max_length = max(max_length, len(ids))

        def extend_max_lenth(value, max_length, to_pad_id):
            return [to_pad_id] * (max_length - len(value)) + value

        def extend_filed(name, max_length, to_pad_id):
            values = inputs[name]
            res = []
            for index, value in enumerate(values):
                res.append(extend_max_lenth(value, max_length, to_pad_id))

        extend_filed("input_ids", max_length, pad_id)
        if "attention_mask" in inputs:
            extend_filed("attention_mask", max_length, 0)
        if "position_ids" in inputs:
            extend_filed("position_ids", max_length, 0)

    def generate(self, input_text):
        return self(input_text)

    def forward(self, input_text):
        input_ids = self.tokenizer.encode(input_text)
        inputs = {"input_ids": [input_ids]}

        inputs = self.left_padding(inputs, self.tokenizer.eos_token_id)
        input_ids = inputs["input_ids"]

        if len(input_ids) == 0:

        input_ids = paddle.to_tensor(input_ids, dtype="int64")

        ids, scores = self.model(input_ids=input_ids)

        generated_sequences = []
        for i, generated_ids in enumerate(ids):
            generated_ids = generated_ids.numpy().tolist()

            text = self.tokenizer.convert_ids_to_string(generated_ids)

            sequence = input_text + text
            generated_sequences.append(sequence)

        return generated_sequences

    def input_spec(self):
        return [InputSpec(shape=[None, None], name="input_ids", dtype="int64")]
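
# Offline evaluation module: uses LM_Eval_Dataset for perplexity evaluation and
# Lambada_Eval_Dataset for the cloze (LAMBADA) task, selected via
# Offline_Eval.cloze_eval.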
class GPTEvalModule(LanguageModule):
    def __init__(self, configs):
        self.eval_cfgs = configs.Offline_Eval

        super().__init__(configs)

        self.post_process_configs()

        self.first_step = True

        self.score_name = "loss" if not self.eval_cfgs.cloze_eval else "number correct"

    def post_process_configs(self):
        self.configs.pop("Optimizer", None)
        self.configs.pop("Inference", None)

        self.configs.Data.pop("Train", None)
        self.configs.Data.pop("Test", None)
        self.configs.Data.Eval.pop("sampler", None)
        self.configs.Data.Eval.loader.collate_fn = "gpt_collate_fn"
        self.configs.Data.Eval.loader.batch_size = self.eval_cfgs.batch_size
        self.configs.Data.Eval.dataset.input_dir = self.eval_cfgs.eval_path
        self.configs.Data.Eval.dataset.max_seq_len = self.eval_cfgs.max_seq_len

        self.configs.Engine.logging_freq = self.eval_cfgs.logging_freq

        if not self.eval_cfgs.cloze_eval:
            self.configs.Data.Eval.dataset.name = "LM_Eval_Dataset"
            self.configs.Data.Eval.dataset.overlapping_eval = self.eval_cfgs.overlapping_eval

            self.configs.Data.Eval.dataset.name = "Lambada_Eval_Dataset"

        model_setting = copy.deepcopy(self.configs.Model)
        if "Compress" in self.configs and "Quantization" in self.configs.Compress:
            quant_setting = copy.deepcopy(self.configs.Compress.Quantization)
            skip_tensor_map = quant_setting.get("skip_tensor_map", {})
            freeze_embedding = quant_setting.get("freeze_embedding", False)
            model_setting["skip_tensor_map"] = skip_tensor_map
            model_setting["freeze_embedding"] = freeze_embedding
        model_setting.pop("module")

        model_name = model_setting.pop("name")
        tokenizer_class, pretrained_name = MODEL_CLASSES[model_name]
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_name)

        model_setting["vocab_size"] = vocab_size_with_padding(
            model_setting.get("vocab_size", self.tokenizer.vocab_size),
            model_setting.pop("vocab_size_divisible_unit", 128),
            self.configs.Distributed.get("mp_degree", 1),
        )

        model = gpt.GPTForPretraining(gpt.GPTModel(**model_setting))

            raise RuntimeError("Only single-card offline eval is supported in GPTModel now.")

    def forward(self, tokens, ids, mask):
        return self.model(tokens, ids, mask)

    def validation_step(self, batch):
        tokens, loss_mask, attention_mask, position_ids, labels, info = batch

        preds = self(tokens, position_ids, attention_mask)

        if not self.eval_cfgs.cloze_eval:
            self.num_original_tokens = info.numpy()[0][0]
            self.num_tokenized_tokens = info.numpy()[0][1]

            masked_lm_loss = paddle.nn.functional.cross_entropy(preds, labels, reduction="none")
            loss = paddle.sum(masked_lm_loss * loss_mask)

            self.num_examples = info.numpy()[0][0]

            outputs = paddle.argmax(preds, -1)
            acc = paddle.cast(outputs == labels, "float32")
            acc = paddle.where(paddle.cast(loss_mask, "bool"), acc, paddle.ones_like(acc))
            acc = paddle.sum(paddle.prod(acc, -1))

        self.first_step = False
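
    # For LM evaluation the score accumulates token-averaged loss (turned into
    # perplexity at epoch end); for cloze evaluation it accumulates the number
    # of exactly-correct completions.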
    def validation_step_end(self, log_dict):
        speed = 1.0 / log_dict["eval_cost"]

        if not self.eval_cfgs.cloze_eval:
            self.total_score += log_dict["loss"] * self.configs.Engine.logging_freq / (self.num_tokenized_tokens - 1)

            self.total_score += log_dict["loss"] * self.configs.Engine.logging_freq

            "[eval] epoch: %d, batch: %d, %s: %.9f, speed: %.2f step/s"
            % (log_dict["epoch"], log_dict["batch"], self.score_name, self.total_score, speed)

    def validation_epoch_end(self, log_dict):
        if not self.eval_cfgs.cloze_eval:
            total_loss = float(self.total_score)
            ppl = math.exp(min(20, total_loss))
            token_ratio = (self.num_tokenized_tokens - 1) / (self.num_original_tokens - 1)
            adjusted_ppl = math.exp(min(20, total_loss * token_ratio))
            string = " validation results on {} | ".format(self.eval_cfgs.eval_path)
            string += "avg loss: {:.4E} | ".format(total_loss)
            string += "ppl: {:.4E} | ".format(ppl)
            string += "adjusted ppl: {:.4E} | ".format(adjusted_ppl)
            string += "token ratio: {} |".format(token_ratio)

            num_correct = float(self.total_score)
            acc = float(num_correct / self.num_examples)
            string = " validation results on {} | ".format(self.eval_cfgs.eval_path)
            string += "number correct: {:.4E} | ".format(num_correct)
            string += "total examples: {:.4E} | ".format(self.num_examples)
            string += "avg accuracy: {:.4E}".format(acc)

    def input_spec(self):
        return [
            InputSpec(shape=[None, None], name="tokens", dtype="int64"),
            InputSpec(shape=[None, None], name="ids", dtype="int64"),
        ]