# paddlenlp — utilities module (503 lines, 20.2 KB)
1# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14
15import math16
17import numpy as np18import paddle19from paddle.optimizer.lr import LambdaDecay20
21from paddlenlp.transformers import normalize_chars, tokenize_special_chars22
23
def create_dataloader(dataset, mode="train", batch_size=1, batchify_fn=None, trans_fn=None):
    """Build a ``paddle.io.DataLoader`` over ``dataset``.

    Args:
        dataset: A map-style dataset exposing ``.map``.
        mode (str): ``"train"`` enables shuffling and a distributed sampler;
            any other value uses a plain, unshuffled batch sampler.
        batch_size (int): Number of examples per batch.
        batchify_fn (callable, optional): Collate function for the loader.
        trans_fn (callable, optional): Per-example transform applied first.

    Returns:
        paddle.io.DataLoader: Loader yielding collated batches as lists.
    """
    if trans_fn:
        dataset = dataset.map(trans_fn)

    is_train = mode == "train"
    # Training uses a distributed sampler so each worker sees a shard.
    sampler_cls = paddle.io.DistributedBatchSampler if is_train else paddle.io.BatchSampler
    batch_sampler = sampler_cls(dataset, batch_size=batch_size, shuffle=is_train)

    return paddle.io.DataLoader(
        dataset=dataset,
        batch_sampler=batch_sampler,
        collate_fn=batchify_fn,
        return_list=True,
    )
36
class LinearDecayWithWarmup(LambdaDecay):
    """Linear warmup then linear decay learning-rate schedule."""

    def __init__(self, learning_rate, total_steps, warmup, last_epoch=-1, verbose=False):
        """
        Creates a learning rate scheduler, which increases learning rate linearly
        from 0 to given `learning_rate`, after this warmup period learning rate
        would be decreased linearly from the base learning rate to 0.

        Args:
            learning_rate (float):
                The base learning rate. It is a python float number.
            total_steps (int):
                The number of training steps.
            warmup (int or float):
                If int, it means the number of steps for warmup. If float, it means
                the proportion of warmup in total training steps.
            last_epoch (int, optional):
                The index of last epoch. It can be set to restart training. If
                None, it means initial learning rate.
                Defaults to -1.
            verbose (bool, optional):
                If True, prints a message to stdout for each update.
                Defaults to False.
        """
        # An int is an absolute step count; a float is a fraction of total_steps.
        if isinstance(warmup, int):
            warmup_steps = warmup
        else:
            warmup_steps = int(math.floor(warmup * total_steps))

        def lr_lambda(step):
            # Ramp up linearly during warmup; max(1, ...) guards warmup == 0.
            if step < warmup_steps:
                return float(step) / float(max(1, warmup_steps))
            # Afterwards decay linearly to zero, clamped at 0.
            return max(0.0, 1.0 - step / total_steps)

        super(LinearDecayWithWarmup, self).__init__(learning_rate, lr_lambda, last_epoch, verbose)
70
def convert_example(example, tokenizer, max_seq_length=512, is_test=False):
    """
    Builds model inputs from a sequence or a pair of sequences for sequence
    classification tasks by concatenating and adding special tokens. And
    creates a mask from the two sequences for sequence-pair classification
    tasks.

    The convention in Electra/EHealth is:

    - single sequence:
        input_ids:      ``[CLS] X [SEP]``
        token_type_ids: ``  0   0   0``
        position_ids:   ``  0   1   2``

    - a sequence pair:
        input_ids:      ``[CLS] X [SEP] Y [SEP]``
        token_type_ids: ``  0   0   0   1   1``
        position_ids:   ``  0   1   2   3   4``

    Args:
        example (dict): Input data, containing ``text_a`` and optionally
            ``text_b`` and ``label``.
        tokenizer (PretrainedTokenizer): Tokenizer from
            :class:`paddlenlp.transformers.PretrainedTokenizer`.
        max_seq_length (int): Maximum total input sequence length after
            tokenization; longer sequences are truncated.
        is_test (bool): When True, no label is expected or returned.

    Returns:
        tuple: ``(input_ids, token_type_ids, position_ids)`` and, when
        ``is_test`` is False, a trailing ``label`` numpy int64 array.
    """
    # Normalize and split out special characters before tokenization.
    first = tokenize_special_chars(normalize_chars(example["text_a"]))
    second = example.get("text_b", None)
    if second is not None:
        second = tokenize_special_chars(normalize_chars(second))

    encoded = tokenizer(text=first, text_pair=second, max_seq_len=max_seq_length, return_position_ids=True)
    features = (encoded["input_ids"], encoded["token_type_ids"], encoded["position_ids"])

    if is_test:
        return features
    return features + (np.array([example["label"]], dtype="int64"),)
129
def convert_example_ner(example, tokenizer, max_seq_length=512, pad_label_id=-100, is_test=False):
    """
    Builds model inputs from a sequence and creates labels for named-
    entity recognition task CMeEE.

    For example, a sample should be:

    - input_ids:      ``[CLS] x1 x2 [SEP] [PAD]``
    - token_type_ids: ``  0   0  0   0     0``
    - position_ids:   ``  0   1  2   3     0``
    - attention_mask: ``  1   1  1   1     0``
    - label_oth:      `` 32   3 32  32    32`` (optional, label ids of others)
    - label_sym:      ``  4   4  4   4     4`` (optional, label ids of symptom)

    Args:
        example (dict): Input data, containing ``text`` and, unless
            ``is_test``, ``labels`` as a pair ``[oth_ids, sym_ids]``.
        tokenizer (PretrainedTokenizer): Tokenizer from
            :class:`paddlenlp.transformers.PretrainedTokenizer`.
        max_seq_length (int): Maximum total input sequence length after
            tokenization; longer sequences are truncated.
        pad_label_id (int or list/tuple of int): Label id(s) placed at the
            [CLS]/[SEP] positions. A single int is used for both label
            tracks; a 2-element sequence gives the (other, symptom) pad
            ids respectively.
        is_test (bool): Whether the example contains labels or not.

    Returns:
        dict[str, list|np.ndarray]: Sample with `input_ids`,
        `token_type_ids`, `position_ids`, `attention_mask` and, when not
        testing, `label_oth` / `label_sym`.
    """
    encoded_inputs = {}
    text = example["text"]
    # Reserve two positions for [CLS] and [SEP].
    if len(text) > max_seq_length - 2:
        text = text[: max_seq_length - 2]
    text = ["[CLS]"] + [x.lower() for x in text] + ["[SEP]"]
    input_len = len(text)
    encoded_inputs["input_ids"] = tokenizer.convert_tokens_to_ids(text)
    encoded_inputs["token_type_ids"] = np.zeros(input_len)
    encoded_inputs["position_ids"] = list(range(input_len))
    encoded_inputs["attention_mask"] = np.ones(input_len)

    if not is_test:
        # Bug fix: the default ``pad_label_id=-100`` is an int, but the
        # original code unconditionally indexed ``pad_label_id[0]``/``[1]``,
        # raising TypeError whenever the default was used. Accept both an
        # int (shared by both tracks) and a (oth, sym) pair.
        if isinstance(pad_label_id, (list, tuple)):
            pad_oth, pad_sym = pad_label_id[0], pad_label_id[1]
        else:
            pad_oth = pad_sym = pad_label_id
        labels = example["labels"]
        # Truncate labels to match the (possibly truncated) text.
        if input_len - 2 < len(labels[0]):
            labels[0] = labels[0][: input_len - 2]
        if input_len - 2 < len(labels[1]):
            labels[1] = labels[1][: input_len - 2]
        encoded_inputs["label_oth"] = [pad_oth] + labels[0] + [pad_oth]
        encoded_inputs["label_sym"] = [pad_sym] + labels[1] + [pad_sym]

    return encoded_inputs
185
def convert_example_spo(example, tokenizer, num_classes, max_seq_length=512, is_test=False):
    """
    Builds model inputs from a sequence and creates labels for SPO prediction
    task CMeIE.

    For example, a sample should be:

    - input_ids:      ``[CLS] x1 x2 [SEP] [PAD]``
    - token_type_ids: ``  0   0  0   0     0``
    - position_ids:   ``  0   1  2   3     0``
    - attention_mask: ``  1   1  1   1     0``
    - ent_label: ``[[0 1 0 0 0],  # start ids are set as 1
                    [0 0 1 0 0]]  # end ids are set as 1
    - spo_label: a tensor of shape [num_classes, max_batch_len, max_batch_len].
      Set [predicate_id, subject_start_id, object_start_id] as 1
      when (subject, predicate, object) exists.

    Args:
        example (dict): Input data with ``text`` and, unless ``is_test``,
            ``ent_label`` and ``spo_label``.
        tokenizer (PretrainedTokenizer): Tokenizer from
            :class:`paddlenlp.transformers.PretrainedTokenizer`.
        num_classes (int): The number of predicates.
        max_seq_length (int): Maximum total input sequence length after
            tokenization; longer sequences are truncated.
        is_test (bool): Whether the example contains labels or not.

    Returns:
        dict[str, list|np.ndarray]: Sample with `input_ids`,
        `token_type_ids`, `position_ids`, `attention_mask` and, when not
        testing, `ent_label` / `spo_label`.
    """
    raw = example["text"]
    # Keep room for the two special tokens.
    limit = max_seq_length - 2
    if len(raw) > limit:
        raw = raw[:limit]
    tokens = ["[CLS]"] + [ch.lower() for ch in raw] + ["[SEP]"]
    seq_len = len(tokens)

    encoded = {
        "input_ids": tokenizer.convert_tokens_to_ids(tokens),
        "token_type_ids": np.zeros(seq_len),
        "position_ids": list(range(seq_len)),
        "attention_mask": np.ones(seq_len),
    }
    if not is_test:
        encoded["ent_label"] = example["ent_label"]
        encoded["spo_label"] = example["spo_label"]
    return encoded
238
class NERChunkEvaluator(paddle.metric.Metric):
    """
    NERChunkEvaluator computes the precision, recall and F1-score for chunk detection.
    It is often used in sequence tagging tasks, such as Named Entity Recognition (NER).

    Args:
        label_list (list):
            The label list.

    Note:
        Difference from `paddlenlp.metric.ChunkEvaluator`:

        - `paddlenlp.metric.ChunkEvaluator`
            All sequences with non-'O' labels are taken as chunks when computing num_infer.
        - `NERChunkEvaluator`
            Only complete sequences are taken as chunks, namely `B- I- E-` or `S-`.
    """

    def __init__(self, label_list):
        super(NERChunkEvaluator, self).__init__()
        # One id -> label-string mapping per label track (the task keeps
        # several parallel label sequences, e.g. "other" and "symptom").
        self.id2label = [dict(enumerate(x)) for x in label_list]
        self.num_classes = [len(x) for x in label_list]
        # Running chunk counts accumulated across update() calls.
        self.num_infer = 0
        self.num_label = 0
        self.num_correct = 0

    def compute(self, lengths, predictions, labels):
        """
        Computes the prediction, recall and F1-score for chunk detection.

        Args:
            lengths (Tensor):
                The valid length of every sequence, a tensor with shape `[batch_size]`.
            predictions (Tensor):
                The predictions index, a tensor with shape `[batch_size, sequence_length]`.
            labels (Tensor):
                The labels index, a tensor with shape `[batch_size, sequence_length]`.

        Returns:
            tuple: Returns tuple (`num_infer_chunks, num_label_chunks, num_correct_chunks`).

            With the fields:

            - `num_infer_chunks` (Tensor): The number of the inference chunks.
            - `num_label_chunks` (Tensor): The number of the label chunks.
            - `num_correct_chunks` (Tensor): The number of the correct chunks.
        """
        # One prediction/label tensor per label track; tracks must align.
        assert len(predictions) == len(labels)
        assert len(predictions) == len(self.id2label)
        preds = [x.numpy() for x in predictions]
        labels = [x.numpy() for x in labels]

        preds_chunk = set()
        label_chunk = set()
        for idx, (pred, label) in enumerate(zip(preds, labels)):
            for i, case in enumerate(pred):
                # Map ids to tag strings, trimmed to the sequence's valid length.
                case = [self.id2label[idx][x] for x in case[: lengths[i]]]
                preds_chunk |= self.extract_chunk(case, i)
            for i, case in enumerate(label):
                case = [self.id2label[idx][x] for x in case[: lengths[i]]]
                label_chunk |= self.extract_chunk(case, i)

        num_infer = len(preds_chunk)
        num_label = len(label_chunk)
        # Correct chunks are the exact (sample, type, span) intersections.
        num_correct = len(preds_chunk & label_chunk)
        return num_infer, num_label, num_correct

    def update(self, correct):
        # Fold one compute() result into the running totals.
        num_infer, num_label, num_correct = correct
        self.num_infer += num_infer
        self.num_label += num_label
        self.num_correct += num_correct

    def accumulate(self):
        # 1e-6 smoothing avoids division by zero when nothing was counted.
        precision = self.num_correct / (self.num_infer + 1e-6)
        recall = self.num_correct / (self.num_label + 1e-6)
        f1 = 2 * precision * recall / (precision + recall + 1e-6)
        return precision, recall, f1

    def reset(self):
        # Clear accumulated counts for a fresh evaluation pass.
        self.num_infer = 0
        self.num_label = 0
        self.num_correct = 0

    def name(self):
        # Metric names matching the accumulate() return order.
        return "precision", "recall", "f1"

    def extract_chunk(self, sequence, cid=0):
        # Extract complete chunks from a BIES-tagged sequence. A chunk is
        # either "B (I...)? E" with a consistent type suffix, or a single "S".
        # Each chunk is keyed as (case id, entity type, start idx, end idx).
        chunks = set()

        start_idx, cur_idx = 0, 0
        while cur_idx < len(sequence):
            if sequence[cur_idx][0] == "B":
                start_idx = cur_idx
                cur_idx += 1
                # Consume "I-" tags only while the type suffix matches "B-".
                while cur_idx < len(sequence) and sequence[cur_idx][0] == "I":
                    if sequence[cur_idx][2:] == sequence[start_idx][2:]:
                        cur_idx += 1
                    else:
                        break
                if cur_idx < len(sequence) and sequence[cur_idx][0] == "E":
                    if sequence[cur_idx][2:] == sequence[start_idx][2:]:
                        chunks.add((cid, sequence[cur_idx][2:], start_idx, cur_idx))
                        cur_idx += 1
                    # NOTE(review): when an "E" tag's type mismatches, cur_idx
                    # is NOT advanced here; the tag is re-examined by the outer
                    # loop on the next iteration — presumably intentional.
            elif sequence[cur_idx][0] == "S":
                # Single-token chunk: start == end.
                chunks.add((cid, sequence[cur_idx][2:], cur_idx, cur_idx))
                cur_idx += 1
            else:
                # "O" or any incomplete pattern is skipped.
                cur_idx += 1

        return chunks
351
class SPOChunkEvaluator(paddle.metric.Metric):
    """
    SPOChunkEvaluator computes the precision, recall and F1-score for multiple
    chunk detections, including Named Entity Recognition (NER) and SPO Prediction.

    Args:
        num_classes (int):
            The number of predicates.
    """

    def __init__(self, num_classes=None):
        super(SPOChunkEvaluator, self).__init__()
        self.num_classes = num_classes
        self.num_infer_ent = 0
        # SPO counters start at 1e-10 so accumulate() never divides by zero.
        self.num_infer_spo = 1e-10
        self.num_label_ent = 0
        self.num_label_spo = 1e-10
        self.num_correct_ent = 0
        self.num_correct_spo = 0

    def compute(self, lengths, ent_preds, spo_preds, ent_labels, spo_labels):
        """
        Computes the prediction, recall and F1-score for NER and SPO prediction.

        Args:
            lengths (Tensor):
                The valid length of every sequence, a tensor with shape `[batch_size]`.
            ent_preds (Tensor):
                The predictions of entities.
                A tensor with shape `[batch_size, sequence_length, 2]`.
                `ent_preds[:, :, 0]` denotes the start indexes of entities.
                `ent_preds[:, :, 1]` denotes the end indexes of entities.
            spo_preds (Tensor):
                The predictions of predicates between all possible entities.
                A tensor with shape `[batch_size, num_classes, sequence_length, sequence_length]`.
            ent_labels (list[list|tuple]):
                The entity labels' indexes. A list of pair `[start_index, end_index]`.
            spo_labels (list[list|tuple]):
                The SPO labels' indexes. A list of triple `[[subject_start_index, subject_end_index],
                predicate_id, [object_start_index, object_end_index]]`.

        Returns:
            tuple:
                Returns tuple (`num_infer_chunks, num_label_chunks, num_correct_chunks`).
                The `ent` denotes results of NER and the `spo` denotes results of SPO prediction.

            With the fields:

            - `num_infer_chunks` (dict): The number of the inference chunks.
            - `num_label_chunks` (dict): The number of the label chunks.
            - `num_correct_chunks` (dict): The number of the correct chunks.
        """
        ent_preds = ent_preds.numpy()
        spo_preds = spo_preds.numpy()

        # Decode entity spans per sample from the start/end probability maps.
        ent_pred_list = []
        ent_idxs_list = []
        for idx, ent_pred in enumerate(ent_preds):
            # Valid tokens exclude [CLS]/[SEP].
            seq_len = lengths[idx] - 2
            start = np.where(ent_pred[:, 0] > 0.5)[0]
            end = np.where(ent_pred[:, 1] > 0.5)[0]
            ent_pred = []
            ent_idxs = {}
            for x in start:
                # First end position at or after this start.
                y = end[end >= x]
                # Position 0 is [CLS]; anything past seq_len is padding/[SEP].
                if (x == 0) or (x > seq_len):
                    continue
                if len(y) > 0:
                    y = y[0]
                    if y > seq_len:
                        continue
                    # Keyed by raw start index so SPO decoding below can look
                    # spans up; stored spans are shifted by -1 to drop [CLS].
                    ent_idxs[x] = (x - 1, y - 1)
                    ent_pred.append((x - 1, y - 1))
            ent_pred_list.append(ent_pred)
            ent_idxs_list.append(ent_idxs)

        # Decode (subject, predicate, object) triples: keep only those whose
        # subject and object starts correspond to decoded entities.
        spo_preds = spo_preds > 0
        spo_pred_list = [[] for _ in range(len(spo_preds))]
        idxs, preds, subs, objs = np.nonzero(spo_preds)
        for idx, p_id, s_id, o_id in zip(idxs, preds, subs, objs):
            obj = ent_idxs_list[idx].get(o_id, None)
            if obj is None:
                continue
            sub = ent_idxs_list[idx].get(s_id, None)
            if sub is None:
                continue
            spo_pred_list[idx].append((sub, p_id, obj))

        # Count chunks as sets so duplicates are not rewarded.
        correct = {"ent": 0, "spo": 0}
        infer = {"ent": 0, "spo": 0}
        label = {"ent": 0, "spo": 0}
        for ent_pred, ent_true in zip(ent_pred_list, ent_labels):
            ent_true = [tuple(x) for x in ent_true]
            infer["ent"] += len(set(ent_pred))
            label["ent"] += len(set(ent_true))
            correct["ent"] += len(set(ent_pred) & set(ent_true))

        for spo_pred, spo_true in zip(spo_pred_list, spo_labels):
            spo_true = [(tuple(s), p, tuple(o)) for s, p, o in spo_true]
            infer["spo"] += len(set(spo_pred))
            label["spo"] += len(set(spo_true))
            correct["spo"] += len(set(spo_pred) & set(spo_true))

        return infer, label, correct

    def update(self, corrects):
        # corrects is the (infer, label, correct) dict triple from compute().
        assert len(corrects) == 3
        for item in corrects:
            assert isinstance(item, dict)
            for value in item.values():
                if not self._is_number_or_matrix(value):
                    raise ValueError("The numbers must be a number(int) or a numpy ndarray.")
        num_infer, num_label, num_correct = corrects
        self.num_infer_ent += num_infer["ent"]
        self.num_infer_spo += num_infer["spo"]
        self.num_label_ent += num_label["ent"]
        self.num_label_spo += num_label["spo"]
        self.num_correct_ent += num_correct["ent"]
        self.num_correct_spo += num_correct["spo"]

    def accumulate(self):
        # SPO denominators are safe thanks to the 1e-10 initializers.
        spo_precision = self.num_correct_spo / self.num_infer_spo
        spo_recall = self.num_correct_spo / self.num_label_spo
        spo_f1 = 2 * self.num_correct_spo / (self.num_infer_spo + self.num_label_spo)
        # Entity counters start at 0, so guard divisions explicitly.
        ent_precision = self.num_correct_ent / self.num_infer_ent if self.num_infer_ent > 0 else 0.0
        ent_recall = self.num_correct_ent / self.num_label_ent if self.num_label_ent > 0 else 0.0
        ent_f1 = (
            2 * ent_precision * ent_recall / (ent_precision + ent_recall) if (ent_precision + ent_recall) != 0 else 0.0
        )
        return {"entity": (ent_precision, ent_recall, ent_f1), "spo": (spo_precision, spo_recall, spo_f1)}

    def _is_number_or_matrix(self, var):
        # Accept plain numbers, 1-element arrays, or any ndarray.
        def _is_number_(var):
            return (
                isinstance(var, int)
                or isinstance(var, np.int64)
                or isinstance(var, float)
                or (isinstance(var, np.ndarray) and var.shape == (1,))
            )

        return _is_number_(var) or isinstance(var, np.ndarray)

    def reset(self):
        # Restore the same initial values as __init__ (incl. 1e-10 smoothing).
        self.num_infer_ent = 0
        self.num_infer_spo = 1e-10
        self.num_label_ent = 0
        self.num_label_spo = 1e-10
        self.num_correct_ent = 0
        self.num_correct_spo = 0

    def name(self):
        # Metric names matching the accumulate() return structure.
        return {"entity": ("precision", "recall", "f1"), "spo": ("precision", "recall", "f1")}