1
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
3
# Licensed under the Apache License, Version 2.0 (the "License");
4
# you may not use this file except in compliance with the License.
5
# You may obtain a copy of the License at
7
# http://www.apache.org/licenses/LICENSE-2.0
9
# Unless required by applicable law or agreed to in writing, software
10
# distributed under the License is distributed on an "AS IS" BASIS,
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
# See the License for the specific language governing permissions and
13
# limitations under the License.
23
from paddlenlp.data import (
24
DataCollatorForLanguageModeling,
25
DataCollatorForTokenClassification,
26
DataCollatorForWholeWordMask,
27
DataCollatorWithPadding,
28
default_data_collator,
30
from paddlenlp.trainer import set_seed
31
from paddlenlp.transformers import BertTokenizer
34
class DataCollatorIntegrationTest(unittest.TestCase):
36
self.tmpdirname = tempfile.mkdtemp()
38
vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
39
self.vocab_file = os.path.join(self.tmpdirname, "vocab.txt")
40
with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
41
vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
44
shutil.rmtree(self.tmpdirname)
46
def test_default_with_dict(self):
    """Exercise ``default_data_collator`` on lists of feature dicts.

    Four scenarios are checked in turn: scalar ``label`` values, list-valued
    ``label_ids``, ``inputs`` given as NumPy arrays, and ``label`` values
    that are already paddle tensors. In every case the batch is expected to
    expose the labels under the ``"labels"`` key with dtype ``paddle.int64``.
    """
    # Scalar "label" entries.
    features = [{"label": i, "inputs": [0, 1, 2, 3, 4, 5]} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue(batch["labels"].equal_all(paddle.to_tensor(list(range(8)))))
    self.assertEqual(batch["labels"].dtype, paddle.int64)
    self.assertEqual(batch["inputs"].shape, [8, 6])

    # List-valued "label_ids" entries; the loop index is not needed here.
    features = [{"label_ids": [0, 1, 2], "inputs": [0, 1, 2, 3, 4, 5]} for _ in range(8)]
    batch = default_data_collator(features)
    self.assertTrue(batch["labels"].equal_all(paddle.to_tensor([[0, 1, 2]] * 8)))
    self.assertEqual(batch["labels"].dtype, paddle.int64)
    self.assertEqual(batch["inputs"].shape, [8, 6])

    # Inputs can already be NumPy arrays.
    features = [{"label": i, "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue(batch["labels"].equal_all(paddle.to_tensor(list(range(8)))))
    self.assertEqual(batch["labels"].dtype, paddle.int64)
    self.assertEqual(batch["inputs"].shape, [8, 10])

    # Labels can already be paddle tensors.
    features = [{"label": paddle.to_tensor(i), "inputs": np.random.randint(0, 10, [10])} for i in range(8)]
    batch = default_data_collator(features)
    self.assertTrue(batch["labels"].equal_all(paddle.to_tensor(list(range(8)))))
    self.assertEqual(batch["labels"].dtype, paddle.int64)
    self.assertEqual(batch["inputs"].shape, [8, 10])
77
def test_default_classification_and_regression(self):
    """Check the label dtype chosen by ``default_data_collator``.

    Integer labels are expected to yield ``paddle.int64`` and float labels
    ``paddle.float32``.
    """
    collate = default_data_collator

    # Classification: integer labels.
    int_labelled = [{"input_ids": [0, 1, 2, 3, 4], "label": idx} for idx in range(4)]
    int_batch = collate(int_labelled)
    self.assertEqual(int_batch["labels"].dtype, paddle.int64)

    # Regression: float labels.
    float_labelled = [{"input_ids": [0, 1, 2, 3, 4], "label": float(idx)} for idx in range(4)]
    float_batch = collate(float_labelled)
    self.assertEqual(float_batch["labels"].dtype, paddle.float32)
88
def test_default_with_no_labels(self):
    """Check that ``None`` labels do not produce a ``"labels"`` batch entry.

    The same expectations are applied to both label spellings, ``label``
    and ``label_ids``.
    """
    for label_key in ("label", "label_ids"):
        samples = [{label_key: None, "inputs": [0, 1, 2, 3, 4, 5]} for _ in range(8)]
        collated = default_data_collator(samples)
        self.assertTrue("labels" not in collated)
        self.assertEqual(collated["inputs"].shape, [8, 6])
100
def test_data_collator_with_padding(self):
    """Check the padded shapes produced by ``DataCollatorWithPadding``.

    Uses two samples of length 3 and 6 and checks default padding,
    ``padding="max_length"`` with ``max_length=10``, and
    ``pad_to_multiple_of=8``.
    """
    tokenizer = BertTokenizer(self.vocab_file)
    samples = [{"input_ids": [0, 1, 2]}, {"input_ids": [0, 1, 2, 3, 4, 5]}]

    # Default settings: expect padding up to the longest sample (6).
    collate = DataCollatorWithPadding(tokenizer)
    collated = collate(samples)
    self.assertEqual(collated["input_ids"].shape, [2, 6])
    first_row = collated["input_ids"][0].tolist()
    self.assertEqual(first_row, [0, 1, 2] + [tokenizer.pad_token_id] * 3)

    # Fixed maximum length.
    collate = DataCollatorWithPadding(tokenizer, padding="max_length", max_length=10)
    collated = collate(samples)
    self.assertEqual(collated["input_ids"].shape, [2, 10])

    # Length rounded up to a multiple of 8.
    collate = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    collated = collate(samples)
    self.assertEqual(collated["input_ids"].shape, [2, 8])
117
def test_data_collator_for_token_classification(self):
    """Check ``DataCollatorForTokenClassification`` on plain Python lists.

    Two samples of length 3 and 6 are collated with default settings,
    ``padding="max_length"``, ``pad_to_multiple_of=8`` and a custom
    ``label_pad_token_id``. The assertions expect labels to be padded with
    ``-100`` unless ``label_pad_token_id`` is given. Finally the labels are
    removed to check that collation still works without them.
    """
    tokenizer = BertTokenizer(self.vocab_file)
    # Bind the samples to ``features``; every collator call below reads it.
    features = [
        {"input_ids": [0, 1, 2], "labels": [0, 1, 2]},
        {"input_ids": [0, 1, 2, 3, 4, 5], "labels": [0, 1, 2, 3, 4, 5]},
    ]

    # Default settings.
    data_collator = DataCollatorForTokenClassification(tokenizer)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 6])
    self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
    self.assertEqual(batch["labels"].shape, [2, 6])
    self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-100] * 3)

    # Fixed maximum length.
    data_collator = DataCollatorForTokenClassification(tokenizer, padding="max_length", max_length=10)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 10])
    self.assertEqual(batch["labels"].shape, [2, 10])

    # Length rounded up to a multiple of 8.
    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 8])
    self.assertEqual(batch["labels"].shape, [2, 8])

    # Custom label padding value.
    data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 6])
    self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
    self.assertEqual(batch["labels"].shape, [2, 6])
    self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)

    # Samples without labels must still be collated.
    for feature in features:
        feature.pop("labels")

    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 6])
    self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
154
def test_data_collator_for_token_classification_works_with_pt_tensors(self):
    """Check ``DataCollatorForTokenClassification`` on tensor-valued samples.

    Same scenarios as the list-based token classification test, but the
    ``input_ids`` and ``labels`` of each sample are built with
    ``paddle.to_tensor``. Despite the ``pt`` in the method name, the
    tensors used here are paddle tensors.
    """
    tokenizer = BertTokenizer(self.vocab_file)
    # Bind the samples to ``features``; every collator call below reads it.
    features = [
        {"input_ids": paddle.to_tensor([0, 1, 2]), "labels": paddle.to_tensor([0, 1, 2])},
        {"input_ids": paddle.to_tensor([0, 1, 2, 3, 4, 5]), "labels": paddle.to_tensor([0, 1, 2, 3, 4, 5])},
    ]

    # Default settings.
    data_collator = DataCollatorForTokenClassification(tokenizer)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 6])
    self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
    self.assertEqual(batch["labels"].shape, [2, 6])
    self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-100] * 3)

    # Fixed maximum length.
    data_collator = DataCollatorForTokenClassification(tokenizer, padding="max_length", max_length=10)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 10])
    self.assertEqual(batch["labels"].shape, [2, 10])

    # Length rounded up to a multiple of 8.
    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 8])
    self.assertEqual(batch["labels"].shape, [2, 8])

    # Custom label padding value.
    data_collator = DataCollatorForTokenClassification(tokenizer, label_pad_token_id=-1)
    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 6])
    self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
    self.assertEqual(batch["labels"].shape, [2, 6])
    self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)

    # Samples without labels must still be collated.
    for feature in features:
        feature.pop("labels")

    batch = data_collator(features)
    self.assertEqual(batch["input_ids"].shape, [2, 6])
    self.assertEqual(batch["input_ids"][0].tolist(), [0, 1, 2] + [tokenizer.pad_token_id] * 3)
192
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
    """Shared checks for ``DataCollatorForLanguageModeling``.

    Args:
        no_pad_features: samples collated first in every scenario; the
            assertions expect a ``[2, 10]`` batch from them.
        pad_features: samples collated second in every scenario; the
            assertions expect the same batch shape, and a ``ValueError``
            once the tokenizer's pad token has been cleared.

    The ``mlm=False`` configuration is checked first, then the default
    masked-language-modelling configuration, each with and without
    ``pad_to_multiple_of=8``.
    """

    def check_shapes(batch, width):
        # Both tensors are expected to share the same [2, width] shape.
        self.assertEqual(batch["input_ids"].shape, [2, width])
        self.assertEqual(batch["labels"].shape, [2, width])

    def check_masking(batch, mask_id):
        # At least one position is masked, and every unmasked position
        # carries the label -100.
        masked_tokens = batch["input_ids"] == mask_id
        self.assertTrue(paddle.any(masked_tokens))
        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

    tokenizer = BertTokenizer(self.vocab_file)

    # mlm=False, default padding.
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    for samples in (no_pad_features, pad_features):
        check_shapes(collator(samples), 10)

    # mlm=False, length rounded up to a multiple of 8.
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
    for samples in (no_pad_features, pad_features):
        check_shapes(collator(samples), 16)

    # Without a pad token, collating uneven samples must fail.
    tokenizer._pad_token = None
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    with self.assertRaises(ValueError):
        # Expect error due to padding token missing
        collator(pad_features)

    set_seed(3)  # For reproducibility
    # Fresh tokenizer: the previous one had its pad token cleared.
    tokenizer = BertTokenizer(self.vocab_file)

    # Masked language modelling, default padding.
    collator = DataCollatorForLanguageModeling(tokenizer)
    for samples in (no_pad_features, pad_features):
        batch = collator(samples)
        check_shapes(batch, 10)
        check_masking(batch, tokenizer.mask_token_id)

    # Masked language modelling, length rounded up to a multiple of 8.
    collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
    for samples in (no_pad_features, pad_features):
        batch = collator(samples)
        check_shapes(batch, 16)
        check_masking(batch, tokenizer.mask_token_id)
254
def test_data_collator_for_language_modeling(self):
    """Run the shared language-modelling checks on two input layouts.

    The samples are first passed as dicts with an ``input_ids`` key, then
    as bare lists of token ids.
    """
    # Dict-style samples. Every sample gets its own freshly built list.
    dict_no_pad = [{"input_ids": list(range(10))} for _ in range(2)]
    dict_pad = [{"input_ids": list(range(length))} for length in (5, 10)]
    self._test_no_pad_and_pad(dict_no_pad, dict_pad)

    # Bare token-id lists.
    list_no_pad = [list(range(10)) for _ in range(2)]
    list_pad = [list(range(length)) for length in (5, 10)]
    self._test_no_pad_and_pad(list_no_pad, list_pad)
263
def test_data_collator_for_whole_word_mask(self):
    """Check the batch shapes produced by ``DataCollatorForWholeWordMask``.

    Two samples of ten token ids each are collated with
    ``return_tensors="pd"``; ``input_ids`` and ``labels`` are both expected
    to have shape ``[2, 10]``.
    """
    samples = [{"input_ids": list(range(10))} for _ in range(2)]

    tokenizer = BertTokenizer(self.vocab_file)
    collator = DataCollatorForWholeWordMask(tokenizer, return_tensors="pd")
    collated = collator(samples)

    for key in ("input_ids", "labels"):
        self.assertEqual(collated[key].shape, [2, 10])
274
tokenizer = BertTokenizer(self.vocab_file)
276
{"input_ids": [0, 1, 2, 3, 4], "token_type_ids": [0, 1, 2, 3, 4], "next_sentence_label": i}
279
data_collator = DataCollatorForLanguageModeling(tokenizer)
280
batch = data_collator(features)
282
self.assertEqual(batch["input_ids"].shape, [2, 5])
283
self.assertEqual(batch["token_type_ids"].shape, [2, 5])
284
self.assertEqual(batch["labels"].shape, [2, 5])
286
batch["next_sentence_label"].shape,
292
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
293
batch = data_collator(features)
295
self.assertEqual(batch["input_ids"].shape, [2, 8])
296
self.assertEqual(batch["token_type_ids"].shape, [2, 8])
297
self.assertEqual(batch["labels"].shape, [2, 8])
299
batch["next_sentence_label"].shape,
306
tokenizer = BertTokenizer(self.vocab_file)
309
"input_ids": paddle.to_tensor([0, 1, 2, 3, 4]),
310
"token_type_ids": paddle.to_tensor([0, 1, 2, 3, 4]),
311
"sentence_order_label": i,
315
data_collator = DataCollatorForLanguageModeling(tokenizer)
316
batch = data_collator(features)
318
self.assertEqual(batch["input_ids"].shape, [2, 5])
319
self.assertEqual(batch["token_type_ids"].shape, [2, 5])
320
self.assertEqual(batch["labels"].shape, [2, 5])
322
batch["sentence_order_label"].shape,
328
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
329
batch = data_collator(features)
331
self.assertEqual(batch["input_ids"].shape, [2, 8])
332
self.assertEqual(batch["token_type_ids"].shape, [2, 8])
333
self.assertEqual(batch["labels"].shape, [2, 8])
335
batch["sentence_order_label"].shape,