transformers
106 строк · 4.7 Кб
1import unittest2from pathlib import Path3from tempfile import TemporaryDirectory4
5from transformers import AutoConfig, TFAutoModel, is_tensorflow_text_available, is_tf_available6from transformers.models.bert.tokenization_bert import BertTokenizer7from transformers.testing_utils import require_tensorflow_text, require_tf, slow8
9
10if is_tf_available():11import tensorflow as tf12
13from transformers.modeling_tf_utils import keras14
15if is_tensorflow_text_available():16from transformers.models.bert import TFBertTokenizer17
18
# Real pretrained checkpoints whose Python and in-graph TF tokenizers are compared.
TOKENIZER_CHECKPOINTS = ["google-bert/bert-base-uncased", "google-bert/bert-base-cased"]
# Tiny TF-only BERT checkpoint, kept small so export tests stay fast.
TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only"
if is_tf_available():
    # NOTE: `keras` is already imported under the identical `is_tf_available()`
    # guard above, so the duplicate re-import that used to live here was removed.

    class ModelToSave(keras.Model):
        """Minimal Keras model that bundles an in-graph tokenizer with a tiny BERT.

        Used by the export test to verify that a model containing a TF tokenizer
        can be saved as a SavedModel and reloaded for inference.
        """

        def __init__(self, tokenizer):
            super().__init__()
            self.tokenizer = tokenizer  # expected to be an in-graph (TF) tokenizer
            config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT)
            # Built from config only, so weights are randomly initialized — fine
            # for a save/reload equivalence check.
            self.bert = TFAutoModel.from_config(config)

        def call(self, inputs):
            # Tokenize inside the graph, run BERT, and expose only the pooled output.
            tokenized = self.tokenizer(inputs)
            out = self.bert(tokenized)
            return out["pooler_output"]
37
@require_tf
@require_tensorflow_text
class BertTokenizationTest(unittest.TestCase):
    # The TF tokenizers are usually going to be used as pretrained tokenizers from existing model checkpoints,
    # so that's what we focus on here.

    def setUp(self):
        super().setUp()

        # Build one Python tokenizer and one in-graph TF tokenizer per checkpoint,
        # kept in matching order so they can be compared pairwise.
        self.tokenizers = []
        self.tf_tokenizers = []
        for checkpoint in TOKENIZER_CHECKPOINTS:
            self.tokenizers.append(BertTokenizer.from_pretrained(checkpoint))
            self.tf_tokenizers.append(TFBertTokenizer.from_pretrained(checkpoint))
        assert len(self.tokenizers) == len(self.tf_tokenizers)

        # A mix of ASCII, control characters, CJK, accented Latin and rare
        # codepoints, to exercise the tokenizers' unicode handling.
        self.test_sentences = [
            "This is a straightforward English test sentence.",
            "This one has some weird characters\rto\nsee\r\nif those\u00E9break things.",
            "Now we're going to add some Chinese: 一 二 三 一二三",
            "And some much more rare Chinese: 齉 堃 齉堃",
            "Je vais aussi écrire en français pour tester les accents",
            "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
        ]
        self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))

    def test_output_equivalence(self):
        """The in-graph tokenizer must emit the same tensors as the Python one."""
        for py_tokenizer, graph_tokenizer in zip(self.tokenizers, self.tf_tokenizers):
            for batch in (self.test_sentences, self.paired_sentences):
                python_outputs = py_tokenizer(batch, return_tensors="tf", padding="longest")
                tf_outputs = graph_tokenizer(batch)

                for key in python_outputs:
                    # Shapes first, then values (cast to int64 to match dtypes).
                    self.assertTrue(tf.reduce_all(python_outputs[key].shape == tf_outputs[key].shape))
                    self.assertTrue(tf.reduce_all(tf.cast(python_outputs[key], tf.int64) == tf_outputs[key]))

    @slow
    def test_different_pairing_styles(self):
        """Pre-paired tuples and separate text/text_pair lists must tokenize identically."""
        for graph_tokenizer in self.tf_tokenizers:
            merged_outputs = graph_tokenizer(self.paired_sentences)
            first_texts, second_texts = zip(*self.paired_sentences)
            separated_outputs = graph_tokenizer(
                text=list(first_texts),
                text_pair=list(second_texts),
            )
            for key in merged_outputs:
                self.assertTrue(tf.reduce_all(tf.cast(merged_outputs[key], tf.int64) == separated_outputs[key]))

    @slow
    def test_graph_mode(self):
        """tf.function-compiled tokenization must match eager tokenization."""
        for graph_tokenizer in self.tf_tokenizers:
            compiled_tokenizer = tf.function(graph_tokenizer)
            for batch in (self.test_sentences, self.paired_sentences):
                batch = tf.constant(batch)
                compiled_outputs = compiled_tokenizer(batch)
                eager_outputs = graph_tokenizer(batch)

                for key in eager_outputs:
                    self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key]))

    @slow
    def test_export_for_inference(self):
        """A model embedding the tokenizer must survive a SavedModel round-trip."""
        for graph_tokenizer in self.tf_tokenizers:
            model = ModelToSave(tokenizer=graph_tokenizer)
            test_inputs = tf.convert_to_tensor(self.test_sentences)
            out = model(test_inputs)  # Build model with some sample inputs
            with TemporaryDirectory() as tempdir:
                save_path = Path(tempdir) / "saved.model"
                model.export(save_path)
                loaded_model = tf.saved_model.load(save_path)
                loaded_output = loaded_model.serve(test_inputs)
                # We may see small differences because the loaded model is compiled,
                # so we need an epsilon for the test.
                self.assertLessEqual(tf.reduce_max(tf.abs(out - loaded_output)), 1e-5)