# transformers — test_tokenization_bert_tf.py (106 lines, 4.7 KB)
import unittest
2
from pathlib import Path
3
from tempfile import TemporaryDirectory
4

5
from transformers import AutoConfig, TFAutoModel, is_tensorflow_text_available, is_tf_available
6
from transformers.models.bert.tokenization_bert import BertTokenizer
7
from transformers.testing_utils import require_tensorflow_text, require_tf, slow
8

9

10
if is_tf_available():
    import tensorflow as tf

    from transformers.modeling_tf_utils import keras

if is_tensorflow_text_available():
    from transformers.models.bert import TFBertTokenizer


# Checkpoints whose Python/TF tokenizer pairs are compared in the tests below.
TOKENIZER_CHECKPOINTS = ["google-bert/bert-base-uncased", "google-bert/bert-base-cased"]
# Tiny model used only to build a config for the SavedModel export test.
TINY_MODEL_CHECKPOINT = "hf-internal-testing/tiny-bert-tf-only"

if is_tf_available():
    from transformers.modeling_tf_utils import keras

    class ModelToSave(keras.Model):
        """Minimal Keras model pairing an in-graph tokenizer with a tiny BERT.

        Used by the SavedModel export test to check that a tokenizer + model
        pipeline can be saved and reloaded as a single artifact.
        """

        def __init__(self, tokenizer):
            super().__init__()
            # The TF tokenizer becomes part of the model graph.
            self.tokenizer = tokenizer
            # Build from config only — no pretrained weights are loaded.
            config = AutoConfig.from_pretrained(TINY_MODEL_CHECKPOINT)
            self.bert = TFAutoModel.from_config(config)

        def call(self, inputs):
            """Tokenize raw string inputs in-graph and return BERT's pooler output."""
            tokenized = self.tokenizer(inputs)
            out = self.bert(tokenized)
            return out["pooler_output"]


@require_tf
@require_tensorflow_text
class BertTokenizationTest(unittest.TestCase):
    """Tests for the in-graph TF BERT tokenizer.

    The TF tokenizers are usually going to be used as pretrained tokenizers from
    existing model checkpoints, so that's what we focus on here.
    """

    def setUp(self):
        """Load matched pairs of Python and TF tokenizers plus a set of tricky test strings."""
        super().setUp()

        self.tokenizers = [BertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        self.tf_tokenizers = [TFBertTokenizer.from_pretrained(checkpoint) for checkpoint in TOKENIZER_CHECKPOINTS]
        # Use a unittest assertion instead of a bare `assert`, which is stripped when Python runs with -O.
        self.assertEqual(len(self.tokenizers), len(self.tf_tokenizers))

        # Deliberately awkward inputs: control characters, CJK, rare ideographs, accents
        # and unusual Latin letters, to exercise text normalization paths.
        self.test_sentences = [
            "This is a straightforward English test sentence.",
            "This one has some weird characters\rto\nsee\r\nif  those\u00E9break things.",
            "Now we're going to add some Chinese: 一 二 三 一二三",
            "And some much more rare Chinese: 齉 堃 齉堃",
            "Je vais aussi écrire en français pour tester les accents",
            "Classical Irish also has some unusual characters, so in they go: Gaelaċ, ꝼ",
        ]
        self.paired_sentences = list(zip(self.test_sentences, self.test_sentences[::-1]))

    def test_output_equivalence(self):
        """The TF tokenizer must produce the same outputs as the Python tokenizer."""
        for tokenizer, tf_tokenizer in zip(self.tokenizers, self.tf_tokenizers):
            for test_inputs in (self.test_sentences, self.paired_sentences):
                python_outputs = tokenizer(test_inputs, return_tensors="tf", padding="longest")
                tf_outputs = tf_tokenizer(test_inputs)

                for key in python_outputs.keys():
                    self.assertTrue(tf.reduce_all(python_outputs[key].shape == tf_outputs[key].shape))
                    # Cast before comparing so a dtype mismatch between the two outputs can't fail the check.
                    self.assertTrue(tf.reduce_all(tf.cast(python_outputs[key], tf.int64) == tf_outputs[key]))

    @slow
    def test_different_pairing_styles(self):
        """Pre-paired tuples and separate text/text_pair lists must tokenize identically."""
        for tf_tokenizer in self.tf_tokenizers:
            merged_outputs = tf_tokenizer(self.paired_sentences)
            separated_outputs = tf_tokenizer(
                text=[sentence[0] for sentence in self.paired_sentences],
                text_pair=[sentence[1] for sentence in self.paired_sentences],
            )
            for key in merged_outputs.keys():
                self.assertTrue(tf.reduce_all(tf.cast(merged_outputs[key], tf.int64) == separated_outputs[key]))

    @slow
    def test_graph_mode(self):
        """tf.function-compiled tokenization must match eager tokenization."""
        for tf_tokenizer in self.tf_tokenizers:
            compiled_tokenizer = tf.function(tf_tokenizer)
            for test_inputs in (self.test_sentences, self.paired_sentences):
                test_inputs = tf.constant(test_inputs)
                compiled_outputs = compiled_tokenizer(test_inputs)
                eager_outputs = tf_tokenizer(test_inputs)

                for key in eager_outputs.keys():
                    self.assertTrue(tf.reduce_all(eager_outputs[key] == compiled_outputs[key]))

    @slow
    def test_export_for_inference(self):
        """A tokenizer + model pipeline must survive a SavedModel export/reload round trip."""
        for tf_tokenizer in self.tf_tokenizers:
            model = ModelToSave(tokenizer=tf_tokenizer)
            test_inputs = tf.convert_to_tensor(self.test_sentences)
            out = model(test_inputs)  # Build model with some sample inputs
            with TemporaryDirectory() as tempdir:
                save_path = Path(tempdir) / "saved.model"
                model.export(save_path)
                loaded_model = tf.saved_model.load(save_path)
                # Bug fix: call the loaded model while the SavedModel directory still exists.
                # tf.saved_model.load may read variables/assets from disk lazily, so using the
                # model after TemporaryDirectory cleanup has deleted its files was unsafe.
                loaded_output = loaded_model.serve(test_inputs)
            # We may see small differences because the loaded model is compiled, so we need an epsilon for the test
            self.assertLessEqual(tf.reduce_max(tf.abs(out - loaded_output)), 1e-5)

Cookie usage

We use cookies in accordance with the Privacy Policy and the Cookie Policy.

By clicking "Accept", you give JSC "SberTech" consent to process your personal data in order to improve our website and the GitVerse service and to make them more convenient to use.

You can disable cookies yourself in your browser settings.