1
#include <gtest/gtest.h>
3
#include <torch/torch.h>
5
#include <test/cpp/api/support.h>
7
using namespace torch::nn;
9
// Test fixture for all transformer tests in this file. SeedingFixture
// presumably seeds the RNG per test so the hard-coded reference outputs
// below are reproducible (the tests describe themselves as deterministic).
struct TransformerTest : torch::test::SeedingFixture {};
11
// a generic function to set constants for parameters so we have fixed result
12
// for deterministic test
13
template <typename Model>
14
void set_parameter_to_constants(
16
const torch::TensorOptions& tensor_options) {
17
torch::NoGradGuard guard;
18
for (auto& p : model->parameters()) {
19
auto sz = p.view(-1).size(0);
20
p.copy_(torch::cos(torch::arange(0, sz, tensor_options).view(p.sizes())));
24
// a generic function to provide consistent encoder/decoder layer for all the
26
template <typename T_LAYER, typename T_OPTIONS>
27
T_LAYER get_a_test_layer(
28
const torch::TensorOptions& tensor_options,
29
bool use_callable_activation) {
32
int64_t dim_feedforward = 16;
35
// activation is always ReLU here and it can be adjusted later depending on
37
T_LAYER layer(T_OPTIONS(d_model, nhead)
38
.dim_feedforward(dim_feedforward)
40
if (tensor_options.device() == torch::kCUDA) {
41
layer->to(torch::kCUDA);
43
if (use_callable_activation) {
44
layer.get()->options.activation(
45
[&](const torch::Tensor& t) { return torch::nn::functional::relu(t); });
48
// set constant weights of the model
49
set_parameter_to_constants<T_LAYER>(layer, tensor_options);
54
void transformer_encoder_layer_test_helper(
56
bool use_callable_activation) {
57
// this is a deterministic test for TransformerEncoderLayer
58
torch::Device device = is_cuda ? torch::kCUDA : torch::kCPU;
59
torch::TensorOptions tensor_options =
60
torch::TensorOptions().dtype(torch::kFloat32).device(device);
62
TransformerEncoderLayer model =
63
get_a_test_layer<TransformerEncoderLayer, TransformerEncoderLayerOptions>(
64
tensor_options, use_callable_activation);
67
torch::Tensor encoder_input =
68
torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
69
torch::Tensor result = model(encoder_input).detach();
70
torch::Tensor ref_output = torch::tensor(
71
{{{2.258703, 0.127985, -0.697881, 0.170862}}}, tensor_options);
72
ASSERT_EQ(result.sizes(), ref_output.sizes());
74
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
76
// all 0 values are NOT masked. This shouldn't mask anything
77
torch::Tensor mask = torch::tensor({{0}}, tensor_options) == 1;
80
/*src_mask=*/torch::Tensor{},
81
/*src_key_padding_mask=*/mask)
83
ASSERT_EQ(result.sizes(), ref_output.sizes());
85
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
87
// all 1 values are masked. Since there is only 1 input embedding this will
89
mask = torch::tensor({{1}}, tensor_options) == 1;
92
/*src_mask=*/torch::Tensor{},
93
/*src_key_padding_mask=*/mask)
95
ASSERT_TRUE(torch::isnan(result).all().item().to<bool>());
99
torch::tensor({{{1, 2, 3, 4}}, {{5, 6, 7, 8}}}, tensor_options);
100
result = model(encoder_input).detach();
101
ref_output = torch::tensor(
102
{{{2.272644, 0.119035, -0.691669, 0.153486}},
103
{{2.272644, 0.119035, -0.691669, 0.153486}}},
105
ASSERT_EQ(result.sizes(), ref_output.sizes());
107
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
109
// all 0 values are NOT masked
110
mask = torch::tensor({{0, 0}}, tensor_options) == 1;
113
/*src_mask=*/torch::Tensor{},
114
/*src_key_padding_mask=*/mask)
116
ASSERT_EQ(result.sizes(), ref_output.sizes());
118
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
121
mask = torch::tensor({{1, 0}}, tensor_options) == 1;
124
/*src_mask=*/torch::Tensor{},
125
/*src_key_padding_mask=*/mask)
127
ref_output = torch::tensor(
128
{{{2.301516, 0.092249, -0.679101, 0.103088}},
129
{{2.301516, 0.092249, -0.679101, 0.103088}}},
131
ASSERT_EQ(result.sizes(), ref_output.sizes());
133
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
136
encoder_input = torch::tensor(
137
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
138
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
139
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
140
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
141
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
143
result = model(encoder_input).detach();
144
ref_output = torch::tensor(
145
{{{2.428589, 0.020835, -0.602055, -0.085249},
146
{2.427987, 0.021213, -0.602496, -0.084103}},
147
{{2.424689, 0.019155, -0.604793, -0.085672},
148
{2.413863, 0.022211, -0.612486, -0.072490}},
149
{{2.433774, 0.021598, -0.598343, -0.087548},
150
{2.425104, 0.019748, -0.604515, -0.084839}},
151
{{2.436185, 0.022682, -0.596625, -0.087261},
152
{2.433556, 0.021891, -0.598509, -0.086832}},
153
{{2.416246, 0.017512, -0.610712, -0.082961},
154
{2.422901, 0.024187, -0.606178, -0.074929}}},
156
ASSERT_EQ(result.sizes(), ref_output.sizes());
158
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
160
// all 0 values are NOT masked
161
mask = torch::zeros({2, 5}, tensor_options) == 1;
164
/*src_mask=*/torch::Tensor{},
165
/*src_key_padding_mask=*/mask)
167
ASSERT_EQ(result.sizes(), ref_output.sizes());
169
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
171
// mask with 0s and 1s
177
/*src_mask=*/torch::Tensor{},
178
/*src_key_padding_mask=*/mask)
180
ref_output = torch::tensor(
181
{{{2.429026, 0.020793, -0.601741, -0.085642},
182
{2.428811, 0.021445, -0.601912, -0.084252}},
183
{{2.425009, 0.019155, -0.604566, -0.085899},
184
{2.415408, 0.02249, -0.611415, -0.073}},
185
{{2.434199, 0.021682, -0.598039, -0.087699},
186
{2.42598, 0.019941, -0.603896, -0.085091}},
187
{{2.436457, 0.022736, -0.59643, -0.08736},
188
{2.434021, 0.022093, -0.598179, -0.08679}},
189
{{2.416531, 0.017498, -0.610513, -0.083181},
190
{2.4242, 0.024653, -0.605266, -0.074959}}},
192
ASSERT_EQ(result.sizes(), ref_output.sizes());
194
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
197
model.get()->options.activation(torch::kGELU);
198
encoder_input = torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
199
result = model(encoder_input).detach();
200
ref_output = torch::tensor(
201
{{{2.249815, 0.131006, -0.702199, 0.177868}}}, tensor_options);
202
ASSERT_EQ(result.sizes(), ref_output.sizes());
204
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
207
encoder_input = torch::tensor(
208
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
209
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
210
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
211
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
212
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
214
result = model(encoder_input);
215
ref_output = torch::tensor(
216
{{{2.42163188, 0.03227153, -0.60714219, -0.05908082},
217
{2.42151276, 0.03302179, -0.60722523, -0.05762651}},
218
{{2.41926761, 0.02974034, -0.60879519, -0.0621269},
219
{2.41626395, 0.03539356, -0.61087842, -0.04978623}},
220
{{2.42382808, 0.03218872, -0.6055963, -0.06073591},
221
{2.41983477, 0.03085259, -0.60840145, -0.06046414}},
222
{{2.42500749, 0.03328855, -0.60476388, -0.0595334},
223
{2.4237977, 0.03290575, -0.60561789, -0.05940082}},
224
{{2.41383916, 0.02686345, -0.61256377, -0.06380707},
225
{2.42000277, 0.03800944, -0.60824798, -0.04754947}}},
227
ASSERT_EQ(result.sizes(), ref_output.sizes());
229
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
232
TEST_F(TransformerTest, TransformerEncoderLayer) {
233
transformer_encoder_layer_test_helper(
234
/*is_cuda=*/false, /*use_callable_activation=*/false);
235
transformer_encoder_layer_test_helper(
236
/*is_cuda=*/false, /*use_callable_activation=*/true);
239
TEST_F(TransformerTest, TransformerEncoderLayer_CUDA) {
240
transformer_encoder_layer_test_helper(
241
/*is_cuda=*/true, /*use_callable_activation=*/false);
242
transformer_encoder_layer_test_helper(
243
/*is_cuda=*/true, /*use_callable_activation=*/true);
246
void transformer_decoder_layer_test_helper(
248
bool use_callable_activation) {
249
torch::Device device = is_cuda ? torch::kCUDA : torch::kCPU;
250
torch::TensorOptions tensor_options =
251
torch::TensorOptions().dtype(torch::kFloat32).device(device);
253
TransformerDecoderLayer model =
254
get_a_test_layer<TransformerDecoderLayer, TransformerDecoderLayerOptions>(
255
tensor_options, use_callable_activation);
257
// deterministic input
258
torch::Tensor decoder_input =
259
torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
260
torch::Tensor memory_input =
261
torch::tensor({{{60, 70, 80, 90}}}, tensor_options);
262
torch::Tensor result = model(decoder_input, memory_input).detach();
263
torch::Tensor ref_output = torch::tensor(
264
{{{2.314351, 0.094805, -0.671322, 0.101977}}}, tensor_options);
265
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
266
ASSERT_TRUE(torch::allclose(
271
/*equal_nan=*/true));
273
// deterministic input
275
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
276
memory_input = torch::tensor({{{1, 2, 3, 4}}}, tensor_options);
277
result = model(decoder_input, memory_input).detach();
278
ref_output = torch::tensor(
279
{{{2.422245, 0.051716, -0.606338, -0.024756}},
280
{{2.422245, 0.051716, -0.606338, -0.024756}}},
282
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
283
ASSERT_TRUE(torch::allclose(
288
/*equal_nan=*/true));
290
// deterministic input
292
torch::tensor({{{1, 2, 3, 4}}, {{5, 6, 7, 8}}}, tensor_options);
294
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
295
result = model(decoder_input, memory_input).detach();
296
ref_output = torch::tensor(
297
{{{2.343536, 0.085561, -0.654954, 0.074991}},
298
{{2.343536, 0.085561, -0.654954, 0.074991}}},
300
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
301
ASSERT_TRUE(torch::allclose(
306
/*equal_nan=*/true));
308
// deterministic input
309
decoder_input = torch::tensor(
310
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
311
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
312
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
314
memory_input = torch::tensor(
315
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
316
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
317
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
318
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
319
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
321
result = model(decoder_input, memory_input).detach();
322
ref_output = torch::tensor(
323
{{{2.430065, 0.027862, -0.601136, -0.073096},
324
{2.431935, 0.028907, -0.599809, -0.072488}},
325
{{2.428457, 0.027053, -0.602275, -0.073462},
326
{2.431970, 0.029387, -0.599789, -0.071621}},
327
{{2.431934, 0.028196, -0.599802, -0.073809},
328
{2.432306, 0.028858, -0.599542, -0.072846}}},
330
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
331
ASSERT_TRUE(torch::allclose(
336
/*equal_nan=*/true));
339
torch::Tensor t_mask = {};
340
torch::Tensor m_mask = {};
341
torch::Tensor key_padding_mask = torch::zeros({2, 3}, tensor_options) == 1;
342
result = model(decoder_input, memory_input, t_mask, m_mask, key_padding_mask)
344
ref_output = torch::tensor(
345
{{{2.430065, 0.027862, -0.601136, -0.073096},
346
{2.431935, 0.028907, -0.599809, -0.072488}},
347
{{2.428457, 0.027053, -0.602275, -0.073462},
348
{2.431970, 0.029387, -0.599789, -0.071621}},
349
{{2.431934, 0.028196, -0.599802, -0.073809},
350
{2.432306, 0.028858, -0.599542, -0.072846}}},
352
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
353
ASSERT_TRUE(torch::allclose(
358
/*equal_nan=*/true));
361
key_padding_mask[0][2] = 1;
362
key_padding_mask[1][1] = 1;
363
key_padding_mask[1][2] = 1;
364
result = model(decoder_input, memory_input, t_mask, m_mask, key_padding_mask)
366
ref_output = torch::tensor(
367
{{{2.430025, 0.027643, -0.601164, -0.073476},
368
{2.4323, 0.029375, -0.599553, -0.071881}},
369
{{2.428523, 0.026838, -0.602226, -0.07391},
370
{2.432634, 0.029842, -0.599318, -0.071253}},
371
{{2.432278, 0.028152, -0.599555, -0.074139},
372
{2.432659, 0.029244, -0.599294, -0.072382}}},
374
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
375
ASSERT_TRUE(torch::allclose(
380
/*equal_nan=*/true));
382
// memory_key_padding_mask
383
torch::Tensor t_key_padding_mask = {};
384
key_padding_mask = torch::zeros({2, 5}, tensor_options) == 1;
393
ref_output = torch::tensor(
394
{{{2.430065, 0.027862, -0.601136, -0.073096},
395
{2.431935, 0.028907, -0.599809, -0.072488}},
396
{{2.428457, 0.027053, -0.602275, -0.073462},
397
{2.431970, 0.029387, -0.599789, -0.071621}},
398
{{2.431934, 0.028196, -0.599802, -0.073809},
399
{2.432306, 0.028858, -0.599542, -0.072846}}},
401
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
402
ASSERT_TRUE(torch::allclose(
407
/*equal_nan=*/true));
409
// memory_key_padding_mask
410
key_padding_mask[0][4] = 1;
411
key_padding_mask[1][3] = 1;
412
key_padding_mask[1][4] = 1;
421
ref_output = torch::tensor(
422
{{{2.429757, 0.027358, -0.601351, -0.073816},
423
{2.432692, 0.028583, -0.599263, -0.073634}},
424
{{2.428247, 0.02662, -0.602419, -0.074123},
425
{2.432657, 0.029055, -0.599293, -0.072732}},
426
{{2.431515, 0.027687, -0.600096, -0.074459},
427
{2.433075, 0.028543, -0.598987, -0.073985}}},
429
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
430
ASSERT_TRUE(torch::allclose(
435
/*equal_nan=*/true));
438
TEST_F(TransformerTest, TransformerDecoderLayer) {
439
transformer_decoder_layer_test_helper(
440
/*is_cuda=*/false, /*use_callable_activation=*/false);
441
transformer_decoder_layer_test_helper(
442
/*is_cuda=*/false, /*use_callable_activation=*/true);
445
TEST_F(TransformerTest, TransformerDecoderLayer_CUDA) {
446
transformer_decoder_layer_test_helper(
447
/*is_cuda=*/true, /*use_callable_activation=*/false);
448
transformer_decoder_layer_test_helper(
449
/*is_cuda=*/true, /*use_callable_activation=*/true);
452
void transformer_decoder_layer_test_helper_gelu(
454
bool use_callable_activation) {
455
torch::Device device = is_cuda ? torch::kCUDA : torch::kCPU;
456
torch::TensorOptions tensor_options =
457
torch::TensorOptions().dtype(torch::kFloat32).device(device);
459
TransformerDecoderLayer model =
460
get_a_test_layer<TransformerDecoderLayer, TransformerDecoderLayerOptions>(
461
tensor_options, use_callable_activation);
462
if (use_callable_activation) {
463
model.get()->options.activation(
464
[&](const torch::Tensor& t) { return torch::nn::functional::gelu(t); });
466
model.get()->options.activation(torch::kGELU);
469
// deterministic input
470
torch::Tensor decoder_input =
471
torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
472
torch::Tensor memory_input =
473
torch::tensor({{{60, 70, 80, 90}}}, tensor_options);
474
torch::Tensor result = model(decoder_input, memory_input).detach();
475
torch::Tensor ref_output = torch::tensor(
476
{{{2.306435, 0.095946, -0.675796, 0.10687}}}, tensor_options);
477
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
478
ASSERT_TRUE(torch::allclose(
483
/*equal_nan=*/true));
485
// deterministic input
487
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
488
memory_input = torch::tensor({{{1, 2, 3, 4}}}, tensor_options);
489
result = model(decoder_input, memory_input).detach();
490
ref_output = torch::tensor(
491
{{{2.415448, 0.054389, -0.610932, -0.0156613}},
492
{{2.415448, 0.054389, -0.610932, -0.0156613}}},
494
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
495
ASSERT_TRUE(torch::allclose(
500
/*equal_nan=*/true));
502
// deterministic input
504
torch::tensor({{{1, 2, 3, 4}}, {{5, 6, 7, 8}}}, tensor_options);
506
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
507
result = model(decoder_input, memory_input).detach();
508
ref_output = torch::tensor(
509
{{{2.338531, 0.087709, -0.65776, 0.080646}},
510
{{2.338531, 0.087709, -0.65776, 0.080646}}},
512
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
513
ASSERT_TRUE(torch::allclose(
518
/*equal_nan=*/true));
520
// deterministic input
521
decoder_input = torch::tensor(
522
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
523
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
524
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
526
memory_input = torch::tensor(
527
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
528
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
529
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
530
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
531
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
533
result = model(decoder_input, memory_input).detach();
534
ref_output = torch::tensor(
535
{{{2.42049104, 0.03443088, -0.60793706, -0.05436271},
536
{2.42210631, 0.03546578, -0.60679895, -0.05357488}},
537
{{2.41907674, 0.0336104, -0.60892977, -0.05490462},
538
{2.42216881, 0.03586554, -0.6067524, -0.05289126}},
539
{{2.42205716, 0.03488046, -0.60683681, -0.05460596},
540
{2.42240309, 0.0354595, -0.60659063, -0.05378816}}},
542
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
543
ASSERT_TRUE(torch::allclose(
548
/*equal_nan=*/true));
551
TEST_F(TransformerTest, TransformerDecoderLayer_gelu) {
552
transformer_decoder_layer_test_helper_gelu(
553
/*is_cuda=*/false, /*use_callable_activation=*/false);
554
transformer_decoder_layer_test_helper_gelu(
555
/*is_cuda=*/false, /*use_callable_activation=*/true);
558
TEST_F(TransformerTest, TransformerDecoderLayer_gelu_CUDA) {
559
transformer_decoder_layer_test_helper_gelu(
560
/*is_cuda=*/true, /*use_callable_activation=*/false);
561
transformer_decoder_layer_test_helper_gelu(
562
/*is_cuda=*/true, /*use_callable_activation=*/true);
565
void transformer_encoder_test_helper(
567
bool use_callable_activation) {
568
// this is a deterministic test for TransformerEncoderLayer
569
torch::Device device = is_cuda ? torch::kCUDA : torch::kCPU;
570
torch::TensorOptions tensor_options =
571
torch::TensorOptions().dtype(torch::kFloat32).device(device);
573
TransformerEncoderLayer encoder_layer =
574
get_a_test_layer<TransformerEncoderLayer, TransformerEncoderLayerOptions>(
575
tensor_options, use_callable_activation);
577
TransformerEncoder model(TransformerEncoderOptions(encoder_layer, 1));
579
model->to(torch::kCUDA);
582
torch::Tensor encoder_input = torch::tensor(
583
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
584
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
585
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
586
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
587
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
589
torch::Tensor result = model(encoder_input).detach();
590
torch::Tensor ref_output = torch::tensor(
591
{{{2.428589, 0.020835, -0.602055, -0.085249},
592
{2.427987, 0.021213, -0.602496, -0.084103}},
593
{{2.424689, 0.019155, -0.604793, -0.085672},
594
{2.413863, 0.022211, -0.612486, -0.072490}},
595
{{2.433774, 0.021598, -0.598343, -0.087548},
596
{2.425104, 0.019748, -0.604515, -0.084839}},
597
{{2.436185, 0.022682, -0.596625, -0.087261},
598
{2.433556, 0.021891, -0.598509, -0.086832}},
599
{{2.416246, 0.017512, -0.610712, -0.082961},
600
{2.422901, 0.024187, -0.606178, -0.074929}}},
602
ASSERT_EQ(result.sizes(), ref_output.sizes());
604
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
606
// all 0 values are NOT masked
607
torch::Tensor mask = torch::zeros({2, 5}, tensor_options) == 1;
610
/*src_mask=*/torch::Tensor{},
611
/*src_key_padding_mask=*/mask)
613
ASSERT_EQ(result.sizes(), ref_output.sizes());
615
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
617
// mask with 0s and 1s
623
/*src_mask=*/torch::Tensor{},
624
/*src_key_padding_mask=*/mask)
626
ref_output = torch::tensor(
627
{{{2.429026, 0.020793, -0.601741, -0.085642},
628
{2.428811, 0.021445, -0.601912, -0.084252}},
629
{{2.425009, 0.019155, -0.604566, -0.085899},
630
{2.415408, 0.02249, -0.611415, -0.073}},
631
{{2.434199, 0.021682, -0.598039, -0.087699},
632
{2.42598, 0.019941, -0.603896, -0.085091}},
633
{{2.436457, 0.022736, -0.59643, -0.08736},
634
{2.434021, 0.022093, -0.598179, -0.08679}},
635
{{2.416531, 0.017498, -0.610513, -0.083181},
636
{2.4242, 0.024653, -0.605266, -0.074959}}},
638
ASSERT_EQ(result.sizes(), ref_output.sizes());
640
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
642
// test case 2, multiple layers no norm
643
model = TransformerEncoder(TransformerEncoderOptions(encoder_layer, 2));
645
model->to(torch::kCUDA);
649
/*src_mask=*/torch::Tensor{},
650
/*src_key_padding_mask=*/mask)
652
ref_output = torch::tensor(
653
{{{2.419051, 0.017446, -0.608738, -0.085003},
654
{2.419102, 0.017452, -0.608703, -0.085026}},
655
{{2.419043, 0.017445, -0.608744, -0.084999},
656
{2.419052, 0.017446, -0.608738, -0.085004}},
657
{{2.419067, 0.017448, -0.608727, -0.085010},
658
{2.419098, 0.017452, -0.608706, -0.085024}},
659
{{2.419072, 0.017449, -0.608724, -0.085012},
660
{2.419119, 0.017455, -0.608691, -0.085034}},
661
{{2.419019, 0.017442, -0.608761, -0.084989},
662
{2.419075, 0.017449, -0.608722, -0.085014}}},
664
ASSERT_EQ(result.sizes(), ref_output.sizes());
666
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
668
model = TransformerEncoder(TransformerEncoderOptions(encoder_layer, 6));
670
model->to(torch::kCUDA);
674
/*src_mask=*/torch::Tensor{},
675
/*src_key_padding_mask=*/mask)
677
ref_output = torch::tensor(
678
{{{2.419101, 0.017453, -0.608703, -0.085025},
679
{2.419101, 0.017453, -0.608704, -0.085025}},
680
{{2.419101, 0.017453, -0.608703, -0.085025},
681
{2.419101, 0.017453, -0.608704, -0.085025}},
682
{{2.419101, 0.017453, -0.608703, -0.085025},
683
{2.419101, 0.017453, -0.608704, -0.085025}},
684
{{2.419101, 0.017453, -0.608703, -0.085025},
685
{2.419101, 0.017453, -0.608704, -0.085025}},
686
{{2.419101, 0.017453, -0.608703, -0.085025},
687
{2.419101, 0.017453, -0.608704, -0.085025}}},
689
ASSERT_EQ(result.sizes(), ref_output.sizes());
691
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
693
// test case 3, multiple layers with norm
694
LayerNorm norm(LayerNormOptions({encoder_layer.get()->options.d_model()}));
695
model = TransformerEncoder(
696
TransformerEncoderOptions(encoder_layer, 2).norm(AnyModule(norm)));
698
model->to(torch::kCUDA);
702
/*src_mask=*/torch::Tensor{},
703
/*src_key_padding_mask=*/mask)
705
ref_output = torch::tensor(
706
{{{1.695949, -0.357635, -0.893077, -0.445238},
707
{1.695955, -0.357639, -0.893050, -0.445266}},
708
{{1.695948, -0.357634, -0.893082, -0.445233},
709
{1.695950, -0.357635, -0.893077, -0.445238}},
710
{{1.695951, -0.357636, -0.893069, -0.445246},
711
{1.695955, -0.357639, -0.893052, -0.445264}},
712
{{1.695952, -0.357636, -0.893066, -0.445249},
713
{1.695957, -0.357641, -0.893041, -0.445276}},
714
{{1.695946, -0.357632, -0.893095, -0.445220},
715
{1.695952, -0.357637, -0.893065, -0.445251}}},
717
ASSERT_EQ(result.sizes(), ref_output.sizes());
719
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
721
model = TransformerEncoder(
722
TransformerEncoderOptions(encoder_layer, 6).norm(AnyModule(norm)));
724
model->to(torch::kCUDA);
728
/*src_mask=*/torch::Tensor{},
729
/*src_key_padding_mask=*/mask)
731
ref_output = torch::tensor(
732
{{{1.695955, -0.357639, -0.893051, -0.445265},
733
{1.695955, -0.357639, -0.893051, -0.445265}},
734
{{1.695955, -0.357639, -0.893051, -0.445265},
735
{1.695955, -0.357639, -0.893051, -0.445265}},
736
{{1.695955, -0.357639, -0.893051, -0.445265},
737
{1.695955, -0.357639, -0.893051, -0.445265}},
738
{{1.695955, -0.357639, -0.893051, -0.445265},
739
{1.695955, -0.357639, -0.893051, -0.445265}},
740
{{1.695955, -0.357639, -0.893051, -0.445265},
741
{1.695955, -0.357639, -0.893051, -0.445265}}},
743
ASSERT_EQ(result.sizes(), ref_output.sizes());
745
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
748
TEST_F(TransformerTest, TransformerEncoder) {
749
transformer_encoder_test_helper(
750
/*is_cuda=*/false, /*use_callable_activation=*/false);
751
transformer_encoder_test_helper(
752
/*is_cuda=*/false, /*use_callable_activation=*/true);
755
TEST_F(TransformerTest, TransformerEncoder_CUDA) {
756
transformer_encoder_test_helper(
757
/*is_cuda=*/true, /*use_callable_activation=*/false);
758
transformer_encoder_test_helper(
759
/*is_cuda=*/true, /*use_callable_activation=*/true);
762
TEST_F(TransformerTest, PrettyPrintTransformerEncoderLayer) {
764
c10::str(TransformerEncoderLayer(4, 2)),
765
"torch::nn::TransformerEncoderLayerImpl(\n"
766
" (self_attn): torch::nn::MultiheadAttention(\n"
767
" (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
769
" (linear1): torch::nn::Linear(in_features=4, out_features=2048, bias=true)\n"
770
" (dropout): torch::nn::Dropout(p=0.1, inplace=false)\n"
771
" (linear2): torch::nn::Linear(in_features=2048, out_features=4, bias=true)\n"
772
" (norm1): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
773
" (norm2): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
774
" (dropout1): torch::nn::Dropout(p=0.1, inplace=false)\n"
775
" (dropout2): torch::nn::Dropout(p=0.1, inplace=false)\n"
779
TEST_F(TransformerTest, PrettyPrintTransformerEncoder) {
780
LayerNorm norm = LayerNorm(LayerNormOptions({4}));
781
TransformerEncoderOptions options(
782
TransformerEncoderOptions(TransformerEncoderLayerOptions(4, 2), 2)
783
.norm(AnyModule(norm)));
785
c10::str(TransformerEncoder(options)),
786
"torch::nn::TransformerEncoderImpl(\n"
787
" (layers): torch::nn::ModuleList(\n"
788
" (0): torch::nn::TransformerEncoderLayerImpl(\n"
789
" (self_attn): torch::nn::MultiheadAttention(\n"
790
" (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
792
" (linear1): torch::nn::Linear(in_features=4, out_features=2048, bias=true)\n"
793
" (dropout): torch::nn::Dropout(p=0.1, inplace=false)\n"
794
" (linear2): torch::nn::Linear(in_features=2048, out_features=4, bias=true)\n"
795
" (norm1): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
796
" (norm2): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
797
" (dropout1): torch::nn::Dropout(p=0.1, inplace=false)\n"
798
" (dropout2): torch::nn::Dropout(p=0.1, inplace=false)\n"
800
" (1): torch::nn::TransformerEncoderLayerImpl(\n"
801
" (self_attn): torch::nn::MultiheadAttention(\n"
802
" (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
804
" (linear1): torch::nn::Linear(in_features=4, out_features=2048, bias=true)\n"
805
" (dropout): torch::nn::Dropout(p=0.1, inplace=false)\n"
806
" (linear2): torch::nn::Linear(in_features=2048, out_features=4, bias=true)\n"
807
" (norm1): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
808
" (norm2): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
809
" (dropout1): torch::nn::Dropout(p=0.1, inplace=false)\n"
810
" (dropout2): torch::nn::Dropout(p=0.1, inplace=false)\n"
813
" (norm): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
817
TEST_F(TransformerTest, PrettyPrintTransformerDecoderLayer) {
819
c10::str(TransformerDecoderLayer(4, 2)),
820
"torch::nn::TransformerDecoderLayerImpl(\n"
821
" (self_attn): torch::nn::MultiheadAttention(\n"
822
" (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
824
" (multihead_attn): torch::nn::MultiheadAttention(\n"
825
" (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
827
" (linear1): torch::nn::Linear(in_features=4, out_features=2048, bias=true)\n"
828
" (dropout): torch::nn::Dropout(p=0.1, inplace=false)\n"
829
" (linear2): torch::nn::Linear(in_features=2048, out_features=4, bias=true)\n"
830
" (norm1): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
831
" (norm2): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
832
" (norm3): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
833
" (dropout1): torch::nn::Dropout(p=0.1, inplace=false)\n"
834
" (dropout2): torch::nn::Dropout(p=0.1, inplace=false)\n"
835
" (dropout3): torch::nn::Dropout(p=0.1, inplace=false)\n"
839
void transformer_decoder_test_helper(
841
bool use_callable_activation) {
842
// this is a deterministic test for TransformerDecoder
843
torch::Device device = is_cuda ? torch::kCUDA : torch::kCPU;
844
torch::TensorOptions tensor_options =
845
torch::TensorOptions().dtype(torch::kFloat32).device(device);
847
TransformerDecoderLayer decoder_layer =
848
get_a_test_layer<TransformerDecoderLayer, TransformerDecoderLayerOptions>(
849
tensor_options, use_callable_activation);
851
TransformerDecoder model(TransformerDecoderOptions(decoder_layer, 1));
853
model->to(torch::kCUDA);
856
torch::Tensor decoder_input =
857
torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
858
torch::Tensor memory_input =
859
torch::tensor({{{60, 70, 80, 90}}}, tensor_options);
860
torch::Tensor result = model(decoder_input, memory_input).detach();
861
torch::Tensor ref_output = torch::tensor(
862
{{{2.314351, 0.094805, -0.671322, 0.101977}}}, tensor_options);
863
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
864
ASSERT_TRUE(torch::allclose(
869
/*equal_nan=*/true));
871
// deterministic input
873
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
874
memory_input = torch::tensor({{{1, 2, 3, 4}}}, tensor_options);
875
result = model(decoder_input, memory_input).detach();
876
ref_output = torch::tensor(
877
{{{2.422245, 0.051716, -0.606338, -0.024756}},
878
{{2.422245, 0.051716, -0.606338, -0.024756}}},
880
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
881
ASSERT_TRUE(torch::allclose(
886
/*equal_nan=*/true));
888
// deterministic input
890
torch::tensor({{{1, 2, 3, 4}}, {{5, 6, 7, 8}}}, tensor_options);
892
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
893
result = model(decoder_input, memory_input).detach();
894
ref_output = torch::tensor(
895
{{{2.343536, 0.085561, -0.654954, 0.074991}},
896
{{2.343536, 0.085561, -0.654954, 0.074991}}},
898
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
899
ASSERT_TRUE(torch::allclose(
904
/*equal_nan=*/true));
906
// deterministic input
907
decoder_input = torch::tensor(
908
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
909
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
910
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
912
memory_input = torch::tensor(
913
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
914
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
915
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
916
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
917
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
919
result = model(decoder_input, memory_input).detach();
920
ref_output = torch::tensor(
921
{{{2.430065, 0.027862, -0.601136, -0.073096},
922
{2.431935, 0.028907, -0.599809, -0.072488}},
923
{{2.428457, 0.027053, -0.602275, -0.073462},
924
{2.431970, 0.029387, -0.599789, -0.071621}},
925
{{2.431934, 0.028196, -0.599802, -0.073809},
926
{2.432306, 0.028858, -0.599542, -0.072846}}},
928
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
929
ASSERT_TRUE(torch::allclose(
934
/*equal_nan=*/true));
937
torch::Tensor t_mask = {};
938
torch::Tensor m_mask = {};
939
torch::Tensor key_padding_mask = torch::zeros({2, 3}, tensor_options) == 1;
940
result = model(decoder_input, memory_input, t_mask, m_mask, key_padding_mask)
942
ref_output = torch::tensor(
943
{{{2.430065, 0.027862, -0.601136, -0.073096},
944
{2.431935, 0.028907, -0.599809, -0.072488}},
945
{{2.428457, 0.027053, -0.602275, -0.073462},
946
{2.431970, 0.029387, -0.599789, -0.071621}},
947
{{2.431934, 0.028196, -0.599802, -0.073809},
948
{2.432306, 0.028858, -0.599542, -0.072846}}},
950
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
951
ASSERT_TRUE(torch::allclose(
956
/*equal_nan=*/true));
959
key_padding_mask[0][2] = 1;
960
key_padding_mask[1][1] = 1;
961
key_padding_mask[1][2] = 1;
962
result = model(decoder_input, memory_input, t_mask, m_mask, key_padding_mask)
964
ref_output = torch::tensor(
965
{{{2.430025, 0.027643, -0.601164, -0.073476},
966
{2.4323, 0.029375, -0.599553, -0.071881}},
967
{{2.428523, 0.026838, -0.602226, -0.07391},
968
{2.432634, 0.029842, -0.599318, -0.071253}},
969
{{2.432278, 0.028152, -0.599555, -0.074139},
970
{2.432659, 0.029244, -0.599294, -0.072382}}},
972
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
973
ASSERT_TRUE(torch::allclose(
978
/*equal_nan=*/true));
980
// memory_key_padding_mask
981
torch::Tensor t_key_padding_mask = {};
982
key_padding_mask = torch::zeros({2, 5}, tensor_options) == 1;
991
ref_output = torch::tensor(
992
{{{2.430065, 0.027862, -0.601136, -0.073096},
993
{2.431935, 0.028907, -0.599809, -0.072488}},
994
{{2.428457, 0.027053, -0.602275, -0.073462},
995
{2.431970, 0.029387, -0.599789, -0.071621}},
996
{{2.431934, 0.028196, -0.599802, -0.073809},
997
{2.432306, 0.028858, -0.599542, -0.072846}}},
999
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1000
ASSERT_TRUE(torch::allclose(
1005
/*equal_nan=*/true));
1007
// memory_key_padding_mask
1008
key_padding_mask[0][4] = 1;
1009
key_padding_mask[1][3] = 1;
1010
key_padding_mask[1][4] = 1;
1019
ref_output = torch::tensor(
1020
{{{2.429757, 0.027358, -0.601351, -0.073816},
1021
{2.432692, 0.028583, -0.599263, -0.073634}},
1022
{{2.428247, 0.02662, -0.602419, -0.074123},
1023
{2.432657, 0.029055, -0.599293, -0.072732}},
1024
{{2.431515, 0.027687, -0.600096, -0.074459},
1025
{2.433075, 0.028543, -0.598987, -0.073985}}},
1027
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1028
ASSERT_TRUE(torch::allclose(
1033
/*equal_nan=*/true));
1035
// multiple layers no norm
1036
model = TransformerDecoder(TransformerDecoderOptions(decoder_layer, 2));
1038
model->to(torch::kCUDA);
1041
decoder_input = torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
1042
memory_input = torch::tensor({{{60, 70, 80, 90}}}, tensor_options);
1043
result = model(decoder_input, memory_input).detach();
1044
ref_output = torch::tensor(
1045
{{{2.31316, 0.0950293, -0.671995, 0.102802}}}, tensor_options);
1046
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1047
ASSERT_TRUE(torch::allclose(
1052
/*equal_nan=*/true));
1054
// multiple layers no norm
1055
model = TransformerDecoder(TransformerDecoderOptions(decoder_layer, 6));
1057
model->to(torch::kCUDA);
1059
// deterministic input
1060
decoder_input = torch::tensor(
1061
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
1062
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
1063
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
1065
memory_input = torch::tensor(
1066
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
1067
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
1068
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
1069
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
1070
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
1072
result = model(decoder_input, memory_input).detach();
1073
ref_output = torch::tensor(
1074
{{{2.42794, 0.026164, -0.60263, -0.0747591},
1075
{2.43113, 0.0279516, -0.600376, -0.0736896}},
1076
{{2.42794, 0.026164, -0.60263, -0.0747591},
1077
{2.43113, 0.0279516, -0.600376, -0.0736896}},
1078
{{2.42794, 0.026164, -0.60263, -0.0747591},
1079
{2.43113, 0.0279516, -0.600376, -0.0736896}}},
1081
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1082
ASSERT_TRUE(torch::allclose(
1087
/*equal_nan=*/true));
1089
// multiple layers with norm
1090
LayerNorm norm(LayerNormOptions({decoder_layer.get()->options.d_model()}));
1091
model = TransformerDecoder(
1092
TransformerDecoderOptions(decoder_layer, 2).norm(AnyModule(norm)));
1094
model->to(torch::kCUDA);
1097
decoder_input = torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
1098
memory_input = torch::tensor({{{60, 70, 80, 90}}}, tensor_options);
1099
result = model(decoder_input, memory_input).detach();
1100
ref_output = torch::tensor(
1101
{{{1.66166, -0.326986, -1.01466, -0.320017}}}, tensor_options);
1102
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1103
ASSERT_TRUE(torch::allclose(
1108
/*equal_nan=*/true));
1110
// multiple layers with norm
1111
model = TransformerDecoder(
1112
TransformerDecoderOptions(decoder_layer, 6).norm(AnyModule(norm)));
1114
model->to(torch::kCUDA);
1116
// deterministic input
1117
decoder_input = torch::tensor(
1118
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
1119
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
1120
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
1122
memory_input = torch::tensor(
1123
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
1124
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
1125
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
1126
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
1127
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
1129
result = model(decoder_input, memory_input).detach();
1130
ref_output = torch::tensor(
1131
{{{1.69559, -0.357291, -0.894741, -0.443553},
1132
{1.69571, -0.357363, -0.894154, -0.444196}},
1133
{{1.69559, -0.357291, -0.894741, -0.443553},
1134
{1.69571, -0.357363, -0.894154, -0.444196}},
1135
{{1.69559, -0.357291, -0.894741, -0.443553},
1136
{1.69571, -0.357363, -0.894154, -0.444196}}},
1138
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1139
ASSERT_TRUE(torch::allclose(
1144
/*equal_nan=*/true));
1146
// gelu activation test cases
1147
decoder_layer.get()->options.activation(torch::kGELU);
1148
model = TransformerDecoder(TransformerDecoderOptions(decoder_layer, 1));
1150
model->to(torch::kCUDA);
1153
// deterministic input
1154
decoder_input = torch::tensor({{{20, 30, 40, 50}}}, tensor_options);
1155
memory_input = torch::tensor({{{60, 70, 80, 90}}}, tensor_options);
1156
result = model(decoder_input, memory_input).detach();
1157
ref_output = torch::tensor(
1158
{{{2.306435, 0.095946, -0.675796, 0.10687}}}, tensor_options);
1159
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1160
ASSERT_TRUE(torch::allclose(
1165
/*equal_nan=*/true));
1167
// deterministic input
1169
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
1170
memory_input = torch::tensor({{{1, 2, 3, 4}}}, tensor_options);
1171
result = model(decoder_input, memory_input).detach();
1172
ref_output = torch::tensor(
1173
{{{2.415448, 0.054389, -0.610932, -0.0156613}},
1174
{{2.415448, 0.054389, -0.610932, -0.0156613}}},
1176
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1177
ASSERT_TRUE(torch::allclose(
1182
/*equal_nan=*/true));
1184
// deterministic input
1186
torch::tensor({{{1, 2, 3, 4}}, {{5, 6, 7, 8}}}, tensor_options);
1188
torch::tensor({{{9, 10, 11, 12}}, {{11, 12, 13, 14}}}, tensor_options);
1189
result = model(decoder_input, memory_input).detach();
1190
ref_output = torch::tensor(
1191
{{{2.338531, 0.087709, -0.65776, 0.080646}},
1192
{{2.338531, 0.087709, -0.65776, 0.080646}}},
1194
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1195
ASSERT_TRUE(torch::allclose(
1200
/*equal_nan=*/true));
1202
// deterministic input
1203
decoder_input = torch::tensor(
1204
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
1205
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
1206
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
1208
memory_input = torch::tensor(
1209
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
1210
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
1211
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
1212
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
1213
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
1215
result = model(decoder_input, memory_input).detach();
1216
ref_output = torch::tensor(
1217
{{{2.42049104, 0.03443088, -0.60793706, -0.05436271},
1218
{2.42210631, 0.03546578, -0.60679895, -0.05357488}},
1219
{{2.41907674, 0.0336104, -0.60892977, -0.05490462},
1220
{2.42216881, 0.03586554, -0.6067524, -0.05289126}},
1221
{{2.42205716, 0.03488046, -0.60683681, -0.05460596},
1222
{2.42240309, 0.0354595, -0.60659063, -0.05378816}}},
1224
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1225
ASSERT_TRUE(torch::allclose(
1230
/*equal_nan=*/true));
1232
// Multiple layers no norm
1233
model = TransformerDecoder(TransformerDecoderOptions(decoder_layer, 6));
1235
model->to(torch::kCUDA);
1237
decoder_input = torch::tensor(
1238
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
1239
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
1240
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
1242
memory_input = torch::tensor(
1243
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
1244
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
1245
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
1246
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
1247
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
1249
result = model(decoder_input, memory_input).detach();
1250
ref_output = torch::tensor(
1251
{{{2.41859, 0.0328114, -0.609269, -0.0560386},
1252
{2.42138, 0.034598, -0.607316, -0.0546574}},
1253
{{2.41859, 0.0328114, -0.609269, -0.0560386},
1254
{2.42138, 0.034598, -0.607316, -0.0546574}},
1255
{{2.41859, 0.0328114, -0.609269, -0.0560386},
1256
{2.42138, 0.034598, -0.607316, -0.0546574}}},
1258
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1259
ASSERT_TRUE(torch::allclose(
1264
/*equal_nan=*/true));
1266
// Multiple layers with norm
1267
norm = LayerNorm(LayerNormOptions({decoder_layer.get()->options.d_model()}));
1268
model = TransformerDecoder(
1269
TransformerDecoderOptions(decoder_layer, 6).norm(AnyModule(norm)));
1271
model->to(torch::kCUDA);
1274
decoder_input = torch::tensor(
1275
{{{0.4517, 0.6793, 0.5313, 0.0034}, {0.2678, 0.3677, 0.4459, 0.7166}},
1276
{{0.8100, 0.3716, 0.4096, 0.1976}, {0.6958, 0.8844, 0.6081, 0.8315}},
1277
{{0.0494, 0.9343, 0.5955, 0.3830}, {0.5404, 0.3464, 0.9378, 0.6200}}},
1279
memory_input = torch::tensor(
1280
{{{0.7462, 0.6653, 0.5679, 0.4891}, {0.5387, 0.1655, 0.3565, 0.0471}},
1281
{{0.8335, 0.2799, 0.5031, 0.2947}, {0.1402, 0.0318, 0.7636, 0.1346}},
1282
{{0.6333, 0.9344, 0.1376, 0.9938}, {0.8924, 0.2872, 0.6692, 0.2944}},
1283
{{0.9897, 0.6915, 0.3154, 0.1733}, {0.8645, 0.3513, 0.3064, 0.0767}},
1284
{{0.8117, 0.2366, 0.4838, 0.7881}, {0.3718, 0.4945, 0.9511, 0.0864}}},
1286
result = model(decoder_input, memory_input).detach();
1287
ref_output = torch::tensor(
1288
{{{1.69298, -0.355163, -0.906375, -0.431439},
1289
{1.69305, -0.355195, -0.906062, -0.431791}},
1290
{{1.69298, -0.355163, -0.906375, -0.431439},
1291
{1.69305, -0.355195, -0.906062, -0.431791}},
1292
{{1.69298, -0.355163, -0.906375, -0.431439},
1293
{1.69305, -0.355195, -0.906062, -0.431791}}},
1295
ASSERT_EQ(result.sizes().size(), ref_output.sizes().size());
1296
ASSERT_TRUE(torch::allclose(
1301
/*equal_nan=*/true));
1304
// CPU deterministic tests for TransformerDecoder, covering both the enum
// (torch::kReLU) and the callable-activation configuration of the layer.
TEST_F(TransformerTest, TransformerDecoder) {
  transformer_decoder_test_helper(
      /*is_cuda=*/false, /*use_callable_activation=*/false);
  transformer_decoder_test_helper(
      /*is_cuda=*/false, /*use_callable_activation=*/true);
}
// Same deterministic TransformerDecoder checks as above, executed on CUDA.
TEST_F(TransformerTest, TransformerDecoder_CUDA) {
  transformer_decoder_test_helper(
      /*is_cuda=*/true, /*use_callable_activation=*/false);
  transformer_decoder_test_helper(
      /*is_cuda=*/true, /*use_callable_activation=*/true);
}
// Verifies the pretty-printed module hierarchy of a 2-layer
// TransformerDecoder (d_model=4, nhead=2) with a final LayerNorm.
// NOTE(review): the expected string below was reconstructed from a garbled
// extraction — the nested-module indentation (2 spaces per level) and the
// ")\n" closer lines follow the standard torch::nn pretty-print format;
// confirm against a fresh run of c10::str on this module.
TEST_F(TransformerTest, PrettyPrintTransformerDecoder) {
  LayerNorm norm = LayerNorm(LayerNormOptions({4}));
  TransformerDecoderOptions options(
      TransformerDecoderOptions(TransformerDecoderLayerOptions(4, 2), 2)
          .norm(AnyModule(norm)));
  ASSERT_EQ(
      c10::str(TransformerDecoder(options)),
      "torch::nn::TransformerDecoderImpl(\n"
      "  (layers): torch::nn::ModuleList(\n"
      "    (0): torch::nn::TransformerDecoderLayerImpl(\n"
      "      (self_attn): torch::nn::MultiheadAttention(\n"
      "        (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
      "      )\n"
      "      (multihead_attn): torch::nn::MultiheadAttention(\n"
      "        (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
      "      )\n"
      "      (linear1): torch::nn::Linear(in_features=4, out_features=2048, bias=true)\n"
      "      (dropout): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "      (linear2): torch::nn::Linear(in_features=2048, out_features=4, bias=true)\n"
      "      (norm1): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
      "      (norm2): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
      "      (norm3): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
      "      (dropout1): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "      (dropout2): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "      (dropout3): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "    )\n"
      "    (1): torch::nn::TransformerDecoderLayerImpl(\n"
      "      (self_attn): torch::nn::MultiheadAttention(\n"
      "        (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
      "      )\n"
      "      (multihead_attn): torch::nn::MultiheadAttention(\n"
      "        (out_proj): torch::nn::Linear(in_features=4, out_features=4, bias=true)\n"
      "      )\n"
      "      (linear1): torch::nn::Linear(in_features=4, out_features=2048, bias=true)\n"
      "      (dropout): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "      (linear2): torch::nn::Linear(in_features=2048, out_features=4, bias=true)\n"
      "      (norm1): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
      "      (norm2): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
      "      (norm3): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
      "      (dropout1): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "      (dropout2): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "      (dropout3): torch::nn::Dropout(p=0.1, inplace=false)\n"
      "    )\n"
      "  )\n"
      "  (norm): torch::nn::LayerNorm([4], eps=1e-05, elementwise_affine=true)\n"
      ")");
}
void transformer_test_helper(bool is_cuda, bool use_callable_activation) {
1367
// this is a deterministic test for Transformere
1368
torch::Device device = is_cuda ? torch::kCUDA : torch::kCPU;
1369
torch::TensorOptions tensor_options =
1370
torch::TensorOptions().dtype(torch::kFloat32).device(device);
1372
// transformer created encoder/decoder
1373
auto options = TransformerOptions()
1376
.num_encoder_layers(2)
1377
.num_decoder_layers(1)
1378
.dim_feedforward(16)
1380
.activation(torch::kReLU);
1381
if (use_callable_activation) {
1383
[&](const torch::Tensor& t) { return torch::nn::functional::relu(t); });
1385
Transformer model(options);
1387
set_parameter_to_constants<Transformer>(model, tensor_options);
1388
if (tensor_options.device() == torch::kCUDA) {
1389
model->to(torch::kCUDA);
1392
// transformer with customized encoder/decoder
1393
LayerNorm enorm(LayerNormOptions({4}));
1394
TransformerEncoder encoder(
1395
TransformerEncoderOptions(
1396
TransformerEncoderLayerOptions(4, 2).dim_feedforward(16).dropout(0.0),
1398
.norm(AnyModule(enorm)));
1400
LayerNorm dnorm(LayerNormOptions({4}));
1401
TransformerDecoder decoder(
1402
TransformerDecoderOptions(
1403
TransformerDecoderLayerOptions(4, 2).dim_feedforward(16).dropout(0.0),
1405
.norm(AnyModule(dnorm)));
1407
Transformer model_cus(TransformerOptions()
1410
.custom_encoder(AnyModule(encoder))
1411
.custom_decoder(AnyModule(decoder)));
1413
set_parameter_to_constants<Transformer>(model_cus, tensor_options);
1414
if (tensor_options.device() == torch::kCUDA) {
1415
model_cus->to(torch::kCUDA);
1419
torch::Tensor src = torch::tensor(
1420
{{{1.0, 2.0, 3.0, 4.0}, {5.0, 6.0, 7.0, 8.0}},
1421
{{9.0, 10.0, 11.0, 12.0}, {13.0, 14.0, 15.0, 16.0}},
1422
{{17.0, 18.0, 19.0, 20.0}, {21.0, 22.0, 23.0, 24.0}}},
1425
torch::Tensor tgt = torch::tensor(
1426
{{{1.0, 2.0, 3.0, 4.0}, {5.0, 6.0, 7.0, 8.0}},
1427
{{9.0, 10.0, 11.0, 12.0}, {13.0, 14.0, 15.0, 16.0}}},
1430
torch::Tensor ref_output = torch::tensor(
1431
{{{2.695875, 0.347114, -0.044355, -0.549541},
1432
{2.696091, 0.347015, -0.044770, -0.548522}},
1433
{{2.695875, 0.347114, -0.044355, -0.549541},
1434
{2.696091, 0.347015, -0.044770, -0.548522}}},
1436
torch::Tensor result = model(src, tgt);
1437
torch::Tensor result_cus = model_cus(src, tgt);
1438
ASSERT_EQ(result.sizes(), ref_output.sizes());
1439
ASSERT_TRUE(result.equal(result_cus));
1441
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
1443
torch::Tensor src_mask =
1444
Transformer::Impl::generate_square_subsequent_mask(src.size(0))
1445
.to(tensor_options);
1446
ref_output = torch::tensor(
1447
{{{2.695875, 0.347114, -0.044355, -0.549541},
1448
{2.696091, 0.347015, -0.044770, -0.548522}},
1449
{{2.695875, 0.347114, -0.044355, -0.549541},
1450
{2.696091, 0.347015, -0.044770, -0.548522}}},
1452
result = model(src, tgt, src_mask);
1453
result_cus = model_cus(src, tgt, src_mask);
1454
ASSERT_EQ(result.sizes(), ref_output.sizes());
1455
ASSERT_TRUE(result.equal(result_cus));
1457
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
1459
torch::Tensor tgt_key_padding_mask =
1460
torch::zeros({tgt.size(1), tgt.size(0)}, tensor_options) == 1;
1461
tgt_key_padding_mask[0][0] = 1;
1462
tgt_key_padding_mask[1][1] = 1;
1463
ref_output = torch::tensor(
1464
{{{2.696114, 0.347004, -0.044813, -0.548417},
1465
{2.696091, 0.347015, -0.044770, -0.548522}},
1466
{{2.696114, 0.347004, -0.044813, -0.548417},
1467
{2.696091, 0.347015, -0.044770, -0.548522}}},
1476
tgt_key_padding_mask);
1477
result_cus = model_cus(
1484
tgt_key_padding_mask);
1485
ASSERT_EQ(result.sizes(), ref_output.sizes());
1486
ASSERT_TRUE(result.equal(result_cus));
1488
torch::allclose(result, ref_output, 1e-7, 1e-5, /*equal_nan=*/true));
1491
// CPU deterministic tests for the full Transformer module, with both the
// enum activation and the callable activation.
TEST_F(TransformerTest, Transformer) {
  transformer_test_helper(/*is_cuda=*/false, /*use_callable_activation=*/false);
  transformer_test_helper(/*is_cuda=*/false, /*use_callable_activation=*/true);
}
// Same deterministic Transformer checks as above, executed on CUDA.
TEST_F(TransformerTest, Transformer_CUDA) {
  transformer_test_helper(/*is_cuda=*/true, /*use_callable_activation=*/false);
  transformer_test_helper(/*is_cuda=*/true, /*use_callable_activation=*/true);
}
TEST_F(TransformerTest, TransformerArgsCorrectness) {
1502
Transformer model(TransformerOptions()
1505
.num_encoder_layers(2)
1506
.num_decoder_layers(1)
1507
.dim_feedforward(16)
1509
.activation(torch::kReLU));
1511
torch::Tensor src = torch::randn({2, 3, 4});
1512
torch::Tensor tgt = torch::randn({3, 2, 4});
1515
model(src, tgt), "src and tgt should have equal batch size");
1517
tgt = torch::randn({2, 3, 3});
1519
model(src, tgt), "src and tgt should have same feature size as d_model");
1521
src = torch::randn({2, 3});
1522
ASSERT_THROWS_WITH(model(src, tgt), "src and tgt should have 3 dimensions");