llvm-project

matmul-transpose.cpp
397 строк · 15.1 Кб
Перенос по словам
1
//===-- runtime/matmul-transpose.cpp --------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8

9
// Implements a fused matmul-transpose operation
10
//
11
// There are two main entry points; one establishes a descriptor for the
12
// result and allocates it, and the other expects a result descriptor that
13
// points to existing storage.
14
//
15
// This implementation must handle all combinations of numeric types and
16
// kinds (100 - 165 cases depending on the target), plus all combinations
17
// of logical kinds (16).  A single template undergoes many instantiations
18
// to cover all of the valid possibilities.
19
//
20
// The usefulness of this optimization should be reviewed once Matmul is swapped
21
// to use the faster BLAS routines.
22

23
#include "flang/Runtime/matmul-transpose.h"
24
#include "terminator.h"
25
#include "tools.h"
26
#include "flang/Common/optional.h"
27
#include "flang/Runtime/c-or-cpp.h"
28
#include "flang/Runtime/cpp-type.h"
29
#include "flang/Runtime/descriptor.h"
30
#include <cstring>
31

32
namespace {
33
using namespace Fortran::runtime;
34

35
// Suppress the warnings about calling __host__-only std::complex operators,
36
// defined in C++ STD header files, from __device__ code.
37
RT_DIAG_PUSH
38
RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
39

40
// Contiguous numeric TRANSPOSE(matrix)*matrix multiplication
41
//   TRANSPOSE(matrix(n, rows)) * matrix(n,cols) ->
42
//             matrix(rows, n)  * matrix(n,cols) -> matrix(rows,cols)
43
// The transpose is implemented by swapping the indices of accesses into the LHS
44
//
45
// Straightforward algorithm:
46
//   DO 1 I = 1, NROWS
47
//    DO 1 J = 1, NCOLS
48
//     RES(I,J) = 0
49
//     DO 1 K = 1, N
50
//   1  RES(I,J) = RES(I,J) + X(K,I)*Y(K,J)
51
//
52
// With loop distribution and transposition to avoid the inner sum
53
// reduction and to avoid non-unit strides:
54
//   DO 1 I = 1, NROWS
55
//    DO 1 J = 1, NCOLS
56
//   1 RES(I,J) = 0
57
//   DO 2 J = 1, NCOLS
58
//    DO 2 I = 1, NROWS
59
//     DO 2 K = 1, N
60
//   2  RES(I,J) = RES(I,J) + X(K,I)*Y(K,J) ! loop-invariant last term
61
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
62
    bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>
63
inline static RT_API_ATTRS void MatrixTransposedTimesMatrix(
64
    CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
65
    SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
66
    SubscriptValue n, std::size_t xColumnByteStride = 0,
67
    std::size_t yColumnByteStride = 0) {
68
  using ResultType = CppTypeFor<RCAT, RKIND>;
69

70
  std::memset(product, 0, rows * cols * sizeof *product);
71
  for (SubscriptValue j{0}; j < cols; ++j) {
72
    for (SubscriptValue i{0}; i < rows; ++i) {
73
      for (SubscriptValue k{0}; k < n; ++k) {
74
        ResultType x_ki;
75
        if constexpr (!X_HAS_STRIDED_COLUMNS) {
76
          x_ki = static_cast<ResultType>(x[i * n + k]);
77
        } else {
78
          x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
79
              reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
80
        }
81
        ResultType y_kj;
82
        if constexpr (!Y_HAS_STRIDED_COLUMNS) {
83
          y_kj = static_cast<ResultType>(y[j * n + k]);
84
        } else {
85
          y_kj = static_cast<ResultType>(reinterpret_cast<const YT *>(
86
              reinterpret_cast<const char *>(y) + j * yColumnByteStride)[k]);
87
        }
88
        product[j * rows + i] += x_ki * y_kj;
89
      }
90
    }
91
  }
92
}
93

94
RT_DIAG_POP
95

96
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
97
inline static RT_API_ATTRS void MatrixTransposedTimesMatrixHelper(
98
    CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
99
    SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,
100
    SubscriptValue n, Fortran::common::optional<std::size_t> xColumnByteStride,
101
    Fortran::common::optional<std::size_t> yColumnByteStride) {
102
  if (!xColumnByteStride) {
103
    if (!yColumnByteStride) {
104
      MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, false>(
105
          product, rows, cols, x, y, n);
106
    } else {
107
      MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, false, true>(
108
          product, rows, cols, x, y, n, 0, *yColumnByteStride);
109
    }
110
  } else {
111
    if (!yColumnByteStride) {
112
      MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, false>(
113
          product, rows, cols, x, y, n, *xColumnByteStride);
114
    } else {
115
      MatrixTransposedTimesMatrix<RCAT, RKIND, XT, YT, true, true>(
116
          product, rows, cols, x, y, n, *xColumnByteStride, *yColumnByteStride);
117
    }
118
  }
119
}
120

121
RT_DIAG_PUSH
122
RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
123

124
// Contiguous numeric matrix*vector multiplication
125
//   matrix(rows,n) * column vector(n) -> column vector(rows)
126
// Straightforward algorithm:
127
//   DO 1 I = 1, NROWS
128
//    RES(I) = 0
129
//    DO 1 K = 1, N
130
//   1 RES(I) = RES(I) + X(K,I)*Y(K)
131
// With loop distribution and transposition to avoid the inner
132
// sum reduction and to avoid non-unit strides:
133
//   DO 1 I = 1, NROWS
134
//   1 RES(I) = 0
135
//   DO 2 I = 1, NROWS
136
//    DO 2 K = 1, N
137
//   2 RES(I) = RES(I) + X(K,I)*Y(K)
138
template <TypeCategory RCAT, int RKIND, typename XT, typename YT,
139
    bool X_HAS_STRIDED_COLUMNS>
140
inline static RT_API_ATTRS void MatrixTransposedTimesVector(
141
    CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
142
    SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
143
    std::size_t xColumnByteStride = 0) {
144
  using ResultType = CppTypeFor<RCAT, RKIND>;
145
  std::memset(product, 0, rows * sizeof *product);
146
  for (SubscriptValue i{0}; i < rows; ++i) {
147
    for (SubscriptValue k{0}; k < n; ++k) {
148
      ResultType x_ki;
149
      if constexpr (!X_HAS_STRIDED_COLUMNS) {
150
        x_ki = static_cast<ResultType>(x[i * n + k]);
151
      } else {
152
        x_ki = static_cast<ResultType>(reinterpret_cast<const XT *>(
153
            reinterpret_cast<const char *>(x) + i * xColumnByteStride)[k]);
154
      }
155
      ResultType y_k = static_cast<ResultType>(y[k]);
156
      product[i] += x_ki * y_k;
157
    }
158
  }
159
}
160

161
RT_DIAG_POP
162

163
template <TypeCategory RCAT, int RKIND, typename XT, typename YT>
164
inline static RT_API_ATTRS void MatrixTransposedTimesVectorHelper(
165
    CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,
166
    SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,
167
    Fortran::common::optional<std::size_t> xColumnByteStride) {
168
  if (!xColumnByteStride) {
169
    MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, false>(
170
        product, rows, n, x, y);
171
  } else {
172
    MatrixTransposedTimesVector<RCAT, RKIND, XT, YT, true>(
173
        product, rows, n, x, y, *xColumnByteStride);
174
  }
175
}
176

177
RT_DIAG_PUSH
178
RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
179

180
// Implements an instance of MATMUL for given argument types.
181
template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,
182
    typename YT>
183
inline static RT_API_ATTRS void DoMatmulTranspose(
184
    std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,
185
    const Descriptor &x, const Descriptor &y, Terminator &terminator) {
186
  int xRank{x.rank()};
187
  int yRank{y.rank()};
188
  int resRank{xRank + yRank - 2};
189
  if (xRank * yRank != 2 * resRank) {
190
    terminator.Crash(
191
        "MATMUL-TRANSPOSE: bad argument ranks (%d * %d)", xRank, yRank);
192
  }
193
  SubscriptValue extent[2]{x.GetDimension(1).Extent(),
194
      resRank == 2 ? y.GetDimension(1).Extent() : 0};
195
  if constexpr (IS_ALLOCATING) {
196
    result.Establish(
197
        RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);
198
    for (int j{0}; j < resRank; ++j) {
199
      result.GetDimension(j).SetBounds(1, extent[j]);
200
    }
201
    if (int stat{result.Allocate()}) {
202
      terminator.Crash(
203
          "MATMUL-TRANSPOSE: could not allocate memory for result; STAT=%d",
204
          stat);
205
    }
206
  } else {
207
    RUNTIME_CHECK(terminator, resRank == result.rank());
208
    RUNTIME_CHECK(
209
        terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));
210
    RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);
211
    RUNTIME_CHECK(terminator,
212
        resRank == 1 || result.GetDimension(1).Extent() == extent[1]);
213
  }
214
  SubscriptValue n{x.GetDimension(0).Extent()};
215
  if (n != y.GetDimension(0).Extent()) {
216
    terminator.Crash(
217
        "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
218
        static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
219
        static_cast<std::intmax_t>(x.GetDimension(1).Extent()),
220
        static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
221
        static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
222
  }
223
  using WriteResult =
224
      CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,
225
          RKIND>;
226
  const SubscriptValue rows{extent[0]};
227
  const SubscriptValue cols{extent[1]};
228
  if constexpr (RCAT != TypeCategory::Logical) {
229
    if (x.IsContiguous(1) && y.IsContiguous(1) &&
230
        (IS_ALLOCATING || result.IsContiguous())) {
231
      // Contiguous numeric matrices (maybe with columns
232
      // separated by a stride).
233
      Fortran::common::optional<std::size_t> xColumnByteStride;
234
      if (!x.IsContiguous()) {
235
        // X's columns are strided.
236
        SubscriptValue xAt[2]{};
237
        x.GetLowerBounds(xAt);
238
        xAt[1]++;
239
        xColumnByteStride = x.SubscriptsToByteOffset(xAt);
240
      }
241
      Fortran::common::optional<std::size_t> yColumnByteStride;
242
      if (!y.IsContiguous()) {
243
        // Y's columns are strided.
244
        SubscriptValue yAt[2]{};
245
        y.GetLowerBounds(yAt);
246
        yAt[1]++;
247
        yColumnByteStride = y.SubscriptsToByteOffset(yAt);
248
      }
249
      if (resRank == 2) { // M*M -> M
250
        // TODO: use BLAS-3 GEMM for supported types.
251
        MatrixTransposedTimesMatrixHelper<RCAT, RKIND, XT, YT>(
252
            result.template OffsetElement<WriteResult>(), rows, cols,
253
            x.OffsetElement<XT>(), y.OffsetElement<YT>(), n, xColumnByteStride,
254
            yColumnByteStride);
255
        return;
256
      }
257
      if (xRank == 2) { // M*V -> V
258
        // TODO: use BLAS-2 GEMM for supported types.
259
        MatrixTransposedTimesVectorHelper<RCAT, RKIND, XT, YT>(
260
            result.template OffsetElement<WriteResult>(), rows, n,
261
            x.OffsetElement<XT>(), y.OffsetElement<YT>(), xColumnByteStride);
262
        return;
263
      }
264
      // else V*M -> V (not allowed because TRANSPOSE() is only defined for rank
265
      // 1 matrices
266
      terminator.Crash(
267
          "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
268
          static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
269
          static_cast<std::intmax_t>(n),
270
          static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
271
          static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
272
      return;
273
    }
274
  }
275
  // General algorithms for LOGICAL and noncontiguity
276
  SubscriptValue xLB[2], yLB[2], resLB[2];
277
  x.GetLowerBounds(xLB);
278
  y.GetLowerBounds(yLB);
279
  result.GetLowerBounds(resLB);
280
  using ResultType = CppTypeFor<RCAT, RKIND>;
281
  if (resRank == 2) { // M*M -> M
282
    for (SubscriptValue i{0}; i < rows; ++i) {
283
      for (SubscriptValue j{0}; j < cols; ++j) {
284
        ResultType res_ij;
285
        if constexpr (RCAT == TypeCategory::Logical) {
286
          res_ij = false;
287
        } else {
288
          res_ij = 0;
289
        }
290

291
        for (SubscriptValue k{0}; k < n; ++k) {
292
          SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
293
          SubscriptValue yAt[2]{k + yLB[0], j + yLB[1]};
294
          if constexpr (RCAT == TypeCategory::Logical) {
295
            ResultType x_ki = IsLogicalElementTrue(x, xAt);
296
            ResultType y_kj = IsLogicalElementTrue(y, yAt);
297
            res_ij = res_ij || (x_ki && y_kj);
298
          } else {
299
            ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
300
            ResultType y_kj = static_cast<ResultType>(*y.Element<YT>(yAt));
301
            res_ij += x_ki * y_kj;
302
          }
303
        }
304
        SubscriptValue resAt[2]{i + resLB[0], j + resLB[1]};
305
        *result.template Element<WriteResult>(resAt) = res_ij;
306
      }
307
    }
308
  } else if (xRank == 2) { // M*V -> V
309
    for (SubscriptValue i{0}; i < rows; ++i) {
310
      ResultType res_i;
311
      if constexpr (RCAT == TypeCategory::Logical) {
312
        res_i = false;
313
      } else {
314
        res_i = 0;
315
      }
316

317
      for (SubscriptValue k{0}; k < n; ++k) {
318
        SubscriptValue xAt[2]{k + xLB[0], i + xLB[1]};
319
        SubscriptValue yAt[1]{k + yLB[0]};
320
        if constexpr (RCAT == TypeCategory::Logical) {
321
          ResultType x_ki = IsLogicalElementTrue(x, xAt);
322
          ResultType y_k = IsLogicalElementTrue(y, yAt);
323
          res_i = res_i || (x_ki && y_k);
324
        } else {
325
          ResultType x_ki = static_cast<ResultType>(*x.Element<XT>(xAt));
326
          ResultType y_k = static_cast<ResultType>(*y.Element<YT>(yAt));
327
          res_i += x_ki * y_k;
328
        }
329
      }
330
      SubscriptValue resAt[1]{i + resLB[0]};
331
      *result.template Element<WriteResult>(resAt) = res_i;
332
    }
333
  } else { // V*M -> V
334
    // TRANSPOSE(V) not allowed by fortran standard
335
    terminator.Crash(
336
        "MATMUL-TRANSPOSE: unacceptable operand shapes (%jdx%jd, %jdx%jd)",
337
        static_cast<std::intmax_t>(x.GetDimension(0).Extent()),
338
        static_cast<std::intmax_t>(n),
339
        static_cast<std::intmax_t>(y.GetDimension(0).Extent()),
340
        static_cast<std::intmax_t>(y.GetDimension(1).Extent()));
341
  }
342
}
343

344
RT_DIAG_POP
345

346
template <bool IS_ALLOCATING, TypeCategory XCAT, int XKIND, TypeCategory YCAT,
347
    int YKIND>
348
struct MatmulTransposeHelper {
349
  using ResultDescriptor =
350
      std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;
351
  RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,
352
      const Descriptor &y, const char *sourceFile, int line) const {
353
    Terminator terminator{sourceFile, line};
354
    auto xCatKind{x.type().GetCategoryAndKind()};
355
    auto yCatKind{y.type().GetCategoryAndKind()};
356
    RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());
357
    RUNTIME_CHECK(terminator, xCatKind->first == XCAT);
358
    RUNTIME_CHECK(terminator, yCatKind->first == YCAT);
359
    if constexpr (constexpr auto resultType{
360
                      GetResultType(XCAT, XKIND, YCAT, YKIND)}) {
361
      return DoMatmulTranspose<IS_ALLOCATING, resultType->first,
362
          resultType->second, CppTypeFor<XCAT, XKIND>, CppTypeFor<YCAT, YKIND>>(
363
          result, x, y, terminator);
364
    }
365
    terminator.Crash("MATMUL-TRANSPOSE: bad operand types (%d(%d), %d(%d))",
366
        static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);
367
  }
368
};
369
} // namespace
370

371
namespace Fortran::runtime {
372
extern "C" {
373
RT_EXT_API_GROUP_BEGIN
374

375
#define MATMUL_INSTANCE(XCAT, XKIND, YCAT, YKIND) \
376
  void RTDEF(MatmulTranspose##XCAT##XKIND##YCAT##YKIND)(Descriptor & result, \
377
      const Descriptor &x, const Descriptor &y, const char *sourceFile, \
378
      int line) { \
379
    MatmulTransposeHelper<true, TypeCategory::XCAT, XKIND, TypeCategory::YCAT, \
380
        YKIND>{}(result, x, y, sourceFile, line); \
381
  }
382

383
#define MATMUL_DIRECT_INSTANCE(XCAT, XKIND, YCAT, YKIND) \
384
  void RTDEF(MatmulTransposeDirect##XCAT##XKIND##YCAT##YKIND)( \
385
      Descriptor & result, const Descriptor &x, const Descriptor &y, \
386
      const char *sourceFile, int line) { \
387
    MatmulTransposeHelper<false, TypeCategory::XCAT, XKIND, \
388
        TypeCategory::YCAT, YKIND>{}(result, x, y, sourceFile, line); \
389
  }
390

391
#define MATMUL_FORCE_ALL_TYPES 0
392

393
#include "flang/Runtime/matmul-instances.inc"
394

395
RT_EXT_API_GROUP_END
396
} // extern "C"
397
} // namespace Fortran::runtime
398
llvm-project

Использование cookies