llvm-project
497 строк · 18.6 Кб
1//===-- runtime/matmul.cpp ------------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9// Implements all forms of MATMUL (Fortran 2018 16.9.124)
10//
11// There are two main entry points; one establishes a descriptor for the
12// result and allocates it, and the other expects a result descriptor that
13// points to existing storage.
14//
15// This implementation must handle all combinations of numeric types and
16// kinds (100 - 165 cases depending on the target), plus all combinations
17// of logical kinds (16). A single template undergoes many instantiations
18// to cover all of the valid possibilities.
19//
20// Places where BLAS routines could be called are marked as TODO items.
21
22#include "flang/Runtime/matmul.h"23#include "terminator.h"24#include "tools.h"25#include "flang/Common/optional.h"26#include "flang/Runtime/c-or-cpp.h"27#include "flang/Runtime/cpp-type.h"28#include "flang/Runtime/descriptor.h"29#include <cstring>30
31namespace {32using namespace Fortran::runtime;33
34// Suppress the warnings about calling __host__-only std::complex operators,
35// defined in C++ STD header files, from __device__ code.
36RT_DIAG_PUSH
37RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
38
39// General accumulator for any type and stride; this is not used for
40// contiguous numeric cases.
41template <TypeCategory RCAT, int RKIND, typename XT, typename YT>42class Accumulator {43public:44using Result = AccumulationType<RCAT, RKIND>;45RT_API_ATTRS Accumulator(const Descriptor &x, const Descriptor &y)46: x_{x}, y_{y} {}47RT_API_ATTRS void Accumulate(48const SubscriptValue xAt[], const SubscriptValue yAt[]) {49if constexpr (RCAT == TypeCategory::Logical) {50sum_ = sum_ ||51(IsLogicalElementTrue(x_, xAt) && IsLogicalElementTrue(y_, yAt));52} else {53sum_ += static_cast<Result>(*x_.Element<XT>(xAt)) *54static_cast<Result>(*y_.Element<YT>(yAt));55}56}57RT_API_ATTRS Result GetResult() const { return sum_; }58
59private:60const Descriptor &x_, &y_;61Result sum_{};62};63
64// Contiguous numeric matrix*matrix multiplication
65// matrix(rows,n) * matrix(n,cols) -> matrix(rows,cols)
66// Straightforward algorithm:
67// DO 1 I = 1, NROWS
68// DO 1 J = 1, NCOLS
69// RES(I,J) = 0
70// DO 1 K = 1, N
71// 1 RES(I,J) = RES(I,J) + X(I,K)*Y(K,J)
72// With loop distribution and transposition to avoid the inner sum
73// reduction and to avoid non-unit strides:
74// DO 1 I = 1, NROWS
75// DO 1 J = 1, NCOLS
76// 1 RES(I,J) = 0
77// DO 2 K = 1, N
78// DO 2 J = 1, NCOLS
79// DO 2 I = 1, NROWS
80// 2 RES(I,J) = RES(I,J) + X(I,K)*Y(K,J) ! loop-invariant last term
81template <TypeCategory RCAT, int RKIND, typename XT, typename YT,82bool X_HAS_STRIDED_COLUMNS, bool Y_HAS_STRIDED_COLUMNS>83inline RT_API_ATTRS void MatrixTimesMatrix(84CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,85SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,86SubscriptValue n, std::size_t xColumnByteStride = 0,87std::size_t yColumnByteStride = 0) {88using ResultType = CppTypeFor<RCAT, RKIND>;89std::memset(product, 0, rows * cols * sizeof *product);90const XT *RESTRICT xp0{x};91for (SubscriptValue k{0}; k < n; ++k) {92ResultType *RESTRICT p{product};93for (SubscriptValue j{0}; j < cols; ++j) {94const XT *RESTRICT xp{xp0};95ResultType yv;96if constexpr (!Y_HAS_STRIDED_COLUMNS) {97yv = static_cast<ResultType>(y[k + j * n]);98} else {99yv = static_cast<ResultType>(reinterpret_cast<const YT *>(100reinterpret_cast<const char *>(y) + j * yColumnByteStride)[k]);101}102for (SubscriptValue i{0}; i < rows; ++i) {103*p++ += static_cast<ResultType>(*xp++) * yv;104}105}106if constexpr (!X_HAS_STRIDED_COLUMNS) {107xp0 += rows;108} else {109xp0 = reinterpret_cast<const XT *>(110reinterpret_cast<const char *>(xp0) + xColumnByteStride);111}112}113}
114
115RT_DIAG_POP
116
117template <TypeCategory RCAT, int RKIND, typename XT, typename YT>118inline RT_API_ATTRS void MatrixTimesMatrixHelper(119CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,120SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,121SubscriptValue n, Fortran::common::optional<std::size_t> xColumnByteStride,122Fortran::common::optional<std::size_t> yColumnByteStride) {123if (!xColumnByteStride) {124if (!yColumnByteStride) {125MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, false>(126product, rows, cols, x, y, n);127} else {128MatrixTimesMatrix<RCAT, RKIND, XT, YT, false, true>(129product, rows, cols, x, y, n, 0, *yColumnByteStride);130}131} else {132if (!yColumnByteStride) {133MatrixTimesMatrix<RCAT, RKIND, XT, YT, true, false>(134product, rows, cols, x, y, n, *xColumnByteStride);135} else {136MatrixTimesMatrix<RCAT, RKIND, XT, YT, true, true>(137product, rows, cols, x, y, n, *xColumnByteStride, *yColumnByteStride);138}139}140}
141
142RT_DIAG_PUSH
143RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
144
145// Contiguous numeric matrix*vector multiplication
146// matrix(rows,n) * column vector(n) -> column vector(rows)
147// Straightforward algorithm:
148// DO 1 J = 1, NROWS
149// RES(J) = 0
150// DO 1 K = 1, N
151// 1 RES(J) = RES(J) + X(J,K)*Y(K)
152// With loop distribution and transposition to avoid the inner
153// sum reduction and to avoid non-unit strides:
154// DO 1 J = 1, NROWS
155// 1 RES(J) = 0
156// DO 2 K = 1, N
157// DO 2 J = 1, NROWS
158// 2 RES(J) = RES(J) + X(J,K)*Y(K)
159template <TypeCategory RCAT, int RKIND, typename XT, typename YT,160bool X_HAS_STRIDED_COLUMNS>161inline RT_API_ATTRS void MatrixTimesVector(162CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,163SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,164std::size_t xColumnByteStride = 0) {165using ResultType = CppTypeFor<RCAT, RKIND>;166std::memset(product, 0, rows * sizeof *product);167[[maybe_unused]] const XT *RESTRICT xp0{x};168for (SubscriptValue k{0}; k < n; ++k) {169ResultType *RESTRICT p{product};170auto yv{static_cast<ResultType>(*y++)};171for (SubscriptValue j{0}; j < rows; ++j) {172*p++ += static_cast<ResultType>(*x++) * yv;173}174if constexpr (X_HAS_STRIDED_COLUMNS) {175xp0 = reinterpret_cast<const XT *>(176reinterpret_cast<const char *>(xp0) + xColumnByteStride);177x = xp0;178}179}180}
181
182RT_DIAG_POP
183
184template <TypeCategory RCAT, int RKIND, typename XT, typename YT>185inline RT_API_ATTRS void MatrixTimesVectorHelper(186CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue rows,187SubscriptValue n, const XT *RESTRICT x, const YT *RESTRICT y,188Fortran::common::optional<std::size_t> xColumnByteStride) {189if (!xColumnByteStride) {190MatrixTimesVector<RCAT, RKIND, XT, YT, false>(product, rows, n, x, y);191} else {192MatrixTimesVector<RCAT, RKIND, XT, YT, true>(193product, rows, n, x, y, *xColumnByteStride);194}195}
196
197RT_DIAG_PUSH
198RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
199
200// Contiguous numeric vector*matrix multiplication
201// row vector(n) * matrix(n,cols) -> row vector(cols)
202// Straightforward algorithm:
203// DO 1 J = 1, NCOLS
204// RES(J) = 0
205// DO 1 K = 1, N
206// 1 RES(J) = RES(J) + X(K)*Y(K,J)
207// With loop distribution and transposition to avoid the inner
208// sum reduction and one non-unit stride (the other remains):
209// DO 1 J = 1, NCOLS
210// 1 RES(J) = 0
211// DO 2 K = 1, N
212// DO 2 J = 1, NCOLS
213// 2 RES(J) = RES(J) + X(K)*Y(K,J)
214template <TypeCategory RCAT, int RKIND, typename XT, typename YT,215bool Y_HAS_STRIDED_COLUMNS>216inline RT_API_ATTRS void VectorTimesMatrix(217CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue n,218SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,219std::size_t yColumnByteStride = 0) {220using ResultType = CppTypeFor<RCAT, RKIND>;221std::memset(product, 0, cols * sizeof *product);222for (SubscriptValue k{0}; k < n; ++k) {223ResultType *RESTRICT p{product};224auto xv{static_cast<ResultType>(*x++)};225const YT *RESTRICT yp{&y[k]};226for (SubscriptValue j{0}; j < cols; ++j) {227*p++ += xv * static_cast<ResultType>(*yp);228if constexpr (!Y_HAS_STRIDED_COLUMNS) {229yp += n;230} else {231yp = reinterpret_cast<const YT *>(232reinterpret_cast<const char *>(yp) + yColumnByteStride);233}234}235}236}
237
238RT_DIAG_POP
239
240template <TypeCategory RCAT, int RKIND, typename XT, typename YT,241bool SPARSE_COLUMNS = false>242inline RT_API_ATTRS void VectorTimesMatrixHelper(243CppTypeFor<RCAT, RKIND> *RESTRICT product, SubscriptValue n,244SubscriptValue cols, const XT *RESTRICT x, const YT *RESTRICT y,245Fortran::common::optional<std::size_t> yColumnByteStride) {246if (!yColumnByteStride) {247VectorTimesMatrix<RCAT, RKIND, XT, YT, false>(product, n, cols, x, y);248} else {249VectorTimesMatrix<RCAT, RKIND, XT, YT, true>(250product, n, cols, x, y, *yColumnByteStride);251}252}
253
254RT_DIAG_PUSH
255RT_DIAG_DISABLE_CALL_HOST_FROM_DEVICE_WARN
256
257// Implements an instance of MATMUL for given argument types.
258template <bool IS_ALLOCATING, TypeCategory RCAT, int RKIND, typename XT,259typename YT>260static inline RT_API_ATTRS void DoMatmul(261std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor> &result,262const Descriptor &x, const Descriptor &y, Terminator &terminator) {263int xRank{x.rank()};264int yRank{y.rank()};265int resRank{xRank + yRank - 2};266if (xRank * yRank != 2 * resRank) {267terminator.Crash("MATMUL: bad argument ranks (%d * %d)", xRank, yRank);268}269SubscriptValue extent[2]{270xRank == 2 ? x.GetDimension(0).Extent() : y.GetDimension(1).Extent(),271resRank == 2 ? y.GetDimension(1).Extent() : 0};272if constexpr (IS_ALLOCATING) {273result.Establish(274RCAT, RKIND, nullptr, resRank, extent, CFI_attribute_allocatable);275for (int j{0}; j < resRank; ++j) {276result.GetDimension(j).SetBounds(1, extent[j]);277}278if (int stat{result.Allocate()}) {279terminator.Crash(280"MATMUL: could not allocate memory for result; STAT=%d", stat);281}282} else {283RUNTIME_CHECK(terminator, resRank == result.rank());284RUNTIME_CHECK(285terminator, result.ElementBytes() == static_cast<std::size_t>(RKIND));286RUNTIME_CHECK(terminator, result.GetDimension(0).Extent() == extent[0]);287RUNTIME_CHECK(terminator,288resRank == 1 || result.GetDimension(1).Extent() == extent[1]);289}290SubscriptValue n{x.GetDimension(xRank - 1).Extent()};291if (n != y.GetDimension(0).Extent()) {292// At this point, we know that there's a shape error. There are three293// possibilities, x is rank 1, y is rank 1, or both are rank 2.294if (xRank == 1) {295terminator.Crash("MATMUL: unacceptable operand shapes (%jd, %jdx%jd)",296static_cast<std::intmax_t>(n),297static_cast<std::intmax_t>(y.GetDimension(0).Extent()),298static_cast<std::intmax_t>(y.GetDimension(1).Extent()));299} else if (yRank == 1) {300terminator.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jd)",301static_cast<std::intmax_t>(x.GetDimension(0).Extent()),302static_cast<std::intmax_t>(n),303static_cast<std::intmax_t>(y.GetDimension(0).Extent()));304} else {305terminator.Crash("MATMUL: unacceptable operand shapes (%jdx%jd, %jdx%jd)",306static_cast<std::intmax_t>(x.GetDimension(0).Extent()),307static_cast<std::intmax_t>(n),308static_cast<std::intmax_t>(y.GetDimension(0).Extent()),309static_cast<std::intmax_t>(y.GetDimension(1).Extent()));310}311}312using WriteResult =313CppTypeFor<RCAT == TypeCategory::Logical ? TypeCategory::Integer : RCAT,314RKIND>;315if constexpr (RCAT != TypeCategory::Logical) {316if (x.IsContiguous(1) && y.IsContiguous(1) &&317(IS_ALLOCATING || result.IsContiguous())) {318// Contiguous numeric matrices (maybe with columns319// separated by a stride).320Fortran::common::optional<std::size_t> xColumnByteStride;321if (!x.IsContiguous()) {322// X's columns are strided.323SubscriptValue xAt[2]{};324x.GetLowerBounds(xAt);325xAt[1]++;326xColumnByteStride = x.SubscriptsToByteOffset(xAt);327}328Fortran::common::optional<std::size_t> yColumnByteStride;329if (!y.IsContiguous()) {330// Y's columns are strided.331SubscriptValue yAt[2]{};332y.GetLowerBounds(yAt);333yAt[1]++;334yColumnByteStride = y.SubscriptsToByteOffset(yAt);335}336// Note that BLAS GEMM can be used for the strided337// columns by setting proper leading dimension size.338// This implies that the column stride is divisible339// by the element size, which is usually true.340if (resRank == 2) { // M*M -> M341if (std::is_same_v<XT, YT>) {342if constexpr (std::is_same_v<XT, float>) {343// TODO: call BLAS-3 SGEMM344// TODO: try using CUTLASS for device.345} else if constexpr (std::is_same_v<XT, double>) {346// TODO: call BLAS-3 DGEMM347} else if constexpr (std::is_same_v<XT, std::complex<float>>) {348// TODO: call BLAS-3 CGEMM349} else if constexpr (std::is_same_v<XT, std::complex<double>>) {350// TODO: call BLAS-3 ZGEMM351}352}353MatrixTimesMatrixHelper<RCAT, RKIND, XT, YT>(354result.template OffsetElement<WriteResult>(), extent[0], extent[1],355x.OffsetElement<XT>(), y.OffsetElement<YT>(), n, xColumnByteStride,356yColumnByteStride);357return;358} else if (xRank == 2) { // M*V -> V359if (std::is_same_v<XT, YT>) {360if constexpr (std::is_same_v<XT, float>) {361// TODO: call BLAS-2 SGEMV(x,y)362} else if constexpr (std::is_same_v<XT, double>) {363// TODO: call BLAS-2 DGEMV(x,y)364} else if constexpr (std::is_same_v<XT, std::complex<float>>) {365// TODO: call BLAS-2 CGEMV(x,y)366} else if constexpr (std::is_same_v<XT, std::complex<double>>) {367// TODO: call BLAS-2 ZGEMV(x,y)368}369}370MatrixTimesVectorHelper<RCAT, RKIND, XT, YT>(371result.template OffsetElement<WriteResult>(), extent[0], n,372x.OffsetElement<XT>(), y.OffsetElement<YT>(), xColumnByteStride);373return;374} else { // V*M -> V375if (std::is_same_v<XT, YT>) {376if constexpr (std::is_same_v<XT, float>) {377// TODO: call BLAS-2 SGEMV(y,x)378} else if constexpr (std::is_same_v<XT, double>) {379// TODO: call BLAS-2 DGEMV(y,x)380} else if constexpr (std::is_same_v<XT, std::complex<float>>) {381// TODO: call BLAS-2 CGEMV(y,x)382} else if constexpr (std::is_same_v<XT, std::complex<double>>) {383// TODO: call BLAS-2 ZGEMV(y,x)384}385}386VectorTimesMatrixHelper<RCAT, RKIND, XT, YT>(387result.template OffsetElement<WriteResult>(), n, extent[0],388x.OffsetElement<XT>(), y.OffsetElement<YT>(), yColumnByteStride);389return;390}391}392}393// General algorithms for LOGICAL and noncontiguity394SubscriptValue xAt[2], yAt[2], resAt[2];395x.GetLowerBounds(xAt);396y.GetLowerBounds(yAt);397result.GetLowerBounds(resAt);398if (resRank == 2) { // M*M -> M399SubscriptValue x1{xAt[1]}, y0{yAt[0]}, y1{yAt[1]}, res1{resAt[1]};400for (SubscriptValue i{0}; i < extent[0]; ++i) {401for (SubscriptValue j{0}; j < extent[1]; ++j) {402Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};403yAt[1] = y1 + j;404for (SubscriptValue k{0}; k < n; ++k) {405xAt[1] = x1 + k;406yAt[0] = y0 + k;407accumulator.Accumulate(xAt, yAt);408}409resAt[1] = res1 + j;410*result.template Element<WriteResult>(resAt) = accumulator.GetResult();411}412++resAt[0];413++xAt[0];414}415} else if (xRank == 2) { // M*V -> V416SubscriptValue x1{xAt[1]}, y0{yAt[0]};417for (SubscriptValue j{0}; j < extent[0]; ++j) {418Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};419for (SubscriptValue k{0}; k < n; ++k) {420xAt[1] = x1 + k;421yAt[0] = y0 + k;422accumulator.Accumulate(xAt, yAt);423}424*result.template Element<WriteResult>(resAt) = accumulator.GetResult();425++resAt[0];426++xAt[0];427}428} else { // V*M -> V429SubscriptValue x0{xAt[0]}, y0{yAt[0]};430for (SubscriptValue j{0}; j < extent[0]; ++j) {431Accumulator<RCAT, RKIND, XT, YT> accumulator{x, y};432for (SubscriptValue k{0}; k < n; ++k) {433xAt[0] = x0 + k;434yAt[0] = y0 + k;435accumulator.Accumulate(xAt, yAt);436}437*result.template Element<WriteResult>(resAt) = accumulator.GetResult();438++resAt[0];439++yAt[1];440}441}442}
443
444RT_DIAG_POP
445
446template <bool IS_ALLOCATING, TypeCategory XCAT, int XKIND, TypeCategory YCAT,447int YKIND>448struct MatmulHelper {449using ResultDescriptor =450std::conditional_t<IS_ALLOCATING, Descriptor, const Descriptor>;451RT_API_ATTRS void operator()(ResultDescriptor &result, const Descriptor &x,452const Descriptor &y, const char *sourceFile, int line) const {453Terminator terminator{sourceFile, line};454auto xCatKind{x.type().GetCategoryAndKind()};455auto yCatKind{y.type().GetCategoryAndKind()};456RUNTIME_CHECK(terminator, xCatKind.has_value() && yCatKind.has_value());457RUNTIME_CHECK(terminator, xCatKind->first == XCAT);458RUNTIME_CHECK(terminator, yCatKind->first == YCAT);459if constexpr (constexpr auto resultType{460GetResultType(XCAT, XKIND, YCAT, YKIND)}) {461return DoMatmul<IS_ALLOCATING, resultType->first, resultType->second,462CppTypeFor<XCAT, XKIND>, CppTypeFor<YCAT, YKIND>>(463result, x, y, terminator);464}465terminator.Crash("MATMUL: bad operand types (%d(%d), %d(%d))",466static_cast<int>(XCAT), XKIND, static_cast<int>(YCAT), YKIND);467}468};469} // namespace470
471namespace Fortran::runtime {472extern "C" {473RT_EXT_API_GROUP_BEGIN
474
475#define MATMUL_INSTANCE(XCAT, XKIND, YCAT, YKIND) \476void RTDEF(Matmul##XCAT##XKIND##YCAT##YKIND)(Descriptor & result, \477const Descriptor &x, const Descriptor &y, const char *sourceFile, \478int line) { \479MatmulHelper<true, TypeCategory::XCAT, XKIND, TypeCategory::YCAT, \480YKIND>{}(result, x, y, sourceFile, line); \481}482
483#define MATMUL_DIRECT_INSTANCE(XCAT, XKIND, YCAT, YKIND) \484void RTDEF(MatmulDirect##XCAT##XKIND##YCAT##YKIND)(Descriptor & result, \485const Descriptor &x, const Descriptor &y, const char *sourceFile, \486int line) { \487MatmulHelper<false, TypeCategory::XCAT, XKIND, TypeCategory::YCAT, \488YKIND>{}(result, x, y, sourceFile, line); \489}490
491#define MATMUL_FORCE_ALL_TYPES 0492
493#include "flang/Runtime/matmul-instances.inc"494
495RT_EXT_API_GROUP_END
496} // extern "C"497} // namespace Fortran::runtime498