onnxruntime
746 строк · 30.6 Кб
1# Copyright (c) Microsoft Corporation. All rights reserved.
2# Licensed under the MIT License.
3
# Root of the MLAS tree, plus its implementation (lib) and public header (inc)
# directories. Every path below is built from these three variables.
set(MLAS_ROOT "${ONNXRUNTIME_ROOT}/core/mlas")
set(MLAS_SRC_DIR "${MLAS_ROOT}/lib")
set(MLAS_INC_DIR "${MLAS_ROOT}/inc")

#
# Core MLAS static library.
#
# Only hardware-agnostic sources belong in this list: hardware-specific files
# would cause trouble in a multi-target build, so they are added further down
# once the target platform is known.
#
onnxruntime_add_static_library(onnxruntime_mlas
  ${MLAS_SRC_DIR}/mlasi.h
  ${MLAS_SRC_DIR}/platform.cpp
  ${MLAS_SRC_DIR}/threading.cpp
  ${MLAS_SRC_DIR}/sgemm.cpp
  ${MLAS_SRC_DIR}/halfgemm.cpp
  ${MLAS_SRC_DIR}/qgemm.cpp
  ${MLAS_SRC_DIR}/qdwconv.cpp
  ${MLAS_SRC_DIR}/convolve.cpp
  ${MLAS_SRC_DIR}/convsym.cpp
  ${MLAS_SRC_DIR}/pooling.cpp
  ${MLAS_SRC_DIR}/transpose.cpp
  ${MLAS_SRC_DIR}/reorder.cpp
  ${MLAS_SRC_DIR}/snchwc.cpp
  ${MLAS_SRC_DIR}/activate.cpp
  ${MLAS_SRC_DIR}/logistic.cpp
  ${MLAS_SRC_DIR}/tanh.cpp
  ${MLAS_SRC_DIR}/erf.cpp
  ${MLAS_SRC_DIR}/compute.cpp
  ${MLAS_SRC_DIR}/quantize.cpp
  ${MLAS_SRC_DIR}/qgemm_kernel_default.cpp
  ${MLAS_SRC_DIR}/qladd.cpp
  ${MLAS_SRC_DIR}/qlmul.cpp
  ${MLAS_SRC_DIR}/qpostprocessor.cpp
  ${MLAS_SRC_DIR}/qlgavgpool.cpp
  ${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp
  ${MLAS_SRC_DIR}/sqnbitgemm.h
  ${MLAS_SRC_DIR}/sqnbitgemm.cpp
  ${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h
  ${MLAS_SRC_DIR}/flashattn.cpp
)

# Public headers are attached to the target as private sources.
target_sources(onnxruntime_mlas PRIVATE
  ${MLAS_INC_DIR}/mlas_float16.h
  ${MLAS_INC_DIR}/mlas_gemm_postprocessor.h
  ${MLAS_INC_DIR}/mlas_q4.h
  ${MLAS_INC_DIR}/mlas_qnbit.h
  ${MLAS_INC_DIR}/mlas.h
)

# The q4 sources are left out of minimal builds.
if(NOT onnxruntime_ORT_MINIMAL_BUILD)
  target_sources(onnxruntime_mlas PRIVATE
    ${MLAS_SRC_DIR}/q4_dq.cpp
    ${MLAS_SRC_DIR}/q4gemm.cpp
  )
endif()

# List of every MLAS library target. The multi-architecture branches below
# append their per-architecture libraries to it.
set(ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas)
61
62#TODO: set MASM flags properly
# setup_mlas_source_for_windows()
#
# Adds the MSVC-specific MLAS sources to the onnxruntime_mlas target. Takes no
# arguments; the architecture is chosen from onnxruntime_target_platform:
#   ARM64 / ARM64EC - NEON C++ kernels plus .asm kernels. Each .asm file is
#                     preprocessed with cl.exe and assembled with armasm64.exe
#                     by a custom command, and the resulting .obj is added to
#                     the target.
#   ARM             - arm/sgemmc.cpp only.
#   x64             - AVX/AVX2 intrinsics (globbed), C++ kernels and amd64 .asm.
#   anything else   - SSE C++ kernels and i386 .asm.
function(setup_mlas_source_for_windows)

  #
  # Sources common for all platforms.
  #
  target_sources(onnxruntime_mlas PRIVATE
    ${MLAS_SRC_DIR}/activate_fp16.cpp
    ${MLAS_SRC_DIR}/dwconv.cpp
    ${MLAS_SRC_DIR}/pooling_fp16.cpp
  )

  # The onnxruntime_target_platform variable was added by Windows AI team in
  # onnxruntime_common.cmake. Don't use it for other platforms.
  if((onnxruntime_target_platform STREQUAL "ARM64") OR (onnxruntime_target_platform STREQUAL "ARM64EC"))
    set(PREPROCESS_ARMASM_FLAGS "")
    set(ARMASM_FLAGS "")

    if(onnxruntime_target_platform STREQUAL "ARM64")
      target_sources(onnxruntime_mlas PRIVATE
        ${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
        ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
        ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
        ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
        ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
        ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
        ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
        ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
      )

      set(mlas_platform_preprocess_srcs
        ${MLAS_SRC_DIR}/arm64/ConvSymS8KernelDot.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymS8KernelDotLd64.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymU8KernelDot.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymS8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/ConvSymU8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/DepthwiseQConvSymS8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/DepthwiseQConvSymU8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/DepthwiseQConvKernelSize9Neon.asm
        ${MLAS_SRC_DIR}/arm64/HalfGemmKernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/QgemmS8S8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/QgemmU8X8KernelUdot.asm
        ${MLAS_SRC_DIR}/arm64/QgemmS8S8KernelSdot.asm
        ${MLAS_SRC_DIR}/arm64/SgemmKernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/SgemvKernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDot.asm
        ${MLAS_SRC_DIR}/arm64/SymQgemmS8KernelSDotLd64.asm
      )
    else()
      # ARM64EC: a reduced kernel set, with both tools switched to ARM64EC mode.
      target_sources(onnxruntime_mlas PRIVATE
        ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
      )

      set(mlas_platform_preprocess_srcs
        ${MLAS_SRC_DIR}/arm64ec/QgemmU8X8KernelNeon.asm
        ${MLAS_SRC_DIR}/arm64ec/SgemmKernelNeon.asm
      )

      string(APPEND PREPROCESS_ARMASM_FLAGS " /arm64EC")
      string(APPEND ARMASM_FLAGS " -machine ARM64EC")
    endif()

    # Turn the space-separated flag strings into argument lists.
    separate_arguments(PREPROCESS_ARMASM_FLAGS NATIVE_COMMAND "${PREPROCESS_ARMASM_FLAGS}")
    separate_arguments(ARMASM_FLAGS NATIVE_COMMAND "${ARMASM_FLAGS}")

    # Run the C preprocessor on each input before the assembler.
    foreach(asm_filename ${mlas_platform_preprocess_srcs})
      get_filename_component(asm_filename_base ${asm_filename} NAME_WLE)
      set(preprocess_filename ${CMAKE_CURRENT_BINARY_DIR}/${asm_filename_base}.i)
      set(obj_filename ${CMAKE_CURRENT_BINARY_DIR}/${asm_filename_base}.obj)
      # The debug flag (-g) is chosen with a generator expression rather than
      # by testing CMAKE_BUILD_TYPE: that variable is empty under multi-config
      # generators (e.g. Visual Studio), so a configure-time test would never
      # add -g there. COMMAND_EXPAND_LISTS makes the expression vanish from the
      # command line when it evaluates to an empty string.
      add_custom_command(
        OUTPUT ${obj_filename}
        COMMAND
          cl.exe ${PREPROCESS_ARMASM_FLAGS} /P ${asm_filename} /Fi${preprocess_filename}
        COMMAND
          armasm64.exe ${ARMASM_FLAGS} $<$<CONFIG:Debug>:-g> ${preprocess_filename} ${obj_filename}
        DEPENDS ${asm_filename}
        BYPRODUCTS ${preprocess_filename}
        COMMAND_EXPAND_LISTS
      )
      target_sources(onnxruntime_mlas PRIVATE ${obj_filename})
    endforeach()
  elseif(onnxruntime_target_platform STREQUAL "ARM")
    target_sources(onnxruntime_mlas PRIVATE
      ${MLAS_SRC_DIR}/arm/sgemmc.cpp
    )
  elseif(onnxruntime_target_platform STREQUAL "x64")

    # Intrinsics sources are globbed per instruction set and compiled with the
    # matching /arch switch.
    file(GLOB_RECURSE mlas_platform_srcs_avx CONFIGURE_DEPENDS
      "${MLAS_SRC_DIR}/intrinsics/avx/*.cpp"
    )
    set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "/arch:AVX")

    file(GLOB_RECURSE mlas_platform_srcs_avx2 CONFIGURE_DEPENDS
      "${MLAS_SRC_DIR}/intrinsics/avx2/*.cpp"
    )
    set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "/arch:AVX2")

    target_sources(onnxruntime_mlas PRIVATE
      ${MLAS_SRC_DIR}/dgemm.cpp
      ${mlas_platform_srcs_avx}
      ${mlas_platform_srcs_avx2}
      ${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
      ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
      ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
      ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
      ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
      ${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAmx.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8U8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemmU8X8KernelAvx512Core.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Core.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvx512Vnni.asm
      ${MLAS_SRC_DIR}/amd64/QgemvU8S8KernelAvxVnni.asm
      ${MLAS_SRC_DIR}/amd64/ConvSymKernelAvx2.asm
      ${MLAS_SRC_DIR}/amd64/ConvSymKernelAvx512Core.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/DgemmKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelM1Avx.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/SgemmKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/SconvKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/SpoolKernelSse2.asm
      ${MLAS_SRC_DIR}/amd64/SpoolKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SpoolKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/sgemma.asm
      ${MLAS_SRC_DIR}/amd64/cvtfp16a.asm
      ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx.asm
      ${MLAS_SRC_DIR}/amd64/SoftmaxKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/TransKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/TransKernelAvx512F.asm
      ${MLAS_SRC_DIR}/amd64/LogisticKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/TanhKernelFma3.asm
      ${MLAS_SRC_DIR}/amd64/ErfKernelFma3.asm
    )
    if(NOT onnxruntime_ORT_MINIMAL_BUILD)
      target_sources(onnxruntime_mlas PRIVATE
        ${MLAS_SRC_DIR}/q4gemm_avx512.cpp
      )
    endif()
  else()
    # Any other platform value gets the 32-bit x86 source set.
    target_sources(onnxruntime_mlas PRIVATE
      ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
      ${MLAS_SRC_DIR}/i386/SgemmKernelSse2.asm
      ${MLAS_SRC_DIR}/i386/SgemmKernelAvx.asm
    )
  endif()
endfunction()
229
# Platform dispatch: WebAssembly first, then MSVC; everything else falls into
# the generic branch that follows.
if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
  if(onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
    # SIMD build: every file under wasm_simd/ plus the WASM SIMD qgemm kernel.
    file(GLOB_RECURSE mlas_platform_srcs
      "${MLAS_SRC_DIR}/wasm_simd/*.cpp"
    )
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/qgemm_kernel_wasmsimd.cpp
    )
  else()
    # Non-SIMD build: the scalar kernels.
    file(GLOB_RECURSE mlas_platform_srcs
      "${MLAS_SRC_DIR}/scalar/*.cpp"
    )
  endif()
  target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
elseif(MSVC)
  setup_mlas_source_for_windows()
else()
248
# Work out which architecture flags (ARM, ARM64, X86, X86_64, POWER,
# LOONGARCH64) apply to this build. Apple builds may set more than one.
if(APPLE)
  get_target_property(ONNXRUNTIME_MLAS_OSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)

  # Fall back to the host processor when the target has no OSX_ARCHITECTURES.
  if(NOT ONNXRUNTIME_MLAS_OSX_ARCH)
    set(ONNXRUNTIME_MLAS_OSX_ARCH ${CMAKE_HOST_SYSTEM_PROCESSOR})
  endif()
  foreach(OSX_ARCH ${ONNXRUNTIME_MLAS_OSX_ARCH})
    if(OSX_ARCH STREQUAL "arm64" OR OSX_ARCH STREQUAL "arm64e")
      set(ARM64 TRUE)
    elseif(OSX_ARCH STREQUAL "arm")
      set(ARM TRUE)
    elseif(OSX_ARCH STREQUAL "x86_64")
      set(X86_64 TRUE)
    elseif(OSX_ARCH STREQUAL "i386")
      set(X86 TRUE)
    endif()
  endforeach()
elseif(ANDROID)
  # Android reports the target through its ABI name.
  if(CMAKE_ANDROID_ARCH_ABI STREQUAL "armeabi-v7a")
    set(ARM TRUE)
  elseif(CMAKE_ANDROID_ARCH_ABI STREQUAL "arm64-v8a")
    set(ARM64 TRUE)
  elseif(CMAKE_ANDROID_ARCH_ABI STREQUAL "x86_64")
    set(X86_64 TRUE)
  elseif(CMAKE_ANDROID_ARCH_ABI STREQUAL "x86")
    set(X86 TRUE)
  endif()
else()
  # Linux/FreeBSD/PowerPC/...
  # The value of CMAKE_SYSTEM_PROCESSOR should be from `uname -m`.
  # Example values:
  #   arm64v8/ubuntu  -> aarch64
  #   arm32v6/alpine  -> armv7l
  #   arm32v7/centos  -> armv7l
  #   ppc64le/debian  -> ppc64le
  #   s390x/ubuntu    -> s390x
  #   ppc64le/busybox -> ppc64le
  #   Android: armv7-a aarch64 i686 x86_64
  # chasun: I don't think anyone uses 'arm64'
  #
  # The 64-bit ARM spellings are tested before the generic "^arm.*" pattern so
  # that "arm64..." is not classified as 32-bit ARM.
  if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm64.*" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64.*")
    set(ARM64 TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm.*")
    set(ARM TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(powerpc.*|ppc.*)")
    set(POWER TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86?)$")
    set(X86 TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|amd64)$")
    set(X86_64 TRUE)
  elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^loongarch64.*")
    set(LOONGARCH64 TRUE)
  endif()
endif()
307
# A build is "multi-arch" when the target's OSX_ARCHITECTURES property lists
# more than one architecture. The property is only queried on Apple platforms.
if(APPLE)
  get_target_property(ONNXRUNTIME_MLAS_MACOSX_ARCH onnxruntime_mlas OSX_ARCHITECTURES)
endif()
list(LENGTH ONNXRUNTIME_MLAS_MACOSX_ARCH ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH)
if(ONNXRUNTIME_MLAS_MACOSX_ARCH_LENGTH GREATER 1)
  set(ONNXRUNTIME_MLAS_MULTI_ARCH TRUE)
endif()

# When ONNXRUNTIME_MLAS_MULTI_ARCH is true, every architecture branch below has
# to be visited and MLAS is split into several static libraries. Otherwise the
# branches behave like one if(...) elseif(...) elseif(...) endif() chain:
# MLAS_SOURCE_IS_NOT_SET starts at 1 and the first branch that matches clears
# it, which disables the remaining ones.
set(MLAS_SOURCE_IS_NOT_SET 1)
# 32-bit ARM: NEON assembly and C++ kernels. NEON is switched on for both the
# assembler and the C++ compiler through the global flag variables.
if(ARM)
  enable_language(ASM)

  string(APPEND CMAKE_ASM_FLAGS " -mfpu=neon")
  string(APPEND CMAKE_CXX_FLAGS " -mfpu=neon")

  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/aarch32/QgemmU8X8KernelNeon.S
    ${MLAS_SRC_DIR}/arm/sgemmc.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
  )

  # Single-arch build: mark the sources as chosen so later branches are skipped.
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
# 64-bit ARM: NEON / dot-product kernels everywhere, plus fp16, i8mm and bf16
# kernels on non-Apple platforms.
if(ARM64 AND MLAS_SOURCE_IS_NOT_SET)
  enable_language(ASM)

  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelDot.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelDotLd64.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymU8KernelDot.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymS8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/ConvSymU8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/DepthwiseQConvSymS8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/DepthwiseQConvSymU8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/DepthwiseQConvKernelSize9Neon.S
    ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUdot.S
    ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSdot.S
    ${MLAS_SRC_DIR}/aarch64/SgemmKernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/SgemvKernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelNeon.S
    ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdot.S
    ${MLAS_SRC_DIR}/aarch64/SymQgemmS8KernelSdotLd64.S
    ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
  )
  # The int8 n-bit GEMM kernel is compiled with the dot-product extension.
  set_source_files_properties(
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp
    PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+dotprod"
  )

  if(NOT APPLE)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
      ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
      ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
      ${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
      ${MLAS_SRC_DIR}/activate_fp16.cpp
      ${MLAS_SRC_DIR}/dwconv.cpp
      ${MLAS_SRC_DIR}/halfgemm_kernel_neon.cpp
      ${MLAS_SRC_DIR}/pooling_fp16.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_smmla.cpp
      ${MLAS_SRC_DIR}/qgemm_kernel_ummla.cpp
      ${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
    )

    # Per-file -march switches, grouped by the extension each file is built with.
    set_source_files_properties(
      ${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S
      ${MLAS_SRC_DIR}/activate_fp16.cpp
      ${MLAS_SRC_DIR}/dwconv.cpp
      ${MLAS_SRC_DIR}/pooling_fp16.cpp
      PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 "
    )
    set_source_files_properties(
      ${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S
      ${MLAS_SRC_DIR}/aarch64/QgemmU8X8KernelUmmla.S
      PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm "
    )
    set_source_files_properties(
      ${MLAS_SRC_DIR}/aarch64/SbgemmKernelNeon.S
      ${MLAS_SRC_DIR}/sbgemm_kernel_neon.cpp
      PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+bf16 "
    )
  endif()

  if(ONNXRUNTIME_MLAS_MULTI_ARCH)
    # Multi-arch build: the arm64 kernels go into their own library, and the
    # source list is emptied for the next architecture branch.
    onnxruntime_add_static_library(onnxruntime_mlas_arm64 ${mlas_platform_srcs})
    set_target_properties(onnxruntime_mlas_arm64 PROPERTIES OSX_ARCHITECTURES "arm64")
    list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_arm64)
    set(mlas_platform_srcs)
  else()
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
# POWER: baseline VSX kernels, plus POWER9 and POWER10 (MMA) kernels when the
# compiler accepts the corresponding -mcpu switch.
if(POWER AND MLAS_SOURCE_IS_NOT_SET)
  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp
    ${MLAS_SRC_DIR}/dgemm.cpp
    ${MLAS_SRC_DIR}/power/DgemmKernelPower.cpp
    ${MLAS_SRC_DIR}/power/QuantizePower.cpp
  )
  set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPower.cpp PROPERTIES COMPILE_FLAGS "-DSINGLE")

  check_cxx_compiler_flag("-mcpu=power9" HAS_POWER9)
  if(HAS_POWER9)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/power/QuantizePowerVSX.cpp
    )
    set_source_files_properties(${MLAS_SRC_DIR}/power/QuantizePowerVSX.cpp PROPERTIES COMPILE_FLAGS "-mcpu=power9")
  endif()

  check_cxx_compiler_flag("-mcpu=power10" HAS_POWER10)
  if(HAS_POWER10)
    # The probes below must be compiled with -mcpu=power10. The previous value
    # of CMAKE_REQUIRED_FLAGS is saved and restored afterwards so the switch
    # does not leak into unrelated check_* calls later in the configure run.
    set(mlas_saved_required_flags "${CMAKE_REQUIRED_FLAGS}")
    set(CMAKE_REQUIRED_FLAGS "-mcpu=power10")

    # Probe 1: does the compiler provide the MMA built-ins?
    check_cxx_source_compiles("
      #include <altivec.h>
      int main() {
        __vector_quad acc0;
        __builtin_mma_xxsetaccz (&acc0);
        return 0;
      }"
      COMPILES_P10
    )
    if(COMPILES_P10)
      # Probe 2: are the symbols needed for the POWER10 runtime check
      # available (systemcfg on AIX, getauxval elsewhere)?
      # The non-AIX branch used to end with a stray closing brace, which made
      # this probe fail to compile, so HAS_P10_RUNTIME was never true there.
      # The result is cached; a build tree configured before this fix needs a
      # fresh cache to re-run the probe.
      check_cxx_source_compiles("
        #ifdef _AIX
        #define POWER_10 0x40000
        #define POWER_10_ANDUP (POWER_10)
        #include <sys/systemcfg.h>
        #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP)
        int main() {
          bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31);
          return 0;
        }
        #else
        #include <sys/auxv.h>
        int main() {
          unsigned long hwcap2 = getauxval(AT_HWCAP2);
          bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1));
          return 0;
        }
        #endif"
        HAS_P10_RUNTIME
      )
      if(HAS_P10_RUNTIME)
        set_source_files_properties(${MLAS_SRC_DIR}/platform.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
        set_source_files_properties(${MLAS_SRC_DIR}/qgemm.cpp PROPERTIES COMPILE_FLAGS "-DPOWER10")
      endif()

      set(mlas_platform_srcs_power10
        ${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp
        ${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp
        ${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp
      )
      set_source_files_properties(${MLAS_SRC_DIR}/power/SgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10 -DSINGLE")
      set_source_files_properties(${MLAS_SRC_DIR}/power/DgemmKernelPOWER10.cpp PROPERTIES COMPILE_FLAGS "-O2 -mcpu=power10")
      set_source_files_properties(${MLAS_SRC_DIR}/power/qgemm_kernel_power10.cpp PROPERTIES COMPILE_FLAGS "-O3 -mcpu=power10")
      list(APPEND mlas_platform_srcs
        ${mlas_platform_srcs_power10}
      )
    endif()

    set(CMAKE_REQUIRED_FLAGS "${mlas_saved_required_flags}")
    unset(mlas_saved_required_flags)
  endif()

  # Single-arch build: mark the sources as chosen so later branches are skipped.
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
# 32-bit x86: SSE2 and AVX kernels, each group compiled with its own -m switch.
if(X86 AND MLAS_SOURCE_IS_NOT_SET)
  enable_language(ASM)

  set(mlas_platform_srcs_sse2
    ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
    ${MLAS_SRC_DIR}/x86/SgemmKernelSse2.S
  )
  set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")

  set(mlas_platform_srcs_avx
    ${MLAS_SRC_DIR}/x86/SgemmKernelAvx.S
  )
  set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")

  set(mlas_platform_srcs
    ${mlas_platform_srcs_sse2}
    ${mlas_platform_srcs_avx}
  )

  # In r23, NDK remove __x86.get_pc_thunk.* from libatomic. Add our own
  # implementation to avoid external dependency.
  if(ANDROID)
    list(APPEND mlas_platform_srcs
      ${MLAS_SRC_DIR}/x86/x86.get_pc_thunk.S
    )
  endif()

  # Single-arch build: mark the sources as chosen so later branches are skipped.
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
# x86-64: kernels grouped by instruction-set extension, each group compiled
# with the matching -m switches.
if(X86_64 AND MLAS_SOURCE_IS_NOT_SET)
  enable_language(ASM)

  # Forward the flags for the minimum target platform version from the C
  # compiler to the assembler. This works around CMakeASMCompiler.cmake.in
  # not including the logic to set this flag for the assembler.
  set(CMAKE_ASM${ASM_DIALECT}_OSX_DEPLOYMENT_TARGET_FLAG "${CMAKE_C_OSX_DEPLOYMENT_TARGET_FLAG}")

  # The LLVM assembler does not support the .arch directive to enable instruction
  # set extensions and also doesn't support AVX-512F instructions without
  # turning on support via command-line option. Group the sources by the
  # instruction set extension and explicitly set the compiler flag as appropriate.

  set(mlas_platform_srcs_sse2
    ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelSse2.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelSse2.S
    ${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Sse2.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelSse2.S
    ${MLAS_SRC_DIR}/x86_64/SpoolKernelSse2.S
  )
  set_source_files_properties(${mlas_platform_srcs_sse2} PROPERTIES COMPILE_FLAGS "-msse2")

  set(mlas_platform_srcs_avx
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelM1Avx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelM1TransposeBAvx.S
    ${MLAS_SRC_DIR}/x86_64/SgemmTransposePackB16x4Avx.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx.S
    ${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx.S
    ${MLAS_SRC_DIR}/intrinsics/avx/min_max_elements.cpp
  )
  set_source_files_properties(${mlas_platform_srcs_avx} PROPERTIES COMPILE_FLAGS "-mavx")

  set(mlas_platform_srcs_avx2
    ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/QgemmU8U8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvxVnni.S
    ${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx2.S
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/TransKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/LogisticKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/TanhKernelFma3.S
    ${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S
    ${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp
    ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
  )

  message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
  message(STATUS "CMAKE_CXX_COMPILER_VERSION: ${CMAKE_CXX_COMPILER_VERSION}")

  # -mavxvnni is only passed to non-GNU compilers and to GCC 11 or newer; GCC
  # gained the switch in its 11 release series. The comparison is written as
  # "at least 11" because a "greater than 10" test would also accept GCC 10.x
  # point releases (10.2 > 10), which reject the flag.
  if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL "11")
    message(STATUS "Using -mavx2 -mfma -mavxvnni flags")
    set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mavxvnni")
  else()
    message(STATUS "Using -mavx2 -mfma flags")
    set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
  endif()

  set(mlas_platform_srcs_avx512f
    ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SconvKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SoftmaxKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/SpoolKernelAvx512F.S
    ${MLAS_SRC_DIR}/x86_64/TransKernelAvx512F.S
    ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
  )
  set_source_files_properties(${mlas_platform_srcs_avx512f} PROPERTIES COMPILE_FLAGS "-mavx512f")

  set(mlas_platform_srcs_avx512core
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Core.S
    ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Vnni.S
    ${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx512Core.S
    ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
  )
  set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl")

  set(mlas_platform_srcs_avx512vnni
    ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
  )
  set_source_files_properties(${mlas_platform_srcs_avx512vnni} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")

  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/activate_fp16.cpp
    ${MLAS_SRC_DIR}/dwconv.cpp
    ${MLAS_SRC_DIR}/dgemm.cpp
    ${MLAS_SRC_DIR}/pooling_fp16.cpp
    ${MLAS_SRC_DIR}/qgemm_kernel_avx2.cpp
    ${mlas_platform_srcs_sse2}
    ${mlas_platform_srcs_avx}
    ${mlas_platform_srcs_avx2}
    ${mlas_platform_srcs_avx512f}
    ${mlas_platform_srcs_avx512core}
    ${mlas_platform_srcs_avx512vnni}
  )

  # The q4 AVX-512 kernel is left out of minimal builds.
  if(NOT onnxruntime_ORT_MINIMAL_BUILD)
    set(mlas_platform_srcs
      ${mlas_platform_srcs}
      ${MLAS_SRC_DIR}/q4gemm_avx512.cpp
    )
    set_source_files_properties(${MLAS_SRC_DIR}/q4gemm_avx512.cpp PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")
  endif()

  # The AMX kernels are only built on non-Apple platforms.
  if(NOT APPLE)
    set(mlas_platform_srcs
      ${mlas_platform_srcs}
      ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmxCommon.S
      ${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp
      ${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S
    )
    set_source_files_properties(${MLAS_SRC_DIR}/qgemm_kernel_amx.cpp PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
    set_source_files_properties(${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
  endif()

  if(ONNXRUNTIME_MLAS_MULTI_ARCH)
    # Multi-arch build: the x86_64 kernels go into their own library, and the
    # source list is emptied afterwards.
    onnxruntime_add_static_library(onnxruntime_mlas_x86_64 ${mlas_platform_srcs})
    set_target_properties(onnxruntime_mlas_x86_64 PROPERTIES OSX_ARCHITECTURES "x86_64")
    list(APPEND ONNXRUNTIME_MLAS_LIBS onnxruntime_mlas_x86_64)
    set(mlas_platform_srcs )
  else()
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
# LoongArch64: LSX / LASX kernels. Both vector extensions are enabled through
# the global C++ flags.
if(LOONGARCH64 AND MLAS_SOURCE_IS_NOT_SET)
  set(mlas_platform_srcs
    ${MLAS_SRC_DIR}/qgemm_kernel_lsx.cpp
    ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/DgemmKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/SconvKernelLsx.S
    ${MLAS_SRC_DIR}/loongarch64/SconvKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLSX.S
    ${MLAS_SRC_DIR}/loongarch64/SpoolKernelLasx.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4LSX.S
    ${MLAS_SRC_DIR}/loongarch64/SgemmTransposePackB16x4Lasx.S
    ${MLAS_SRC_DIR}/loongarch64/SoftmaxKernelLasx.S
  )
  string(APPEND CMAKE_CXX_FLAGS " -mlsx -mlasx")

  # Single-arch build: mark the sources as chosen so the fallback is skipped.
  if(NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
    set(MLAS_SOURCE_IS_NOT_SET 0)
  endif()
endif()
# No architecture branch claimed the sources (single-arch builds only): fall
# back to the scalar kernels.
if(MLAS_SOURCE_IS_NOT_SET AND NOT ONNXRUNTIME_MLAS_MULTI_ARCH)
  file(GLOB_RECURSE mlas_platform_srcs
    "${MLAS_SRC_DIR}/scalar/*.cpp"
  )
endif()

# Attach whatever platform sources were selected above to the main library.
target_sources(onnxruntime_mlas PRIVATE ${mlas_platform_srcs})
endif()
661
# Give every MLAS library target the MLAS include paths, the GSL include
# directories, and a place in the "ONNXRuntime" IDE folder.
foreach(mlas_lib IN LISTS ONNXRUNTIME_MLAS_LIBS)
  target_include_directories(${mlas_lib} PRIVATE
    ${MLAS_INC_DIR}
    ${MLAS_SRC_DIR}
  )
  onnxruntime_add_include_to_target(${mlas_lib} ${GSL_TARGET})
  set_target_properties(${mlas_lib} PROPERTIES FOLDER "ONNXRuntime")
endforeach()
668
# Windows-only C++ compile options: warnings 6385 and 4127 are disabled, and
# static-analysis builds get a larger /analyze stack-size setting.
if(WIN32)
  target_compile_options(onnxruntime_mlas PRIVATE
    "$<$<COMPILE_LANGUAGE:CXX>:/wd6385>"
    "$<$<COMPILE_LANGUAGE:CXX>:/wd4127>"
  )
  if(onnxruntime_ENABLE_STATIC_ANALYSIS)
    target_compile_options(onnxruntime_mlas PRIVATE
      "$<$<COMPILE_LANGUAGE:CXX>:/analyze:stacksize 131072>"
    )
  endif()
endif()
675
# Mac Catalyst: MLAS needs the global C flags passed explicitly for its C
# compilation, i.e. this adds
# "--target=x86_64-apple-ios14.0-macabi -ffunction-sections -fdata-sections".
if(PLATFORM_NAME STREQUAL "macabi")
  target_compile_options(onnxruntime_mlas PRIVATE ${CMAKE_C_FLAGS})
endif()
681
# The static MLAS library is only installed when ONNX Runtime is not being
# built as a shared library.
if(NOT onnxruntime_BUILD_SHARED_LIB)
  install(
    TARGETS onnxruntime_mlas
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}
  )
endif()
689
690# set up source group for MLAS source files
# Collect, across all MLAS library targets, the sources that live under
# MLAS_ROOT and lay them out in the IDE following the directory tree. Sources
# outside MLAS_ROOT are skipped. block() keeps the helper variables local.
block()
  set(mlas_grouped_srcs)
  foreach(lib_target ${ONNXRUNTIME_MLAS_LIBS})
    get_target_property(lib_target_srcs ${lib_target} SOURCES)
    foreach(src_path ${lib_target_srcs})
      cmake_path(IS_PREFIX MLAS_ROOT ${src_path} is_under_mlas_root)
      if(is_under_mlas_root)
        list(APPEND mlas_grouped_srcs ${src_path})
      endif()
    endforeach()
  endforeach()
  source_group(TREE ${MLAS_ROOT} FILES ${mlas_grouped_srcs})
endblock()
704
705
if(NOT onnxruntime_ORT_MINIMAL_BUILD)

  #
  # Command line tool for quantization and de-quantization of 2-D fp32 tensors
  # based on block-wise quantization of int4. Not built in minimal builds.
  #
  onnxruntime_add_executable(onnxruntime_mlas_q4dq
    ${MLAS_SRC_DIR}/q4_dq_cli.cpp
  )
  target_include_directories(onnxruntime_mlas_q4dq PRIVATE
    ${MLAS_INC_DIR}
    ${MLAS_SRC_DIR}
  )
  set_target_properties(onnxruntime_mlas_q4dq PROPERTIES FOLDER "ONNXRuntimeTest")

  # Link dependencies. The calls are kept separate and in their original
  # order, each guarded by the platform or option it applies to.
  target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
  if(CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE cpuinfo)
  endif()
  if(NOT WIN32)
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS})
  endif()
  if(CMAKE_SYSTEM_NAME STREQUAL "Android")
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${android_shared_libs})
  endif()
  if(WIN32)
    # NOTE(review): the "debug" keyword applies only to the item right after
    # it (Dbghelp); Advapi32 is linked in every configuration. Confirm this is
    # the intent.
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE debug Dbghelp Advapi32)
  endif()
  if(onnxruntime_LINK_LIBATOMIC)
    target_link_libraries(onnxruntime_mlas_q4dq PRIVATE atomic)
  endif()
  target_link_libraries(onnxruntime_mlas_q4dq PRIVATE Threads::Threads)

  # WebAssembly link flags: memory growth always; the threaded build also
  # proxies main to a pthread and exits the runtime when main returns.
  if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
    if(onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
      set_target_properties(onnxruntime_mlas_q4dq PROPERTIES
        LINK_FLAGS "-s ALLOW_MEMORY_GROWTH=1 -s PROXY_TO_PTHREAD=1 -s EXIT_RUNTIME=1"
      )
    else()
      set_target_properties(onnxruntime_mlas_q4dq PROPERTIES
        LINK_FLAGS "-s ALLOW_MEMORY_GROWTH=1"
      )
    endif()
  endif()

endif()
747