From b3f727782f62402f0c1bd2d6d548628c2cf64f0d Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Wed, 29 Mar 2023 09:07:56 -0700 Subject: [PATCH 1/4] Checkpoint Jiterpreter SIMD support Fix bugs in the jiterpreter-support import management implementation Enable interpreter SIMD for WASM Minor cleanups Fix merge Checkpoint PackedSimd Implement a few PackedSimd methods in the interp Add packedsimd ops to jiterpreter Move the interp simd opcode -> wasm opcode mapping into the C tables Add intrinsic ids for most of the remaining packedsimd methods Map most of the PackedSimd methods to intrinsics Update genmintops Fix merge damage Fix build Add more wasm opcodes Add more wasm opcodes Add bitmask intrinsics Add missing opcodes to transform-simd Use HOST_BROWSER instead of HOST_WASM to fix wasi build Implement the pack-n-elements vector instructions Disable bitselect because it's broken somehow Simplify vector packing Add browser-bench measurements for packing vectors Disable more opcodes Disable more opcodes Fix PackedSimd feature detection on non-wasm targets Maybe fix linux interp assertion Don't fail transform for unsupported PackedSimd methods on non-browser targets Fix i64 popcnt Add basic R4 v128 intrinsics (add/sub/div/mul) Re-enable more jiterpreter simd --- src/mono/CMakeLists.txt | 1 + src/mono/mono/mini/interp/interp-internals.h | 8 +- .../mono/mini/interp/interp-simd-intrins.def | 266 ++++++++---- src/mono/mono/mini/interp/interp-simd.c | 137 +++++- src/mono/mono/mini/interp/interp-simd.h | 6 + src/mono/mono/mini/interp/interp.c | 38 ++ src/mono/mono/mini/interp/mintops.h | 18 +- src/mono/mono/mini/interp/simd-methods.def | 20 + src/mono/mono/mini/interp/transform-simd.c | 264 +++++++++++- src/mono/mono/utils/options-def.h | 2 + src/mono/sample/wasm/browser-bench/Vector.cs | 21 + src/mono/wasm/runtime/CMakeLists.txt | 1 + src/mono/wasm/runtime/cwraps.ts | 4 + src/mono/wasm/runtime/genmintops.py | 62 ++- .../wasm/runtime/jiterpreter-interp-entry.ts | 9 +- 
src/mono/wasm/runtime/jiterpreter-jit-call.ts | 17 +- src/mono/wasm/runtime/jiterpreter-opcodes.ts | 246 +++++++++++ src/mono/wasm/runtime/jiterpreter-support.ts | 141 ++++-- .../runtime/jiterpreter-trace-generator.ts | 401 +++++++++++++++++- src/mono/wasm/runtime/jiterpreter.ts | 50 ++- .../runtime/wasm-simd-feature-detect.wasm | Bin 0 -> 39 bytes .../wasm/runtime/wasm-simd-feature-detect.wat | 6 + src/mono/wasm/wasm.proj | 8 +- 23 files changed, 1531 insertions(+), 195 deletions(-) create mode 100644 src/mono/wasm/runtime/wasm-simd-feature-detect.wasm create mode 100644 src/mono/wasm/runtime/wasm-simd-feature-detect.wat diff --git a/src/mono/CMakeLists.txt b/src/mono/CMakeLists.txt index 92a0ac8ea82a5..5ff644b7cd57d 100644 --- a/src/mono/CMakeLists.txt +++ b/src/mono/CMakeLists.txt @@ -270,6 +270,7 @@ elseif(CLR_CMAKE_HOST_OS STREQUAL "emscripten") add_compile_options(-Wno-strict-prototypes) add_compile_options(-Wno-unused-but-set-variable) add_compile_options(-Wno-single-bit-bitfield-constant-conversion) + add_compile_options(-msimd128) set(DISABLE_EXECUTABLES 1) # FIXME: Is there a cmake option for this ? 
set(DISABLE_SHARED_LIBS 1) diff --git a/src/mono/mono/mini/interp/interp-internals.h b/src/mono/mono/mini/interp/interp-internals.h index a2bff18e2ef32..dc38222a8ff38 100644 --- a/src/mono/mono/mini/interp/interp-internals.h +++ b/src/mono/mono/mini/interp/interp-internals.h @@ -102,7 +102,7 @@ typedef enum { #define PROFILE_INTERP 0 -#if !HOST_BROWSER && __GNUC__ +#if __GNUC__ #define INTERP_ENABLE_SIMD #endif @@ -342,6 +342,12 @@ mono_jiterp_stackval_from_data (MonoType *type, stackval *result, const void *da gpointer mono_jiterp_frame_data_allocator_alloc (FrameDataAllocator *stack, InterpFrame *frame, int size); +gpointer +mono_jiterp_get_simd_intrinsic (int arity, int index); + +int +mono_jiterp_get_simd_opcode (int arity, int index); + #endif static inline int diff --git a/src/mono/mono/mini/interp/interp-simd-intrins.def b/src/mono/mono/mini/interp/interp-simd-intrins.def index 57bbba1717d7b..9ed37a34b1287 100644 --- a/src/mono/mono/mini/interp/interp-simd-intrins.def +++ b/src/mono/mono/mini/interp/interp-simd-intrins.def @@ -1,81 +1,185 @@ -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_ADD, interp_v128_i1_op_addition) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_ADD, interp_v128_i2_op_addition) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_ADD, interp_v128_i4_op_addition) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SUB, interp_v128_i1_op_subtraction) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SUB, interp_v128_i2_op_subtraction) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op_subtraction) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality) -INTERP_SIMD_INTRINSIC_P_PP 
(INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, interp_v128_op_bitwise_inequality) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR, interp_v128_op_exclusive_or) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY, interp_v128_i1_op_multiply) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY, interp_v128_i2_op_multiply) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY, interp_v128_i4_op_multiply) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_NEGATION, interp_v128_i1_op_negation) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_NEGATION, interp_v128_i2_op_negation) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_NEGATION, interp_v128_i4_op_negation) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT, interp_v128_i1_op_left_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT, interp_v128_i2_op_left_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT, interp_v128_i4_op_left_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT, interp_v128_i8_op_left_shift) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT, interp_v128_i1_op_right_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT, interp_v128_i2_op_right_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT, interp_v128_i4_op_right_shift) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT, interp_v128_i1_op_uright_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT, interp_v128_i2_op_uright_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT, interp_v128_i4_op_uright_shift) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT, interp_v128_i8_op_uright_shift) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT, 
interp_v128_op_ones_complement) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER, interp_v128_u2_widen_lower) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER, interp_v128_u2_widen_upper) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_NARROW, interp_v128_u1_narrow) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN, interp_v128_u1_greater_than) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN, interp_v128_i1_less_than) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN, interp_v128_u1_less_than) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN, interp_v128_i2_less_than) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_EQUALS, interp_v128_i1_equals) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_EQUALS, interp_v128_i2_equals) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_EQUALS, interp_v128_i4_equals) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_EQUALS, interp_v128_i8_equals) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR, interp_v128_i1_create_scalar) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR, interp_v128_i2_create_scalar) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR, interp_v128_i4_create_scalar) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR, interp_v128_i8_create_scalar) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB, interp_v128_i1_extract_msb) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB, interp_v128_i2_extract_msb) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB, interp_v128_i4_extract_msb) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB, interp_v128_i8_extract_msb) - -INTERP_SIMD_INTRINSIC_P_PPP (INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT, 
interp_v128_conditional_select) - -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE, interp_v128_i1_create) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE, interp_v128_i2_create) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE, interp_v128_i4_create) -INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE, interp_v128_i8_create) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_not) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal) - -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle) -INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle) +// FIXME: SIMD causes compile errors on WASI +#ifdef HOST_BROWSER +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_P +#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_V +#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_I_V +#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VV +#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PP(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VI +#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PP(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#ifndef 
INTERP_WASM_SIMD_INTRINSIC_V_VVV +#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PPP(id, _mono_interp_simd_ ## id, wasm_opcode) +#endif +#else // HOST_BROWSER +#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) +#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) +#endif // HOST_BROWSER + +// The third argument is the wasm opcode that corresponds to this simd intrinsic, if any. +// Specify 0 if there is no exact 1:1 mapping (the opcode can still be implemented manually in the jiterpreter.) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_ADD, interp_v128_i1_op_addition, 110) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_ADD, interp_v128_i2_op_addition, 142) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_ADD, interp_v128_i4_op_addition, 174) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_ADD, interp_v128_r4_op_addition, 228) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SUB, interp_v128_i1_op_subtraction, 113) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SUB, interp_v128_i2_op_subtraction, 145) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op_subtraction, 177) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_SUB, interp_v128_r4_op_subtraction, 229) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and, 78) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or, 80) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality, 0) +INTERP_SIMD_INTRINSIC_P_PP 
(INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, interp_v128_op_bitwise_inequality, 0) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR, interp_v128_op_exclusive_or, 81) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY, interp_v128_i1_op_multiply, 0) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY, interp_v128_i2_op_multiply, 149) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY, interp_v128_i4_op_multiply, 181) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_MULTIPLY, interp_v128_r4_op_multiply, 230) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_DIVISION, interp_v128_r4_op_division, 231) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_NEGATION, interp_v128_i1_op_negation, 97) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_NEGATION, interp_v128_i2_op_negation, 129) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_NEGATION, interp_v128_i4_op_negation, 161) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT, interp_v128_i1_op_left_shift, 107) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT, interp_v128_i2_op_left_shift, 139) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT, interp_v128_i4_op_left_shift, 171) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT, interp_v128_i8_op_left_shift, 203) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT, interp_v128_i1_op_right_shift, 108) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT, interp_v128_i2_op_right_shift, 140) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT, interp_v128_i4_op_right_shift, 172) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT, interp_v128_i1_op_uright_shift, 109) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT, interp_v128_i2_op_uright_shift, 141) 
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT, interp_v128_i4_op_uright_shift, 173) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT, interp_v128_i8_op_uright_shift, 205) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT, interp_v128_op_ones_complement, 77) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER, interp_v128_u2_widen_lower, 137) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER, interp_v128_u2_widen_upper, 138) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_NARROW, interp_v128_u1_narrow, 102) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN, interp_v128_u1_greater_than, 40) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN, interp_v128_i1_less_than, 37) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN, interp_v128_u1_less_than, 38) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN, interp_v128_i2_less_than, 47) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_EQUALS, interp_v128_i1_equals, 35) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_EQUALS, interp_v128_i2_equals, 45) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_EQUALS, interp_v128_i4_equals, 55) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_EQUALS, interp_v128_i8_equals, 214) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR, interp_v128_i1_create_scalar, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR, interp_v128_i2_create_scalar, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR, interp_v128_i4_create_scalar, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR, interp_v128_i8_create_scalar, 0) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB, interp_v128_i1_extract_msb, 0) +INTERP_SIMD_INTRINSIC_P_P 
(INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB, interp_v128_i2_extract_msb, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB, interp_v128_i4_extract_msb, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB, interp_v128_i8_extract_msb, 0) + +// wasm opcode is 0 because it has a different calling convention +INTERP_SIMD_INTRINSIC_P_PPP (INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT, interp_v128_conditional_select, 0) + +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE, interp_v128_i1_create, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE, interp_v128_i2_create, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE, interp_v128_i4_create, 0) +INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE, interp_v128_i8_create, 0) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_not, 79) + +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal, 52) + +// wasm only has a swizzle opcode for i8x16, none of the others +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 14) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle, 0) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle, 0) +INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle, 0) + +// Wasm PackedSimd (see PackedSimd.cs) +// We automatically generate C wrappers around clang's wasm simd intrinsics for each of these intrinsics +// The 2nd argument is the name of the clang intrinsic and the 3rd argument is the wasm opcode. 
+ +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I8X16_SPLAT, wasm_v128_load8_splat, 0x07) +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I16X8_SPLAT, wasm_v128_load16_splat, 0x08) +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I32X4_SPLAT, wasm_v128_load32_splat, 0x09) +INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I64X2_SPLAT, wasm_v128_load64_splat, 0x0a) +// FIXME: ExtractLane and ReplaceLane +// FIXME: Shuffle +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_SWIZZLE, wasm_i8x16_swizzle, 0x0e) +// FIXME: f32/f64 versions of add/subtract/multiply/negate are missing +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_ADD, wasm_i8x16_add, 0x6e) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_ADD, wasm_i16x8_add, 0x8e) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_ADD, wasm_i32x4_add, 0xae) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_ADD, wasm_i64x2_add, 0xce) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_SUBTRACT, wasm_i8x16_sub, 0x71) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_SUBTRACT, wasm_i16x8_sub, 0x91) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_SUBTRACT, wasm_i32x4_sub, 0xb1) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_SUBTRACT, wasm_i64x2_sub, 0xd1) +// There is no i8x16 mul opcode +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_MULTIPLY, _interp_wasm_simd_assert_not_reached, 0x0) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_MULTIPLY, wasm_i16x8_mul, 0x95) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_MULTIPLY, wasm_i32x4_mul, 0xb5) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_MULTIPLY, wasm_i64x2_mul, 0xd5) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_DOT_I16X8, wasm_i32x4_dot_i16x8, 0xba) 
+INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I8X16_NEGATE, wasm_i8x16_neg, 0x61) +INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I16X8_NEGATE, wasm_i16x8_neg, 0x81) +INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I32X4_NEGATE, wasm_i32x4_neg, 0xa1) +INTERP_WASM_SIMD_INTRINSIC_V_V (INTERP_SIMD_INTRINSIC_WASM_I64X2_NEGATE, wasm_i64x2_neg, 0xc1) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTLEFT, wasm_i8x16_shl, 0x6b) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTLEFT, wasm_i16x8_shl, 0x8b) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTLEFT, wasm_i32x4_shl, 0xab) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTLEFT, wasm_i64x2_shl, 0xcb) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTARITHMETIC, wasm_i8x16_shr, 0x6c) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTRIGHTARITHMETIC, wasm_i16x8_shr, 0x8c) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTRIGHTARITHMETIC, wasm_i32x4_shr, 0xac) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTRIGHTARITHMETIC, wasm_i64x2_shr, 0xcc) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTLOGICAL, wasm_u8x16_shr, 0x6d) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTRIGHTLOGICAL, wasm_u16x8_shr, 0x8d) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTRIGHTLOGICAL, wasm_u32x4_shr, 0xad) +INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTRIGHTLOGICAL, wasm_u64x2_shr, 0xcd) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_AND, wasm_v128_and, 0x4e) +// FIXME: NOT, OR, XOR +INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I8X16_BITMASK, wasm_i8x16_bitmask, 0x64) +INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I16X8_BITMASK, wasm_i16x8_bitmask, 0x84) 
+INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I32X4_BITMASK, wasm_i32x4_bitmask, 0xa4) +INTERP_WASM_SIMD_INTRINSIC_I_V (INTERP_SIMD_INTRINSIC_WASM_I64X2_BITMASK, wasm_i64x2_bitmask, 0xc4) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, wasm_i8x16_eq, 0x23) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPAREEQUAL, wasm_i16x8_eq, 0x2d) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPAREEQUAL, wasm_i32x4_eq, 0x37) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPAREEQUAL, wasm_i64x2_eq, 0xd6) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPAREEQUAL, wasm_f32x4_eq, 0x41) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPAREEQUAL, wasm_f64x2_eq, 0x47) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPARENOTEQUAL, wasm_i8x16_ne, 0x24) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPARENOTEQUAL, wasm_i16x8_ne, 0x2e) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPARENOTEQUAL, wasm_i32x4_ne, 0x38) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPARENOTEQUAL, wasm_i64x2_ne, 0xd7) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPARENOTEQUAL, wasm_f32x4_ne, 0x42) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPARENOTEQUAL, wasm_f64x2_ne, 0x48) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_S, wasm_i8x16_narrow_i16x8, 0x65) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_S, wasm_i16x8_narrow_i32x4, 0x85) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_U, wasm_u8x16_narrow_i16x8, 0x66) +INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_U, wasm_u16x8_narrow_i32x4, 0x86) diff --git a/src/mono/mono/mini/interp/interp-simd.c b/src/mono/mono/mini/interp/interp-simd.c 
index f67370e478d1a..09e90a997ac6e 100644 --- a/src/mono/mono/mini/interp/interp-simd.c +++ b/src/mono/mono/mini/interp/interp-simd.c @@ -2,6 +2,10 @@ #include "interp-internals.h" #include "interp-simd.h" +#if HOST_BROWSER +#include <wasm_simd128.h> +#endif + #ifdef INTERP_ENABLE_SIMD typedef gint64 v128_i8 __attribute__ ((vector_size (SIZEOF_V128))); @@ -12,6 +16,7 @@ typedef gint16 v128_i2 __attribute__ ((vector_size (SIZEOF_V128))); typedef guint16 v128_u2 __attribute__ ((vector_size (SIZEOF_V128))); typedef gint8 v128_i1 __attribute__ ((vector_size (SIZEOF_V128))); typedef guint8 v128_u1 __attribute__ ((vector_size (SIZEOF_V128))); +typedef float v128_r4 __attribute__ ((vector_size (SIZEOF_V128))); // get_AllBitsSet static void @@ -39,6 +44,12 @@ interp_v128_i4_op_addition (gpointer res, gpointer v1, gpointer v2) *(v128_i4*)res = *(v128_i4*)v1 + *(v128_i4*)v2; } +static void +interp_v128_r4_op_addition (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 + *(v128_r4*)v2; +} + // op_Subtraction static void interp_v128_i1_op_subtraction (gpointer res, gpointer v1, gpointer v2) @@ -58,6 +69,12 @@ interp_v128_i4_op_subtraction (gpointer res, gpointer v1, gpointer v2) *(v128_i4*)res = *(v128_i4*)v1 - *(v128_i4*)v2; } +static void +interp_v128_r4_op_subtraction (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 - *(v128_r4*)v2; +} + // op_BitwiseAnd static void interp_v128_op_bitwise_and (gpointer res, gpointer v1, gpointer v2) @@ -124,6 +141,18 @@ interp_v128_i4_op_multiply (gpointer res, gpointer v1, gpointer v2) *(v128_i4*)res = *(v128_i4*)v1 * *(v128_i4*)v2; } +static void +interp_v128_r4_op_multiply (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 * *(v128_r4*)v2; +} + +static void +interp_v128_r4_op_division (gpointer res, gpointer v1, gpointer v2) +{ + *(v128_r4*)res = *(v128_r4*)v1 / *(v128_r4*)v2; +} + // op_UnaryNegation static void interp_v128_i1_op_negation (gpointer res, gpointer v1) @@ 
-535,32 +564,122 @@ interp_v128_i8_shuffle (gpointer res, gpointer v1, gpointer v2) V128_SHUFFLE (gint64, guint64); } -#define INTERP_SIMD_INTRINSIC_P_P(a,b) -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) + +// For the wasm packed simd intrinsics we want to automatically generate the C implementations from +// their corresponding clang intrinsics. See also: +// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/wasm_simd128.h +// In this context V means Vector128 and P means void* pointer. +#ifdef HOST_BROWSER + +static v128_t +_interp_wasm_simd_assert_not_reached (v128_t lhs, v128_t rhs) { + g_assert_not_reached (); +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \ + *((v128_t *)res) = c_intrinsic (v1); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \ + *((int32_t *)res) = c_intrinsic (*((v128_t *)v1)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((int *)v2)); \ +} + +#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) \ +static void \ +_mono_interp_simd_ ## id 
(gpointer res, gpointer v1, gpointer v2, gpointer v3) { \ + *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2), *((v128_t *)v3)); \ +} + +#include "interp-simd-intrins.def" + +#undef INTERP_WASM_SIMD_INTRINSIC_V_P +#undef INTERP_WASM_SIMD_INTRINSIC_V_V +#undef INTERP_WASM_SIMD_INTRINSIC_I_V +#undef INTERP_WASM_SIMD_INTRINSIC_V_VV +#undef INTERP_WASM_SIMD_INTRINSIC_V_VI +#undef INTERP_WASM_SIMD_INTRINSIC_V_VVV + +// Now generate the wasm opcode tables for the intrinsics + +#undef INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) c, + +int interp_simd_p_p_wasm_opcode_table [] = { +#include "interp-simd-intrins.def" +}; + +#undef INTERP_SIMD_INTRINSIC_P_P +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) + +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) c, + +int interp_simd_p_pp_wasm_opcode_table [] = { +#include "interp-simd-intrins.def" +}; + +#undef INTERP_SIMD_INTRINSIC_P_PP +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) + +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) c, + +int interp_simd_p_ppp_wasm_opcode_table [] = { +#include "interp-simd-intrins.def" +}; + +#undef INTERP_SIMD_INTRINSIC_P_PPP +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) + +#endif // HOST_BROWSER #undef INTERP_SIMD_INTRINSIC_P_P -#define INTERP_SIMD_INTRINSIC_P_P(a,b) b, +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) b, PP_SIMD_Method interp_simd_p_p_table [] = { #include "interp-simd-intrins.def" }; #undef INTERP_SIMD_INTRINSIC_P_P -#define INTERP_SIMD_INTRINSIC_P_P(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) b, +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) b, PPP_SIMD_Method interp_simd_p_pp_table [] = { #include "interp-simd-intrins.def" }; #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) b, 
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) b, PPPP_SIMD_Method interp_simd_p_ppp_table [] = { #include "interp-simd-intrins.def" }; #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) #endif // INTERP_ENABLE_SIMD diff --git a/src/mono/mono/mini/interp/interp-simd.h b/src/mono/mono/mini/interp/interp-simd.h index 3763c571069ba..e3306a251fc9f 100644 --- a/src/mono/mono/mini/interp/interp-simd.h +++ b/src/mono/mono/mini/interp/interp-simd.h @@ -11,6 +11,12 @@ extern PP_SIMD_Method interp_simd_p_p_table []; extern PPP_SIMD_Method interp_simd_p_pp_table []; extern PPPP_SIMD_Method interp_simd_p_ppp_table []; +#if HOST_BROWSER +extern int interp_simd_p_p_wasm_opcode_table []; +extern int interp_simd_p_pp_wasm_opcode_table []; +extern int interp_simd_p_ppp_wasm_opcode_table []; +#endif + #endif /* __MONO_MINI_INTERP_SIMD_H__ */ diff --git a/src/mono/mono/mini/interp/interp.c b/src/mono/mono/mini/interp/interp.c index f3802f7afe5e8..9e1e9e1e8561b 100644 --- a/src/mono/mono/mini/interp/interp.c +++ b/src/mono/mono/mini/interp/interp.c @@ -8907,4 +8907,42 @@ mono_jiterp_enum_hasflag (MonoClass *klass, gint32 *dest, stackval *sp1, stackva *dest = mono_interp_enum_hasflag (sp1, sp2, klass); } +EMSCRIPTEN_KEEPALIVE gpointer +mono_jiterp_get_simd_intrinsic (int arity, int index) +{ +#ifdef INTERP_ENABLE_SIMD + switch (arity) { + case 1: + return interp_simd_p_p_table [index]; + case 2: + return interp_simd_p_pp_table [index]; + case 3: + return interp_simd_p_ppp_table [index]; + default: + g_assert_not_reached(); + } +#else + g_assert_not_reached(); +#endif +} + +EMSCRIPTEN_KEEPALIVE int +mono_jiterp_get_simd_opcode (int arity, int index) +{ +#ifdef INTERP_ENABLE_SIMD + switch (arity) { + case 1: + return interp_simd_p_p_wasm_opcode_table [index]; + case 2: + return interp_simd_p_pp_wasm_opcode_table [index]; + case 3: + return interp_simd_p_ppp_wasm_opcode_table [index]; + default: + g_assert_not_reached(); + 
} +#else + g_assert_not_reached(); +#endif +} + #endif diff --git a/src/mono/mono/mini/interp/mintops.h b/src/mono/mono/mini/interp/mintops.h index 021a4399fe307..2849cec1778ff 100644 --- a/src/mono/mono/mini/interp/mintops.h +++ b/src/mono/mono/mini/interp/mintops.h @@ -41,35 +41,35 @@ typedef enum { /* SIMD opcodes, grouped by signature */ -#define INTERP_SIMD_INTRINSIC_P_P(a,b) -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_P -#define INTERP_SIMD_INTRINSIC_P_P(a,b) a, +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) a, typedef enum { #include "interp-simd-intrins.def" } MintSIMDOpsPP; #undef INTERP_SIMD_INTRINSIC_P_P -#define INTERP_SIMD_INTRINSIC_P_P(a,b) +#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) a, +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) a, typedef enum { #include "interp-simd-intrins.def" INTERP_SIMD_INTRINSIC_P_PP_LAST } MintSIMDOpsPPP; #undef INTERP_SIMD_INTRINSIC_P_PP -#define INTERP_SIMD_INTRINSIC_P_PP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) a, +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) a, typedef enum { #include "interp-simd-intrins.def" INTERP_SIMD_INTRINSIC_P_PPP_LAST } MintSIMDOpsPPPP; #undef INTERP_SIMD_INTRINSIC_P_PPP -#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) +#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) #if NO_UNALIGNED_ACCESS # if G_BYTE_ORDER == G_LITTLE_ENDIAN diff --git a/src/mono/mono/mini/interp/simd-methods.def b/src/mono/mono/mini/interp/simd-methods.def index 57b87d028de94..4eb76e178558b 100644 --- a/src/mono/mono/mini/interp/simd-methods.def +++ b/src/mono/mono/mini/interp/simd-methods.def @@ -1,12 +1,14 @@ SIMD_METHOD(get_Count) SIMD_METHOD(get_AllBitsSet) 
SIMD_METHOD(get_IsHardwareAccelerated) +SIMD_METHOD(get_IsSupported) SIMD_METHOD(get_Item) SIMD_METHOD(get_One) SIMD_METHOD(get_Zero) SIMD_METHOD(op_Addition) SIMD_METHOD(op_BitwiseAnd) SIMD_METHOD(op_BitwiseOr) +SIMD_METHOD(op_Division) SIMD_METHOD(op_Equality) SIMD_METHOD(op_ExclusiveOr) SIMD_METHOD(op_Explicit) @@ -24,6 +26,7 @@ SIMD_METHOD(ConditionalSelect) SIMD_METHOD(Create) SIMD_METHOD(CreateScalar) SIMD_METHOD(CreateScalarUnsafe) + SIMD_METHOD(Equals) SIMD_METHOD(ExtractMostSignificantBits) SIMD_METHOD(GreaterThan) @@ -36,3 +39,20 @@ SIMD_METHOD(ShiftRightLogical) SIMD_METHOD(Shuffle) SIMD_METHOD(WidenLower) SIMD_METHOD(WidenUpper) + +// PackedSimd +SIMD_METHOD(Splat) +SIMD_METHOD(ExtractLane) +SIMD_METHOD(ReplaceLane) +SIMD_METHOD(Swizzle) +SIMD_METHOD(Add) +SIMD_METHOD(Subtract) +SIMD_METHOD(Multiply) +SIMD_METHOD(Dot) +SIMD_METHOD(Negate) +SIMD_METHOD(And) +SIMD_METHOD(Bitmask) +SIMD_METHOD(CompareEqual) +SIMD_METHOD(CompareNotEqual) +SIMD_METHOD(ConvertNarrowingSignedSaturate) +SIMD_METHOD(ConvertNarrowingUnsignedSaturate) diff --git a/src/mono/mono/mini/interp/transform-simd.c b/src/mono/mono/mini/interp/transform-simd.c index a46f7555e14fe..c441753d10e34 100644 --- a/src/mono/mono/mini/interp/transform-simd.c +++ b/src/mono/mono/mini/interp/transform-simd.c @@ -2,6 +2,8 @@ * SIMD Intrinsics support for interpreter */ +#include "config.h" +#include #include // We use the same approach as jit/aot for identifying simd methods. 
@@ -61,7 +63,7 @@ static guint16 sri_vector128_methods [] = { SN_Shuffle, SN_WidenLower, SN_WidenUpper, - SN_get_IsHardwareAccelerated + SN_get_IsHardwareAccelerated, }; static guint16 sri_vector128_t_methods [] = { @@ -72,6 +74,7 @@ static guint16 sri_vector128_t_methods [] = { SN_op_Addition, SN_op_BitwiseAnd, SN_op_BitwiseOr, + SN_op_Division, SN_op_Equality, SN_op_ExclusiveOr, SN_op_Inequality, @@ -84,6 +87,60 @@ static guint16 sri_vector128_t_methods [] = { SN_op_UnsignedRightShift }; +static guint16 sri_packedsimd_methods [] = { + SN_ConvertNarrowingSignedSaturate, + SN_ConvertNarrowingUnsignedSaturate, + SN_Swizzle, + SN_get_IsHardwareAccelerated, + SN_get_IsSupported, +}; + +#if HOST_BROWSER + +/* + * maps from INTERP_SIMD_INTRINSIC_WASM_I8X16_xxx to the correct one for the return type, + * assuming that they are laid out sequentially like this: + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, wasm_i8x16_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPAREEQUAL, wasm_i16x8_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPAREEQUAL, wasm_i32x4_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPAREEQUAL, wasm_i64x2_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPAREEQUAL, wasm_f32x4_eq, 0x0) + * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPAREEQUAL, wasm_f64x2_eq, 0x0) + * It is your responsibility to ensure that it's actually laid out this way! + */ + +static int sri_packedsimd_offset_from_atype [] = { + -1, // MONO_TYPE_END = 0x00, + -1, // MONO_TYPE_VOID = 0x01, + // FIXME: Should this be 2, for I4? 
+ 0, // MONO_TYPE_BOOLEAN = 0x02, + 1, // MONO_TYPE_CHAR = 0x03, + 0, // MONO_TYPE_I1 = 0x04, + 0, // MONO_TYPE_U1 = 0x05, + 1, // MONO_TYPE_I2 = 0x06, + 1, // MONO_TYPE_U2 = 0x07, + 2, // MONO_TYPE_I4 = 0x08, + 2, // MONO_TYPE_U4 = 0x09, + 3, // MONO_TYPE_I8 = 0x0a, + 3, // MONO_TYPE_U8 = 0x0b, + 4, // MONO_TYPE_R4 = 0x0c, + 5, // MONO_TYPE_R8 = 0x0d, + -1, // MONO_TYPE_STRING = 0x0e, + 2, // MONO_TYPE_PTR = 0x0f, + -1, // MONO_TYPE_BYREF = 0x10, + -1, // MONO_TYPE_VALUETYPE = 0x11, + -1, // MONO_TYPE_CLASS = 0x12, + -1, // MONO_TYPE_VAR = 0x13, + -1, // MONO_TYPE_ARRAY = 0x14, + -1, // MONO_TYPE_GENERICINST= 0x15, + -1, // MONO_TYPE_TYPEDBYREF = 0x16, + 2, // MONO_TYPE_I = 0x18, + 2, // MONO_TYPE_U = 0x19, +}; + +static const int sri_packedsimd_offset_from_atype_length = sizeof(sri_packedsimd_offset_from_atype) / sizeof(sri_packedsimd_offset_from_atype[0]); +#endif // HOST_BROWSER + static gboolean emit_sri_vector128 (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) { @@ -373,26 +430,36 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur } break; case SN_op_LeftShift: - g_assert (scalar_arg == 1); + if (scalar_arg != 1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT; else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT; else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT; else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT; break; + case SN_op_Division: + if (scalar_arg != -1) + return FALSE; + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_R4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_R4_DIVISION; + break; case SN_op_Multiply: - g_assert (scalar_arg == -1); + if (scalar_arg != -1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = 
INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY; else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY; else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY; + else if (atype == MONO_TYPE_R4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_R4_MULTIPLY; break; case SN_op_OnesComplement: simd_opcode = MINT_SIMD_INTRINS_P_P; simd_intrins = INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT; break; case SN_op_RightShift: - g_assert (scalar_arg == 1); + if (scalar_arg != 1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT; else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT; @@ -414,7 +481,8 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_NEGATION; break; case SN_op_UnsignedRightShift: - g_assert (scalar_arg == 1); + if (scalar_arg != 1) + return FALSE; simd_opcode = MINT_SIMD_INTRINS_P_PP; if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT; else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT; @@ -448,6 +516,182 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur return TRUE; } +#if HOST_BROWSER +static int +map_packedsimd_intrins_based_on_atype (MonoTypeEnum atype, int base_intrins, gboolean allow_float) +{ + int max_offset = allow_float ? 
5 : 3; + if ((atype < 0) || (atype >= sri_packedsimd_offset_from_atype_length)) + return -1; + int offset = sri_packedsimd_offset_from_atype [atype]; + if ((offset < 0) || (offset > max_offset)) + return -1; + return base_intrins + offset; +} +#endif + +static gboolean +emit_sri_packedsimd (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) +{ + int id = lookup_intrins (sri_packedsimd_methods, sizeof (sri_packedsimd_methods), cmethod); + if (id == -1) + return FALSE; + + MonoClass *vector_klass = mono_class_from_mono_type_internal (csignature->ret); + int vector_size = -1; + + if ((id == SN_get_IsSupported) || (id == SN_get_IsHardwareAccelerated)) { +#if HOST_BROWSER + interp_add_ins (td, MINT_LDC_I4_1); +#else + interp_add_ins (td, MINT_LDC_I4_0); +#endif + goto opcode_added; + } + +#if HOST_BROWSER + gint16 simd_opcode = -1; + gint16 simd_intrins = -1; + if (!m_class_is_simd_type (vector_klass)) + vector_klass = mono_class_from_mono_type_internal (csignature->params [0]); + if (!m_class_is_simd_type (vector_klass)) + return FALSE; + + vector_size = mono_class_value_size (vector_klass, NULL); + g_assert (vector_size == SIZEOF_V128); + + MonoType *arg_type = mono_class_get_context (vector_klass)->class_inst->type_argv [0]; + if (!mono_type_is_primitive (arg_type)) + return FALSE; + MonoTypeEnum atype = arg_type->type; + if (atype == MONO_TYPE_BOOLEAN) + return FALSE; + + int scalar_arg = -1; + for (int i = 0; i < csignature->param_count; i++) { + if (csignature->params [i]->type != MONO_TYPE_GENERICINST) + scalar_arg = i; + } + + switch (id) { + case SN_Splat: { + simd_opcode = MINT_SIMD_INTRINS_P_P; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SPLAT, FALSE); + break; + } + case SN_Swizzle: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_SWIZZLE; + break; + } + case SN_Add: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = 
map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_ADD, FALSE); + break; + } + case SN_Subtract: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SUBTRACT, FALSE); + break; + } + case SN_Multiply: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_MULTIPLY, FALSE); + break; + } + case SN_Dot: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I32X4_DOT_I16X8; + break; + } + case SN_Negate: { + simd_opcode = MINT_SIMD_INTRINS_P_P; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_NEGATE, FALSE); + break; + } + case SN_ShiftLeft: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTLEFT, FALSE); + break; + } + case SN_ShiftRightArithmetic: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTARITHMETIC, FALSE); + break; + } + case SN_ShiftRightLogical: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTLOGICAL, FALSE); + break; + } + case SN_And: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_AND; + break; + } + case SN_Bitmask: { + simd_opcode = MINT_SIMD_INTRINS_P_P; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_BITMASK, FALSE); + break; + } + case SN_CompareEqual: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, TRUE); + break; + } + case SN_CompareNotEqual: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + simd_intrins = 
map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPARENOTEQUAL, TRUE); + break; + } + case SN_ConvertNarrowingSignedSaturate: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_I1) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_S; + else if (atype == MONO_TYPE_I2) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_S; + break; + } + case SN_ConvertNarrowingUnsignedSaturate: { + simd_opcode = MINT_SIMD_INTRINS_P_PP; + if (atype == MONO_TYPE_U1) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_U; + else if (atype == MONO_TYPE_U2) + simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_U; + break; + } + default: + return FALSE; + } + + if (simd_opcode == -1 || simd_intrins == -1) { + return FALSE; + } + + interp_add_ins (td, simd_opcode); + td->last_ins->data [0] = simd_intrins; +#else // HOST_BROWSER + return FALSE; +#endif // HOST_BROWSER + +opcode_added: + td->sp -= csignature->param_count; + for (int i = 0; i < csignature->param_count; i++) + td->last_ins->sregs [i] = td->sp [i].local; + + g_assert (csignature->ret->type != MONO_TYPE_VOID); + int ret_mt = mint_type (csignature->ret); + if (ret_mt == MINT_TYPE_VT) { + // For these intrinsics, if we return a VT then it is a V128 + push_type_vt (td, vector_klass, vector_size); + } else { + push_simple_type (td, stack_type [ret_mt]); + } + interp_ins_set_dreg (td->last_ins, td->sp [-1].local); + td->ip += 5; + return TRUE; +} + static gboolean interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature) { @@ -466,6 +710,16 @@ interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodS return emit_sri_vector128 (td, cmethod, csignature); else if (!strcmp (class_name, "Vector128`1")) return emit_sri_vector128_t (td, cmethod, csignature); + } else if (!strcmp (class_ns, "System.Runtime.Intrinsics.Wasm")) { + if (!strcmp (class_name, "PackedSimd")) { + gboolean res = 
emit_sri_packedsimd (td, cmethod, csignature); +#if HOST_BROWSER + if (!res) + g_print ("MONO interpreter: Unsupported method: System.Runtime.Intrinsics.Wasm.PackedSimd.%s\n", cmethod->name); + g_assert (res); +#endif + return res; + } } return FALSE; } diff --git a/src/mono/mono/utils/options-def.h b/src/mono/mono/utils/options-def.h index 1a6f58a9fbda2..29cf845ca1b44 100644 --- a/src/mono/mono/utils/options-def.h +++ b/src/mono/mono/utils/options-def.h @@ -110,6 +110,8 @@ DEFINE_BOOL(jiterpreter_use_constants, "jiterpreter-use-constants", FALSE, "Use DEFINE_BOOL(jiterpreter_eliminate_null_checks, "jiterpreter-eliminate-null-checks", TRUE, "Attempt to eliminate redundant null checks in traces") // enables performing backward branches without exiting traces DEFINE_BOOL(jiterpreter_backward_branches_enabled, "jiterpreter-backward-branches-enabled", TRUE, "Enable performing backward branches without exiting traces") +// Attempt to use WASM v128 opcodes to implement SIMD interpreter opcodes +DEFINE_BOOL(jiterpreter_enable_simd, "jiterpreter-simd-enabled", TRUE, "Attempt to use WebAssembly SIMD support") // When compiling a jit_call wrapper, bypass sharedvt wrappers if possible by inlining their // logic into the compiled wrapper and calling the target AOTed function with native call convention DEFINE_BOOL(jiterpreter_direct_jit_call, "jiterpreter-direct-jit-calls", TRUE, "Bypass gsharedvt wrappers when compiling JIT call wrappers") diff --git a/src/mono/sample/wasm/browser-bench/Vector.cs b/src/mono/sample/wasm/browser-bench/Vector.cs index 343332783555f..cb04d361fd4aa 100644 --- a/src/mono/sample/wasm/browser-bench/Vector.cs +++ b/src/mono/sample/wasm/browser-bench/Vector.cs @@ -15,6 +15,8 @@ public VectorTask() { measurements = new Measurement[] { new Create(), + new PackConstant(), + new Pack(), new Add(), new Multiply(), new DotInt(), @@ -56,6 +58,25 @@ class Create : VectorMeasurement public override void RunStep() => vector = Vector128.Create(0x123456); } + 
class PackConstant : VectorMeasurement + { + Vector128 vector; + + public override string Name => "Pack Vector128 (Constant)"; + + public override void RunStep() => vector = Vector128.Create(1, 2, 3, 4); + } + + class Pack : VectorMeasurement + { + Vector128 vector; + int a = 1, b = 2, c = 3, d = 4; + + public override string Name => "Pack Vector128"; + + public override void RunStep() => vector = Vector128.Create(a, b, c, d); + } + class Add : VectorMeasurement { Vector128 vector1, vector2, vector3; diff --git a/src/mono/wasm/runtime/CMakeLists.txt b/src/mono/wasm/runtime/CMakeLists.txt index 4d3781bb924f6..6e2ac177da001 100644 --- a/src/mono/wasm/runtime/CMakeLists.txt +++ b/src/mono/wasm/runtime/CMakeLists.txt @@ -39,6 +39,7 @@ set_target_properties(dotnet PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${NATIVE_BIN_DIR}") set(ignoreMeWasmOptFlags "${CONFIGURATION_WASM_OPT_FLAGS}") +set(ignoreMeWasmOptAdditionalFlags "${WASM_OPT_ADDITIONAL_FLAGS}") if(CMAKE_BUILD_TYPE STREQUAL "Release") add_custom_command(TARGET dotnet diff --git a/src/mono/wasm/runtime/cwraps.ts b/src/mono/wasm/runtime/cwraps.ts index 6c6912fd6b6c9..3b0a98261edf4 100644 --- a/src/mono/wasm/runtime/cwraps.ts +++ b/src/mono/wasm/runtime/cwraps.ts @@ -125,6 +125,8 @@ const fn_signatures: SigLine[] = [ [true, "mono_jiterp_boost_back_branch_target", "void", ["number"]], [true, "mono_jiterp_is_imethod_var_address_taken", "number", ["number", "number"]], [true, "mono_jiterp_get_opcode_value_table_entry", "number", ["number"]], + [true, "mono_jiterp_get_simd_intrinsic", "number", ["number", "number"]], + [true, "mono_jiterp_get_simd_opcode", "number", ["number", "number"]], ...legacy_interop_cwraps ]; @@ -246,6 +248,8 @@ export interface t_Cwraps { mono_jiterp_boost_back_branch_target(destination: number): void; mono_jiterp_is_imethod_var_address_taken(imethod: VoidPtr, offsetBytes: number): number; mono_jiterp_get_opcode_value_table_entry(opcode: number): number; + mono_jiterp_get_simd_intrinsic(arity: number, 
index: number): VoidPtr; + mono_jiterp_get_simd_opcode(arity: number, index: number): number; } const wrapped_c_functions: t_Cwraps = {}; diff --git a/src/mono/wasm/runtime/genmintops.py b/src/mono/wasm/runtime/genmintops.py index 510b1db22d32f..de7f6e53ac9ab 100755 --- a/src/mono/wasm/runtime/genmintops.py +++ b/src/mono/wasm/runtime/genmintops.py @@ -8,20 +8,25 @@ import os import re -if len (sys.argv) != 3: - print ("Usage: genmintops.py ") +if len (sys.argv) != 4: + print ("Usage: genmintops.py ") exit (1) src_header_path = sys.argv [1] -output_ts_path = sys.argv [2] +simd_header_path = sys.argv [2] +output_ts_path = sys.argv [3] src = open(src_header_path, 'r') +simd_src = open(simd_header_path, 'r') tab = " " header_lines = src.read().splitlines() +# strip preprocessing directives +simd_header_lines = (l for l in simd_src.read().splitlines() if not l.startswith("#")) # strip preprocessing directives and add indentation for tslint/eslint header = "\n".join((tab + l) for l in header_lines if not l.startswith("#")) src.close() +simd_src.close() opdef_regex = r'\s(IR)?OPDEF\((\w+),\s*(.+?),\s*(MintOp\w+)\)' enum_values = re.sub( @@ -31,11 +36,36 @@ opdef_regex, lambda m : f"[MintOpcode.{m.group(2)}]: [{m.group(3)}, MintOpArgType.{m.group(4)}],", header ) +simd_values_1 = [] +simd_values_2 = [] +simd_values_3 = [] +simd_disp = { + "INTERP_SIMD_INTRINSIC_P_P": simd_values_1, + "INTERP_SIMD_INTRINSIC_P_PP": simd_values_2, + "INTERP_SIMD_INTRINSIC_P_PPP": simd_values_3, + "INTERP_WASM_SIMD_INTRINSIC_V_P": simd_values_1, + "INTERP_WASM_SIMD_INTRINSIC_V_V": simd_values_1, + "INTERP_WASM_SIMD_INTRINSIC_I_V": simd_values_1, + "INTERP_WASM_SIMD_INTRINSIC_V_VV": simd_values_2, + "INTERP_WASM_SIMD_INTRINSIC_V_VI": simd_values_2, + "INTERP_WASM_SIMD_INTRINSIC_V_VVV": simd_values_3, +} + +for line in simd_header_lines: + idx1 = line.index("(") if "(" in line else None + idx2 = line.index(",") if "," in line else None + if (idx1 and idx2): + key = line[0:idx1].strip() + 
simd_disp[key].append(line[(idx1 + 1):idx2].strip().replace("INTERP_SIMD_INTRINSIC_", "")) + +splitter = ",\n " +splitter2 = ",\n " + generated = f""" // Generated by genmintops.py from mintops.def. // Do not manually edit this file. -import {{ OpcodeInfoTable, MintOpArgType }} from "./jiterpreter-opcodes"; +import {{ OpcodeInfoTable, MintOpArgType, SimdInfoTable }} from "./jiterpreter-opcodes"; export const enum MintOpcode {{ {enum_values} @@ -46,6 +76,30 @@ export const OpcodeInfo : OpcodeInfoTable = {{ {metadata_table} }}; + +export const enum SimdIntrinsic2 {{ + {splitter.join(simd_values_1)} +}} + +export const enum SimdIntrinsic3 {{ + {splitter.join(simd_values_2)} +}} + +export const enum SimdIntrinsic4 {{ + {splitter.join(simd_values_3)} +}} + +export const SimdInfo : SimdInfoTable = {{ + 2: [ + {splitter2.join(repr(x) for x in simd_values_1)} + ], + 3: [ + {splitter2.join(repr(x) for x in simd_values_2)} + ], + 4: [ + {splitter2.join(repr(x) for x in simd_values_3)} + ], +}}; """ os.makedirs(os.path.dirname(output_ts_path), exist_ok=True) diff --git a/src/mono/wasm/runtime/jiterpreter-interp-entry.ts b/src/mono/wasm/runtime/jiterpreter-interp-entry.ts index aad65e7ad4841..14651ad27e970 100644 --- a/src/mono/wasm/runtime/jiterpreter-interp-entry.ts +++ b/src/mono/wasm/runtime/jiterpreter-interp-entry.ts @@ -289,7 +289,7 @@ function flush_wasm_entry_trampoline_jit_queue() { // Emit function imports for (let i = 0; i < trampImports.length; i++) { mono_assert(trampImports[i], () => `trace #${i} missing`); - builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, false, trampImports[i][2]); + builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, trampImports[i][2]); } builder._generateImportSection(); @@ -343,12 +343,9 @@ function flush_wasm_entry_trampoline_jit_queue() { console.log(`jit queue generated ${buffer.length} byte(s) of wasm`); counters.bytesGenerated += buffer.length; const traceModule = new 
WebAssembly.Module(buffer); + const wasmImports = builder.getWasmImports(); - const traceInstance = new WebAssembly.Instance(traceModule, { - i: builder.getImportedFunctionTable(), - c: builder.getConstants(), - m: { h: (Module).asm.memory }, - }); + const traceInstance = new WebAssembly.Instance(traceModule, wasmImports); // Now that we've jitted the trampolines, go through and fix up the function pointers // to point to the new jitted trampolines instead of the default implementations diff --git a/src/mono/wasm/runtime/jiterpreter-jit-call.ts b/src/mono/wasm/runtime/jiterpreter-jit-call.ts index 3f016adc00510..b1bf6a52a75a5 100644 --- a/src/mono/wasm/runtime/jiterpreter-jit-call.ts +++ b/src/mono/wasm/runtime/jiterpreter-jit-call.ts @@ -11,7 +11,7 @@ import { WasmOpcode } from "./jiterpreter-opcodes"; import { WasmValtype, WasmBuilder, addWasmFunctionPointer as addWasmFunctionPointer, _now, elapsedTimes, counters, getWasmFunctionTable, applyOptions, - recordFailure, getOptions + recordFailure, getOptions, bytesFromHex } from "./jiterpreter-support"; import cwraps from "./cwraps"; @@ -157,7 +157,7 @@ class TrampolineInfo { } // this is cached replacements for Module.getWasmTableEntry(); -// we could add and +// we could add and // if we need to export the original function getWasmTableEntry(index: number) { let result = fnCache[index]; @@ -236,9 +236,7 @@ function getIsWasmEhSupported(): boolean { // Probe whether the current environment can handle wasm exceptions try { // Load and compile the wasm version of do_jit_call_indirect. 
This serves as a way to probe for wasm EH - const bytes = new Uint8Array(doJitCall16.length / 2); - for (let i = 0; i < doJitCall16.length; i += 2) - bytes[i / 2] = parseInt(doJitCall16.substring(i, i + 2), 16); + const bytes = bytesFromHex(doJitCall16); counters.bytesGenerated += bytes.length; doJitCallModule = new WebAssembly.Module(bytes); @@ -396,7 +394,7 @@ export function mono_interp_flush_jitcall_queue(): void { // Emit function imports for (let i = 0; i < trampImports.length; i++) - builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, false, trampImports[i][2]); + builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, trampImports[i][2]); builder._generateImportSection(); // Function section @@ -444,12 +442,9 @@ export function mono_interp_flush_jitcall_queue(): void { console.log(`do_jit_call queue flush generated ${buffer.length} byte(s) of wasm`); counters.bytesGenerated += buffer.length; const traceModule = new WebAssembly.Module(buffer); + const wasmImports = builder.getWasmImports(); - const traceInstance = new WebAssembly.Instance(traceModule, { - i: builder.getImportedFunctionTable(), - c: builder.getConstants(), - m: { h: (Module).asm.memory } - }); + const traceInstance = new WebAssembly.Instance(traceModule, wasmImports); for (let i = 0; i < jitQueue.length; i++) { const info = jitQueue[i]; diff --git a/src/mono/wasm/runtime/jiterpreter-opcodes.ts b/src/mono/wasm/runtime/jiterpreter-opcodes.ts index ccefa7b41327c..3d347bdea7452 100644 --- a/src/mono/wasm/runtime/jiterpreter-opcodes.ts +++ b/src/mono/wasm/runtime/jiterpreter-opcodes.ts @@ -30,6 +30,12 @@ export type OpcodeInfoTable = { [key: number]: [name: string, length_u16: number, dregs: number, sregs: number, optype: MintOpArgType]; } +export type SimdInfoSubtable = Array + +export type SimdInfoTable = { + [argument_count: number] : SimdInfoSubtable +} + // Keep this in sync with the wasm spec (but I don't think any changes will impact 
it), // Note that prefix opcodes aren't in this enum, since making them write properly is awkward. @@ -229,5 +235,245 @@ export const enum WasmOpcode { i64_extend_32_s, PREFIX_sat = 0xfc, + PREFIX_simd = 0xfd, PREFIX_atomic = 0xfe } + +export const enum WasmSimdOpcode { + v128_load = 0x00, + v128_load8x8_s = 0x01, + v128_load8x8_u = 0x02, + v128_load16x4_s = 0x03, + v128_load16x4_u = 0x04, + v128_load32x2_s = 0x05, + v128_load32x2_u = 0x06, + v128_load8_splat = 0x07, + v128_load16_splat = 0x08, + v128_load32_splat = 0x09, + v128_load64_splat = 0x0a, + v128_store = 0x0b, + v128_const = 0x0c, + i8x16_shuffle = 0x0d, + i8x16_swizzle = 0x0e, + i8x16_splat = 0x0f, + i16x8_splat = 0x10, + i32x4_splat = 0x11, + i64x2_splat = 0x12, + f32x4_splat = 0x13, + f64x2_splat = 0x14, + i8x16_extract_lane_s = 0x15, + i8x16_extract_lane_u = 0x16, + i8x16_replace_lane = 0x17, + i16x8_extract_lane_s = 0x18, + i16x8_extract_lane_u = 0x19, + i16x8_replace_lane = 0x1a, + i32x4_extract_lane = 0x1b, + i32x4_replace_lane = 0x1c, + i64x2_extract_lane = 0x1d, + i64x2_replace_lane = 0x1e, + f32x4_extract_lane = 0x1f, + f32x4_replace_lane = 0x20, + f64x2_extract_lane = 0x21, + f64x2_replace_lane = 0x22, + i8x16_eq = 0x23, + i8x16_ne = 0x24, + i8x16_lt_s = 0x25, + i8x16_lt_u = 0x26, + i8x16_gt_s = 0x27, + i8x16_gt_u = 0x28, + i8x16_le_s = 0x29, + i8x16_le_u = 0x2a, + i8x16_ge_s = 0x2b, + i8x16_ge_u = 0x2c, + i16x8_eq = 0x2d, + i16x8_ne = 0x2e, + i16x8_lt_s = 0x2f, + i16x8_lt_u = 0x30, + i16x8_gt_s = 0x31, + i16x8_gt_u = 0x32, + i16x8_le_s = 0x33, + i16x8_le_u = 0x34, + i16x8_ge_s = 0x35, + i16x8_ge_u = 0x36, + i32x4_eq = 0x37, + i32x4_ne = 0x38, + i32x4_lt_s = 0x39, + i32x4_lt_u = 0x3a, + i32x4_gt_s = 0x3b, + i32x4_gt_u = 0x3c, + i32x4_le_s = 0x3d, + i32x4_le_u = 0x3e, + i32x4_ge_s = 0x3f, + i32x4_ge_u = 0x40, + f32x4_eq = 0x41, + f32x4_ne = 0x42, + f32x4_lt = 0x43, + f32x4_gt = 0x44, + f32x4_le = 0x45, + f32x4_ge = 0x46, + f64x2_eq = 0x47, + f64x2_ne = 0x48, + f64x2_lt = 0x49, + f64x2_gt = 0x4a, 
+ f64x2_le = 0x4b, + f64x2_ge = 0x4c, + v128_not = 0x4d, + v128_and = 0x4e, + v128_andnot = 0x4f, + v128_or = 0x50, + v128_xor = 0x51, + v128_bitselect = 0x52, + i8x16_abs = 0x60, + i8x16_neg = 0x61, + i8x16_all_true = 0x63, + i8x16_bitmask = 0x64, + i8x16_narrow_i16x8_s = 0x65, + i8x16_narrow_i16x8_u = 0x66, + i8x16_shl = 0x6b, + i8x16_shr_s = 0x6c, + i8x16_shr_u = 0x6d, + i8x16_add = 0x6e, + i8x16_add_sat_s = 0x6f, + i8x16_add_sat_u = 0x70, + i8x16_sub = 0x71, + i8x16_sub_sat_s = 0x72, + i8x16_sub_sat_u = 0x73, + i8x16_min_s = 0x76, + i8x16_min_u = 0x77, + i8x16_max_s = 0x78, + i8x16_max_u = 0x79, + i8x16_avgr_u = 0x7b, + i16x8_abs = 0x80, + i16x8_neg = 0x81, + i16x8_all_true = 0x83, + i16x8_bitmask = 0x84, + i16x8_narrow_i32x4_s = 0x85, + i16x8_narrow_i32x4_u = 0x86, + i16x8_extend_low_i8x16_s = 0x87, + i16x8_extend_high_i8x16_s = 0x88, + i16x8_extend_low_i8x16_u = 0x89, + i16x8_extend_high_i8x16_u = 0x8a, + i16x8_shl = 0x8b, + i16x8_shr_s = 0x8c, + i16x8_shr_u = 0x8d, + i16x8_add = 0x8e, + i16x8_add_sat_s = 0x8f, + i16x8_add_sat_u = 0x90, + i16x8_sub = 0x91, + i16x8_sub_sat_s = 0x92, + i16x8_sub_sat_u = 0x93, + i16x8_mul = 0x95, + i16x8_min_s = 0x96, + i16x8_min_u = 0x97, + i16x8_max_s = 0x98, + i16x8_max_u = 0x99, + i16x8_avgr_u = 0x9b, + i32x4_abs = 0xa0, + i32x4_neg = 0xa1, + i32x4_all_true = 0xa3, + i32x4_bitmask = 0xa4, + i32x4_extend_low_i16x8_s = 0xa7, + i32x4_extend_high_i16x8_s = 0xa8, + i32x4_extend_low_i16x8_u = 0xa9, + i32x4_extend_high_i16x8_u = 0xaa, + i32x4_shl = 0xab, + i32x4_shr_s = 0xac, + i32x4_shr_u = 0xad, + i32x4_add = 0xae, + i32x4_sub = 0xb1, + i32x4_mul = 0xb5, + i32x4_min_s = 0xb6, + i32x4_min_u = 0xb7, + i32x4_max_s = 0xb8, + i32x4_max_u = 0xb9, + i32x4_dot_i16x8_s = 0xba, + i64x2_abs = 0xc0, + i64x2_neg = 0xc1, + i64x2_bitmask = 0xc4, + i64x2_extend_low_i32x4_s = 0xc7, + i64x2_extend_high_i32x4_s = 0xc8, + i64x2_extend_low_i32x4_u = 0xc9, + i64x2_extend_high_i32x4_u = 0xca, + i64x2_shl = 0xcb, + i64x2_shr_s = 0xcc, + i64x2_shr_u = 
0xcd, + i64x2_add = 0xce, + i64x2_sub = 0xd1, + i64x2_mul = 0xd5, + f32x4_ceil = 0x67, + f32x4_floor = 0x68, + f32x4_trunc = 0x69, + f32x4_nearest = 0x6a, + f64x2_ceil = 0x74, + f64x2_floor = 0x75, + f64x2_trunc = 0x7a, + f64x2_nearest = 0x94, + f32x4_abs = 0xe0, + f32x4_neg = 0xe1, + f32x4_sqrt = 0xe3, + f32x4_add = 0xe4, + f32x4_sub = 0xe5, + f32x4_mul = 0xe6, + f32x4_div = 0xe7, + f32x4_min = 0xe8, + f32x4_max = 0xe9, + f32x4_pmin = 0xea, + f32x4_pmax = 0xeb, + f64x2_abs = 0xec, + f64x2_neg = 0xed, + f64x2_sqrt = 0xef, + f64x2_add = 0xf0, + f64x2_sub = 0xf1, + f64x2_mul = 0xf2, + f64x2_div = 0xf3, + f64x2_min = 0xf4, + f64x2_max = 0xf5, + f64x2_pmin = 0xf6, + f64x2_pmax = 0xf7, + i32x4_trunc_sat_f32x4_s = 0xf8, + i32x4_trunc_sat_f32x4_u = 0xf9, + f32x4_convert_i32x4_s = 0xfa, + f32x4_convert_i32x4_u = 0xfb, + v128_load32_zero = 0x5c, + v128_load64_zero = 0x5d, + i16x8_extmul_low_i8x16_s = 0x9c, + i16x8_extmul_high_i8x16_s = 0x9d, + i16x8_extmul_low_i8x16_u = 0x9e, + i16x8_extmul_high_i8x16_u = 0x9f, + i32x4_extmul_low_i16x8_s = 0xbc, + i32x4_extmul_high_i16x8_s = 0xbd, + i32x4_extmul_low_i16x8_u = 0xbe, + i32x4_extmul_high_i16x8_u = 0xbf, + i64x2_extmul_low_i32x4_s = 0xdc, + i64x2_extmul_high_i32x4_s = 0xdd, + i64x2_extmul_low_i32x4_u = 0xde, + i64x2_extmul_high_i32x4_u = 0xdf, + i16x8_q15mulr_sat_s = 0x82, + v128_any_true = 0x53, + v128_load8_lane = 0x54, + v128_load16_lane = 0x55, + v128_load32_lane = 0x56, + v128_load64_lane = 0x57, + v128_store8_lane = 0x58, + v128_store16_lane = 0x59, + v128_store32_lane = 0x5a, + v128_store64_lane = 0x5b, + i64x2_eq = 0xd6, + i64x2_ne = 0xd7, + i64x2_lt_s = 0xd8, + i64x2_gt_s = 0xd9, + i64x2_le_s = 0xda, + i64x2_ge_s = 0xdb, + i64x2_all_true = 0xc3, + f64x2_convert_low_i32x4_s = 0xfe, + f64x2_convert_low_i32x4_u = 0xff, + i32x4_trunc_sat_f64x2_s_zero = 0xfc, + i32x4_trunc_sat_f64x2_u_zero = 0xfd, + f32x4_demote_f64x2_zero = 0x5e, + f64x2_promote_low_f32x4 = 0x5f, + i8x16_popcnt = 0x62, + i16x8_extadd_pairwise_i8x16_s = 
0x7c, + i16x8_extadd_pairwise_i8x16_u = 0x7d, + i32x4_extadd_pairwise_i16x8_s = 0x7e, + i32x4_extadd_pairwise_i16x8_u = 0x7f, +} diff --git a/src/mono/wasm/runtime/jiterpreter-support.ts b/src/mono/wasm/runtime/jiterpreter-support.ts index 010e67496b19b..306757573a59e 100644 --- a/src/mono/wasm/runtime/jiterpreter-support.ts +++ b/src/mono/wasm/runtime/jiterpreter-support.ts @@ -4,7 +4,7 @@ import { mono_assert } from "./types"; import { NativePointer, ManagedPointer, VoidPtr } from "./types/emscripten"; import { Module, runtimeHelpers } from "./globals"; -import { WasmOpcode } from "./jiterpreter-opcodes"; +import { WasmOpcode, WasmSimdOpcode } from "./jiterpreter-opcodes"; import { MintOpcode } from "./mintops"; import cwraps from "./cwraps"; @@ -118,7 +118,6 @@ type ImportedFunctionInfo = { typeIndex: number; module: string; name: string; - assumeUsed: boolean; func: Function; } @@ -166,6 +165,7 @@ export class WasmBuilder { nextConstantSlot = 0; compressImportNames = false; + lockImports = false; constructor(constantSlotCount: number) { this.stack = [new BlobBuilder()]; @@ -178,6 +178,7 @@ export class WasmBuilder { this.stackSize = 1; this.inSection = false; this.inFunction = false; + this.lockImports = false; this.locals.clear(); this.functionTypeCount = this.permanentFunctionTypeCount; @@ -186,13 +187,12 @@ export class WasmBuilder { this.functionTypesByIndex = Object.create(this.permanentFunctionTypesByIndex); this.nextImportIndex = 0; - this.importedFunctionCount = this.permanentImportedFunctionCount; + this.importedFunctionCount = 0; this.importedFunctions = Object.create(this.permanentImportedFunctions); for (const k in this.importedFunctions) { const f = this.importedFunctions[k]; - if (!f.assumeUsed) - f.index = undefined; + f.index = undefined; } this.functions.length = 0; @@ -235,15 +235,45 @@ export class WasmBuilder { return current.getArrayView(false).slice(0, current.size); } + getWasmImports () : WebAssembly.Imports { + const result : any = { + 
c: this.getConstants(), + m: { h: (Module).asm.memory }, + f: { f: getWasmFunctionTable() }, + }; + + const importsToEmit = this.getImportsToEmit(); + + for (let i = 0; i < importsToEmit.length; i++) { + const ifi = importsToEmit[i]; + if (typeof (ifi.func) !== "function") + throw new Error(`Import '${ifi.name}' not found or not a function`); + + const mangledName = this.getCompressedName(ifi); + let subTable = result[ifi.module]; + if (!subTable) { + subTable = result[ifi.module] = {}; + } + subTable[mangledName] = ifi.func; + } + + return result; + } + // HACK: Approximate amount of space we need to generate the full module at present // FIXME: This does not take into account any other functions already generated if they weren't // emitted into the module immediately - get bytesGeneratedSoFar() { + get bytesGeneratedSoFar () { + const importSize = this.compressImportNames + // mod (2 bytes) name (2-3 bytes) type (1 byte) typeidx (1-2 bytes) + ? 8 + // we keep the uncompressed import names somewhat short, generally, so +12 bytes is about right + : 20; + return this.stack[0].size + // HACK: A random constant for section headers and padding 32 + - // mod (2 bytes) name (2-3 bytes) type (1 byte) typeidx (1-2 bytes) - (this.importedFunctionCount * 8) + + (this.importedFunctionCount * importSize) + // type index for each function (this.functions.length * 2) + // export entry for each export @@ -264,7 +294,13 @@ export class WasmBuilder { return this.current.appendU8(value); } - appendU32(value: number) { + appendSimd (value: WasmSimdOpcode) { + this.current.appendU8(WasmOpcode.PREFIX_simd); + // Yes that's right. We're using LEB128 to encode 8-bit opcodes. Why? 
I don't know + return this.current.appendULeb(value); + } + + appendU32 (value: number) { return this.current.appendU32(value); } @@ -424,8 +460,8 @@ export class WasmBuilder { return imports; } - getCompressedName(ifi: ImportedFunctionInfo) { - if (!this.compressImportNames || typeof (ifi.index) !== "number") + getCompressedName (ifi: ImportedFunctionInfo) { + if (!this.compressImportNames || typeof(ifi.index) !== "number") return ifi.name; let result = compressedNameCache[ifi.index!]; @@ -434,23 +470,31 @@ export class WasmBuilder { return result; } - _generateImportSection() { - const importsToEmit = []; + getImportsToEmit () { + const result = []; for (const k in this.importedFunctions) { - const f = this.importedFunctions[k]; - if (f.index !== undefined) - importsToEmit.push(f); + const v = this.importedFunctions[k]; + if (typeof (v.index) !== "number") + continue; + result.push(v); } - importsToEmit.sort((lhs, rhs) => lhs.index! - rhs.index!); + result.sort((lhs, rhs) => lhs.index! 
- rhs.index!); + // console.log("result=[" + result.map(f => `#${f.index} ${f.module}.${f.name}`) + "]"); + return result; + } + + _generateImportSection () { + const importsToEmit = this.getImportsToEmit(); + this.lockImports = true; // Import section this.beginSection(2); - this.appendULeb(1 + importsToEmit.length + this.constantSlots.length); + this.appendULeb(2 + importsToEmit.length + this.constantSlots.length); - // console.log(`referenced ${importsToEmit.length}/${allImports.length} import(s)`); + // console.log(`referenced ${importsToEmit.length} import(s)`); for (let i = 0; i < importsToEmit.length; i++) { const ifi = importsToEmit[i]; - // console.log(` #${ifi.index} ${ifi.module}.${ifi.name} = ${ifi.friendlyName}`); + // console.log(` #${ifi.index} ${ifi.module}.${ifi.name} = ${ifi.func}`); this.appendName(ifi.module); this.appendName(this.getCompressedName(ifi)); this.appendU8(0x0); // function @@ -472,14 +516,26 @@ export class WasmBuilder { this.appendU8(0x00); // Minimum size is in 64k pages, not bytes this.appendULeb(0x01); + + this.appendName("f"); + this.appendName("f"); + // tabletype + this.appendU8(0x01); + // funcref + this.appendU8(0x70); + // limits = { min=0x01, max=infinity } + this.appendU8(0x00); + this.appendULeb(0x01); } defineImportedFunction( module: string, name: string, functionTypeName: string, - assumeUsed: boolean, permanent: boolean, func: Function | number - ): ImportedFunctionInfo { - if (permanent && (this.importedFunctionCount > this.permanentImportedFunctionCount)) - throw new Error("New permanent imports cannot be defined after non-permanent ones"); + permanent: boolean, func: Function | number + ) : ImportedFunctionInfo { + if (this.lockImports) + throw new Error("Import section already generated"); + if (permanent && (this.importedFunctionCount > 0)) + throw new Error("New permanent imports cannot be defined after any indexes have been assigned"); const type = this.functionTypes[functionTypeName]; if (!type) throw new 
Error("No function type named " + functionTypeName); @@ -487,23 +543,15 @@ export class WasmBuilder { throw new Error("A permanent import must have a permanent function type"); const typeIndex = type[0]; const table = permanent ? this.permanentImportedFunctions : this.importedFunctions; - const index = assumeUsed - ? ( - permanent - ? this.permanentImportedFunctionCount++ - : this.importedFunctionCount++ - ) - : undefined; if (typeof (func) === "number") func = getWasmFunctionTable().get(func); if (typeof (func) !== "function") throw new Error(`Value passed for imported function ${name} was not a function or valid function pointer`); const result = table[name] = { - index, + index: undefined, typeIndex, module, name, - assumeUsed, func }; return result; @@ -581,11 +629,21 @@ export class WasmBuilder { this.endSection(); } - callImport(name: string) { + call_indirect (functionTypeName: string, tableIndex: number) { + const type = this.functionTypes[functionTypeName]; + if (!type) + throw new Error("No function type named " + functionTypeName); + const typeIndex = type[0]; + this.appendU8(WasmOpcode.call_indirect); + this.appendULeb(typeIndex); + this.appendULeb(tableIndex); + } + + callImport (name: string) { const func = this.importedFunctions[name]; if (!func) throw new Error("No imported function named " + name); - if (func.index === undefined) + if (typeof (func.index) !== "number") func.index = this.importedFunctionCount++; this.appendU8(WasmOpcode.call); this.appendULeb(func.index); @@ -1325,6 +1383,9 @@ export const elapsedTimes = { compilation: 0 }; +export const simdFallbackCounters : { [name: string] : number } = { +}; + export const counters = { traceCandidates: 0, tracesCompiled: 0, @@ -1336,6 +1397,7 @@ export const counters = { nullChecksEliminated: 0, backBranchesEmitted: 0, backBranchesNotEmitted: 0, + simdFallback: simdFallbackCounters, }; export const _now = (globalThis.performance && globalThis.performance.now) @@ -1636,6 +1698,13 @@ export 
function importDef(name: string, fn: Function): [string, string, Function return [name, name, fn]; } +export function bytesFromHex (hex: string) : Uint8Array { + const bytes = new Uint8Array(hex.length / 2); + for (let i = 0; i < hex.length; i += 2) + bytes[i / 2] = parseInt(hex.substring(i, i + 2), 16); + return bytes; +} + export type JiterpreterOptions = { enableAll?: boolean; enableTraces: boolean; @@ -1644,6 +1713,7 @@ export type JiterpreterOptions = { enableBackwardBranches: boolean; enableCallResume: boolean; enableWasmEh: boolean; + enableSimd: boolean; // For locations where the jiterpreter heuristic says we will be unable to generate // a trace, insert an entry point opcode anyway. This enables collecting accurate // stats for options like estimateHeat, but raises overhead. @@ -1685,6 +1755,7 @@ const optionNames: { [jsName: string]: string } = { "enableBackwardBranches": "jiterpreter-backward-branch-entries-enabled", "enableCallResume": "jiterpreter-call-resume-enabled", "enableWasmEh": "jiterpreter-wasm-eh-enabled", + "enableSimd": "jiterpreter-simd-enabled", "enableStats": "jiterpreter-stats-enabled", "disableHeuristic": "jiterpreter-disable-heuristic", "estimateHeat": "jiterpreter-estimate-heat", diff --git a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts index 0f80f0661ee97..3011f2548f7a1 100644 --- a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts @@ -2,24 +2,28 @@ // The .NET Foundation licenses this file to you under the MIT license. 
import { mono_assert, MonoMethod } from "./types"; +import { Module } from "./imports"; import { NativePointer } from "./types/emscripten"; import { getU16, getI16, getU32_unaligned, getI32_unaligned, getF32_unaligned, getF64_unaligned, } from "./memory"; -import { WasmOpcode } from "./jiterpreter-opcodes"; -import { MintOpcode, OpcodeInfo } from "./mintops"; +import { WasmOpcode, WasmSimdOpcode } from "./jiterpreter-opcodes"; +import { + MintOpcode, OpcodeInfo, SimdInfo, + SimdIntrinsic2, SimdIntrinsic3, SimdIntrinsic4 +} from "./mintops"; import cwraps from "./cwraps"; import { MintOpcodePtr, WasmValtype, WasmBuilder, append_memset_dest, append_bailout, append_exit, append_memmove_dest_src, try_append_memset_fast, - try_append_memmove_fast, counters, + try_append_memmove_fast, counters, bytesFromHex, getMemberOffset, JiterpMember, BailoutReason, getOpcodeTableValue } from "./jiterpreter-support"; import { - sizeOfDataItem, + sizeOfDataItem, sizeOfV128, sizeOfStackval, disabledOpcodes, countCallTargets, callTargetCounts, trapTraceErrors, @@ -158,7 +162,7 @@ export function generateWasmBody ( ) : number { const abort = 0; let isFirstInstruction = true, isConditionallyExecuted = false, - firstOpcodeInBlock = true; + firstOpcodeInBlock = true, containsSimd = false; let result = 0, prologueOpcodeCounter = 0, conditionalOpcodeCounter = 0; @@ -203,9 +207,20 @@ export function generateWasmBody ( let opcode = getU16(ip); const info = OpcodeInfo[opcode]; + const isSimdIntrins = (opcode >= MintOpcode.MINT_SIMD_INTRINS_P_P) && + (opcode <= MintOpcode.MINT_SIMD_INTRINS_P_PPP); + const simdIntrinsArgCount = isSimdIntrins + ? opcode - MintOpcode.MINT_SIMD_INTRINS_P_P + 2 + : 0; + const simdIntrinsIndex = isSimdIntrins + ? getArgU16(ip, 1 + simdIntrinsArgCount) + : 0; + mono_assert(info, () => `invalid opcode ${opcode}`); - const opname = info[0]; + const opname = isSimdIntrins + ? 
SimdInfo[simdIntrinsArgCount][simdIntrinsIndex] + : info[0]; const _ip = ip; const isBackBranchTarget = builder.options.noExitBackwardBranches && is_backward_branch_target(ip, startOfBody, backwardBranchTable), @@ -1293,6 +1308,14 @@ export function generateWasmBody ( append_exit(builder, ip, exitOpcodeCounter, BailoutReason.ComplexBranch); } else ip = abort; + } else if ( + (opcode >= MintOpcode.MINT_SIMD_V128_LDC) && + (opcode <= MintOpcode.MINT_SIMD_INTRINS_P_PPP) + ) { + if (!emit_simd(builder, ip, opcode, opname, simdIntrinsArgCount, simdIntrinsIndex)) + ip = abort; + else + containsSimd = true; } else if (opcodeValue === 0) { // This means it was explicitly marked as no-value in the opcode value table // so we can just skip over it. This is done for things like nops. @@ -1376,6 +1399,11 @@ export function generateWasmBody ( // console.log(`estimated size: ${builder.size + builder.cfg.overheadBytes + builder.bytesGeneratedSoFar}`); + // HACK: Traces containing simd will be *much* shorter than non-simd traces, + // which will cause both the heuristic and our length requirement outside + // to reject them. For now, just add a big constant to the length + if (containsSimd) + result += 10240; return result; } @@ -1404,12 +1432,16 @@ function append_branch_target_block (builder: WasmBuilder, ip: MintOpcodePtr, is builder.cfg.startBranchBlock(ip, isBackBranchTarget); } -function append_ldloc (builder: WasmBuilder, offset: number, opcode: WasmOpcode) { +function append_ldloc (builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) { builder.local("pLocals"); - builder.appendU8(opcode); + builder.appendU8(opcodeOrPrefix); + if (simdOpcode !== undefined) { + // This looks wrong but I assure you it's correct. 
+ builder.appendULeb(simdOpcode); + } // stackval is 8 bytes, but pLocals might not be 8 byte aligned so we use 4 // wasm spec prohibits alignment higher than natural alignment, just to be annoying - const alignment = (opcode > WasmOpcode.f64_load) ? 0 : 2; + const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_load) ? 0 : 2; builder.appendMemarg(offset, alignment); } @@ -1418,11 +1450,15 @@ function append_ldloc (builder: WasmBuilder, offset: number, opcode: WasmOpcode) // where the offset+alignment pair is referred to as a 'memarg' by the spec. // The actual store operation is equivalent to `pBase[offset] = value` (alignment has no // observable impact on behavior, other than causing compilation failures if out of range) -function append_stloc_tail (builder: WasmBuilder, offset: number, opcode: WasmOpcode) { - builder.appendU8(opcode); +function append_stloc_tail (builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) { + builder.appendU8(opcodeOrPrefix); + if (simdOpcode !== undefined) { + // This looks wrong but I assure you it's correct. + builder.appendULeb(simdOpcode); + } // stackval is 8 bytes, but pLocals might not be 8 byte aligned so we use 4 // wasm spec prohibits alignment higher than natural alignment, just to be annoying - const alignment = (opcode > WasmOpcode.f64_store) ? 0 : 2; + const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_store) ? 0 : 2; builder.appendMemarg(offset, alignment); invalidate_local(offset); } @@ -1759,13 +1795,13 @@ function emit_fieldop ( case MintOpcode.MINT_STFLD_O: { /* * Writing a ref-type field has to call an import to perform the write barrier anyway, - * and technically it should use a different kind of barrier from copy_pointer. So + * and technically it should use a different kind of barrier from copy_ptr. 
So * we define a special import that is responsible for performing the whole stfld_o * operation with as little trace-side overhead as possible * Previously the pseudocode looked like: * cknull_ptr = *(MonoObject *)&locals[objectOffset]; * if (!cknull_ptr) bailout; - * copy_pointer(cknull_ptr + fieldOffset, *(MonoObject *)&locals[localOffset]) + * copy_ptr(cknull_ptr + fieldOffset, *(MonoObject *)&locals[localOffset]) * The null check optimization also allows us to safely omit the bailout check * if we know that the target object isn't null. Even if the target object were * somehow null in this case (bad! shouldn't be possible!) it won't be a crash @@ -1938,7 +1974,7 @@ function emit_sfieldop ( // src append_ldloca(builder, localOffset, 0); // FIXME: Use mono_gc_wbarrier_set_field_internal - builder.callImport("copy_pointer"); + builder.callImport("copy_ptr"); return true; case MintOpcode.MINT_LDSFLD_VT: { const sizeBytes = getArgU16(ip, 4); @@ -2048,7 +2084,7 @@ const unopTable : { [opcode: number]: OpRec3 | undefined } = { [MintOpcode.MINT_POPCNT_I4]: [WasmOpcode.i32_popcnt, WasmOpcode.i32_load, WasmOpcode.i32_store], [MintOpcode.MINT_CLZ_I8]: [WasmOpcode.i64_clz, WasmOpcode.i64_load, WasmOpcode.i64_store], [MintOpcode.MINT_CTZ_I8]: [WasmOpcode.i64_ctz, WasmOpcode.i64_load, WasmOpcode.i64_store], - [MintOpcode.MINT_POPCNT_I8]: [WasmOpcode.i64_popcnt, WasmOpcode.i32_load, WasmOpcode.i32_store], + [MintOpcode.MINT_POPCNT_I8]: [WasmOpcode.i64_popcnt, WasmOpcode.i64_load, WasmOpcode.i64_store], }; // HACK: Generating correct wasm for these is non-trivial so we hand them off to C. 
@@ -3023,7 +3059,7 @@ function emit_indirectop (builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintO builder.local("cknull_ptr"); // Load address of value so that copy_managed_pointer can grab it append_ldloca(builder, valueVarIndex, 0); - builder.callImport("copy_pointer"); + builder.callImport("copy_ptr"); } else { // Pre-load address for the store operation builder.local("cknull_ptr"); @@ -3237,6 +3273,337 @@ function emit_arrayop (builder: WasmBuilder, frame: NativePointer, ip: MintOpcod return true; } +const vec128Test = + "0061736d0100000001040160000003020100070801047465737400000a090107004100fd111a0b"; +let wasmSimdSupported : boolean | undefined; + +function getIsWasmSimdSupported () : boolean { + if (wasmSimdSupported !== undefined) + return wasmSimdSupported; + + // Probe whether the current environment can handle wasm v128 opcodes. + try { + // Load and compile a test module that uses i32x4.splat. See wasm-simd-feature-detect.wat/wasm + const bytes = bytesFromHex(vec128Test); + counters.bytesGenerated += bytes.length; + new WebAssembly.Module(bytes); + wasmSimdSupported = true; + } catch (exc) { + console.log("MONO_WASM: Disabling WASM SIMD support due to JIT failure", exc); + wasmSimdSupported = false; + } + + return wasmSimdSupported; +} + +function get_import_name ( + builder: WasmBuilder, typeName: string, + functionPtr: number +) : string { + const name = `${typeName}_${functionPtr.toString(16)}`; + if (typeof (builder.importedFunctions[name]) !== "object") + builder.defineImportedFunction("s", name, typeName, false, functionPtr); + + return name; +} + +const simdCreateSizes = { + [MintOpcode.MINT_SIMD_V128_I1_CREATE]: 1, + [MintOpcode.MINT_SIMD_V128_I2_CREATE]: 2, + [MintOpcode.MINT_SIMD_V128_I4_CREATE]: 4, + [MintOpcode.MINT_SIMD_V128_I8_CREATE]: 8, +}; + +const simdCreateLoadOps = { + [MintOpcode.MINT_SIMD_V128_I1_CREATE]: WasmOpcode.i32_load8_s, + [MintOpcode.MINT_SIMD_V128_I2_CREATE]: WasmOpcode.i32_load16_s, + 
[MintOpcode.MINT_SIMD_V128_I4_CREATE]: WasmOpcode.i32_load, + [MintOpcode.MINT_SIMD_V128_I8_CREATE]: WasmOpcode.i64_load, +}; + +const simdCreateStoreOps = { + [MintOpcode.MINT_SIMD_V128_I1_CREATE]: WasmOpcode.i32_store8, + [MintOpcode.MINT_SIMD_V128_I2_CREATE]: WasmOpcode.i32_store16, + [MintOpcode.MINT_SIMD_V128_I4_CREATE]: WasmOpcode.i32_store, + [MintOpcode.MINT_SIMD_V128_I8_CREATE]: WasmOpcode.i64_store, +}; + +function emit_simd ( + builder: WasmBuilder, ip: MintOpcodePtr, + opcode: MintOpcode, opname: string, + argCount: number, index: number +) : boolean { + // First, if compiling an intrinsic attempt to emit the special vectorized implementation + // We only do this if SIMD is enabled since we'll be using the v128 opcodes. + if (builder.options.enableSimd && getIsWasmSimdSupported()) { + switch (argCount) { + case 2: + if (emit_simd_2(builder, ip, index)) + return true; + break; + case 3: + if (emit_simd_3(builder, ip, index)) + return true; + break; + case 4: + if (emit_simd_4(builder, ip, index)) + return true; + break; + } + } + + // Fall back to a mix of non-vectorized wasm and the interpreter's implementation of the opcodes + // The ideal way to call the intrinsic implementations would be statically via the import table, + // but we don't have a way to add entries to the import table at compile time yet. 
+ switch (opcode) { + case MintOpcode.MINT_SIMD_V128_LDC: { + if (builder.options.enableSimd && getIsWasmSimdSupported()) { + builder.local("pLocals"); + builder.appendSimd(WasmSimdOpcode.v128_const); + const view = Module.HEAPU8.slice(ip + 4, ip + 4 + sizeOfV128); + builder.appendBytes(view); + append_simd_store(builder, ip); + } else { + // dest + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src (ip + 2) + builder.ptr_const(ip + 4); + append_memmove_dest_src(builder, sizeOfV128); + } + return true; + } + case MintOpcode.MINT_SIMD_V128_I1_CREATE: + case MintOpcode.MINT_SIMD_V128_I2_CREATE: + case MintOpcode.MINT_SIMD_V128_I4_CREATE: + case MintOpcode.MINT_SIMD_V128_I8_CREATE: { + // These opcodes pack a series of locals into a vector + const elementSize = simdCreateSizes[opcode], + numElements = sizeOfV128 / elementSize, + destOffset = getArgU16(ip, 1), + srcOffset = getArgU16(ip, 2), + loadOp = simdCreateLoadOps[opcode], + storeOp = simdCreateStoreOps[opcode]; + for (let i = 0; i < numElements; i++) { + builder.local("pLocals"); + // load element from stack slot + append_ldloc(builder, srcOffset + (i * sizeOfStackval), loadOp); + // then store to destination element + append_stloc_tail(builder, destOffset + (i * elementSize), storeOp); + } + return true; + } + case MintOpcode.MINT_SIMD_INTRINS_P_P: { + counters.simdFallback[opname] = (counters.simdFallback[opname] || 0) + 1; + // res + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src + append_ldloca(builder, getArgU16(ip, 2), 0); + const importName = get_import_name(builder, "simd_p_p", cwraps.mono_jiterp_get_simd_intrinsic(1, index)); + builder.callImport(importName); + return true; + } + case MintOpcode.MINT_SIMD_INTRINS_P_PP: { + counters.simdFallback[opname] = (counters.simdFallback[opname] || 0) + 1; + // res + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src + append_ldloca(builder, getArgU16(ip, 2), 0); + append_ldloca(builder, getArgU16(ip, 3), 0); + const 
importName = get_import_name(builder, "simd_p_pp", cwraps.mono_jiterp_get_simd_intrinsic(2, index)); + builder.callImport(importName); + return true; + } + case MintOpcode.MINT_SIMD_INTRINS_P_PPP: { + counters.simdFallback[opname] = (counters.simdFallback[opname] || 0) + 1; + // res + append_ldloca(builder, getArgU16(ip, 1), sizeOfV128); + // src + append_ldloca(builder, getArgU16(ip, 2), 0); + append_ldloca(builder, getArgU16(ip, 3), 0); + append_ldloca(builder, getArgU16(ip, 4), 0); + const importName = get_import_name(builder, "simd_p_ppp", cwraps.mono_jiterp_get_simd_intrinsic(3, index)); + builder.callImport(importName); + return true; + } + default: + console.log(`emit_simd failed for ${opname}`); + return false; + } +} + +function append_simd_store (builder: WasmBuilder, ip: MintOpcodePtr) { + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_store); +} + +function append_simd_2_load (builder: WasmBuilder, ip: MintOpcodePtr, loadOp?: WasmSimdOpcode) { + builder.local("pLocals"); + // This || is harmless since v128_load is 0 + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, loadOp || WasmSimdOpcode.v128_load); +} + +function append_simd_3_load (builder: WasmBuilder, ip: MintOpcodePtr) { + builder.local("pLocals"); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); +} + +function append_simd_4_load (builder: WasmBuilder, ip: MintOpcodePtr) { + builder.local("pLocals"); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 4), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); +} + +const simdShiftTable = new Set([ + SimdIntrinsic3.V128_I1_LEFT_SHIFT, + SimdIntrinsic3.V128_I2_LEFT_SHIFT, + 
SimdIntrinsic3.V128_I4_LEFT_SHIFT, + SimdIntrinsic3.V128_I8_LEFT_SHIFT, + + SimdIntrinsic3.V128_I1_RIGHT_SHIFT, + SimdIntrinsic3.V128_I2_RIGHT_SHIFT, + SimdIntrinsic3.V128_I4_RIGHT_SHIFT, + + SimdIntrinsic3.V128_I1_URIGHT_SHIFT, + SimdIntrinsic3.V128_I2_URIGHT_SHIFT, + SimdIntrinsic3.V128_I4_URIGHT_SHIFT, + SimdIntrinsic3.V128_I8_URIGHT_SHIFT, +]); + +// eslint-disable-next-line @typescript-eslint/no-unused-vars +function append_stloc_simd_zero (builder: WasmBuilder, offset: number) { + builder.local("pLocals"); + builder.appendSimd(WasmSimdOpcode.v128_const); + builder.appendBytes(new Uint8Array(sizeOfV128)); + append_stloc_tail(builder, offset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_store); +} + +// FIXME: One of the custom implementations causes xharness to break + +function emit_simd_2 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic2) : boolean { + const simple = cwraps.mono_jiterp_get_simd_opcode(1, index); + if (simple) { + append_simd_2_load(builder, ip); + builder.appendSimd(simple); + append_simd_store(builder, ip); + return true; + } + + switch (index) { + case SimdIntrinsic2.V128_I1_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i32_load8_s); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store8); + return true; + case SimdIntrinsic2.V128_I2_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i32_load16_s); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store16); + return true; + case SimdIntrinsic2.V128_I4_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i32_load); + append_stloc_tail(builder, 
getArgU16(ip, 1), WasmOpcode.i32_store); + return true; + case SimdIntrinsic2.V128_I8_CREATE_SCALAR: + // Zero then write scalar component + builder.local("pLocals"); + append_stloc_simd_zero(builder, getArgU16(ip, 1)); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.i64_load); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i64_store); + return true; + + case SimdIntrinsic2.V128_I1_CREATE: + append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load8_splat); + append_simd_store(builder, ip); + return true; + case SimdIntrinsic2.V128_I2_CREATE: + append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load16_splat); + append_simd_store(builder, ip); + return true; + case SimdIntrinsic2.V128_I4_CREATE: + append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load32_splat); + append_simd_store(builder, ip); + return true; + case SimdIntrinsic2.V128_I8_CREATE: + append_simd_2_load(builder, ip, WasmSimdOpcode.v128_load64_splat); + append_simd_store(builder, ip); + return true; + + default: + return false; + } +} + +function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic3) : boolean { + const simple = cwraps.mono_jiterp_get_simd_opcode(2, index); + if (simple) { + const isShift = simdShiftTable.has(index); + if (isShift) { + builder.local("pLocals"); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.i32_load); + builder.appendSimd(simple); + append_simd_store(builder, ip); + } else { + append_simd_3_load(builder, ip); + builder.appendSimd(simple); + append_simd_store(builder, ip); + } + return true; + } + + switch (index) { + case SimdIntrinsic3.V128_BITWISE_EQUALITY: + case SimdIntrinsic3.V128_BITWISE_INEQUALITY: + append_simd_3_load(builder, ip); + // FIXME: i64x2_ne and i64x2_any_true? 
+ builder.appendSimd(WasmSimdOpcode.i64x2_eq); + builder.appendSimd(WasmSimdOpcode.i64x2_all_true); + if (index === SimdIntrinsic3.V128_BITWISE_INEQUALITY) + builder.appendU8(WasmOpcode.i32_eqz); + append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store); + return true; + default: + return false; + } + + return false; +} + +function emit_simd_4 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic4) : boolean { + const simple = cwraps.mono_jiterp_get_simd_opcode(3, index); + if (simple) { + append_simd_4_load(builder, ip); + builder.appendSimd(simple); + append_simd_store(builder, ip); + return true; + } + + return false; + + switch (index) { + case SimdIntrinsic4.V128_CONDITIONAL_SELECT: + builder.local("pLocals"); + // Wasm spec: result = ior𝑁(iand𝑁(𝑖1, 𝑖3), iand𝑁(𝑖2, inot𝑁(𝑖3))) + // Our opcode: *arg0 = (*arg2 & *arg1) | (*arg3 & ~*arg1) + append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 4), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + builder.appendSimd(WasmSimdOpcode.v128_bitselect); + append_simd_store(builder, ip); + return true; + default: + return false; + } +} + function append_safepoint (builder: WasmBuilder, ip: MintOpcodePtr) { // Check whether a safepoint is required builder.ptr_const(cwraps.mono_jiterp_get_polling_required_address()); diff --git a/src/mono/wasm/runtime/jiterpreter.ts b/src/mono/wasm/runtime/jiterpreter.ts index 1d355272ae85c..7c2e66bde25cb 100644 --- a/src/mono/wasm/runtime/jiterpreter.ts +++ b/src/mono/wasm/runtime/jiterpreter.ts @@ -12,7 +12,8 @@ import { MintOpcode, OpcodeInfo } from "./mintops"; import cwraps from "./cwraps"; import { MintOpcodePtr, WasmValtype, WasmBuilder, addWasmFunctionPointer, - _now, elapsedTimes, counters, getRawCwrap, importDef, + _now, elapsedTimes, + counters, getRawCwrap, importDef, JiterpreterOptions, 
getOptions, recordFailure, JiterpMember, getMemberOffset, BailoutReasonNames, BailoutReason @@ -138,6 +139,8 @@ export const traceInfo: { [key: string]: TraceInfo } = {}; export const sizeOfDataItem = 4, sizeOfObjectHeader = 8, + sizeOfV128 = 16, + sizeOfStackval = 8, // While stats are enabled, dump concise stats every N traces so that it's clear a long-running // task isn't frozen if it's jitting lots of traces autoDumpInterval = 500; @@ -261,7 +264,7 @@ function getTraceImports() { traceImports = [ importDef("bailout", recordBailout), - importDef("copy_pointer", getRawCwrap("mono_wasm_copy_managed_pointer")), + importDef("copy_ptr", getRawCwrap("mono_wasm_copy_managed_pointer")), importDef("entry", getRawCwrap("mono_jiterp_increase_entry_count")), importDef("value_copy", getRawCwrap("mono_jiterp_value_copy")), importDef("gettype", getRawCwrap("mono_jiterp_gettype_ref")), @@ -376,8 +379,7 @@ function initialize_builder(builder: WasmBuilder) { WasmValtype.i32, true ); builder.defineType( - "copy_pointer", - { + "copy_ptr", { "dest": WasmValtype.i32, "src": WasmValtype.i32 }, @@ -693,13 +695,34 @@ function initialize_builder(builder: WasmBuilder) { }, WasmValtype.i32, true ); + builder.defineType( + "simd_p_p", { + "arg0": WasmValtype.i32, + "arg1": WasmValtype.i32, + }, WasmValtype.void, true + ); + builder.defineType( + "simd_p_pp", { + "arg0": WasmValtype.i32, + "arg1": WasmValtype.i32, + "arg2": WasmValtype.i32, + }, WasmValtype.void, true + ); + builder.defineType( + "simd_p_ppp", { + "arg0": WasmValtype.i32, + "arg1": WasmValtype.i32, + "arg2": WasmValtype.i32, + "arg3": WasmValtype.i32, + }, WasmValtype.void, true + ); const traceImports = getTraceImports(); // Pre-define function imports as persistent for (let i = 0; i < traceImports.length; i++) { mono_assert(traceImports[i], () => `trace #${i} missing`); - builder.defineImportedFunction("i", traceImports[i][0], traceImports[i][1], false, true, traceImports[i][2]); + builder.defineImportedFunction("i", 
traceImports[i][0], traceImports[i][1], true, traceImports[i][2]); } } @@ -836,17 +859,15 @@ function generate_wasm( if (trace > 0) console.log(`${((builder.base)).toString(16)} ${methodFullName || traceName} generated ${buffer.length} byte(s) of wasm`); counters.bytesGenerated += buffer.length; + if (buffer.length >= maxModuleSize) { console.warn(`MONO_WASM: Jiterpreter generated too much code (${buffer.length} bytes) for trace ${traceName}. Please report this issue.`); return 0; } - const traceModule = new WebAssembly.Module(buffer); - const traceInstance = new WebAssembly.Instance(traceModule, { - i: builder.getImportedFunctionTable(), - c: builder.getConstants(), - m: { h: (Module).asm.memory }, - }); + const traceModule = new WebAssembly.Module(buffer); + const wasmImports = builder.getWasmImports(); + const traceInstance = new WebAssembly.Instance(traceModule, wasmImports); // Get the exported trace function const fn = traceInstance.exports[traceName]; @@ -907,7 +928,7 @@ function generate_wasm( console.log(builder.traceBuf[i]); } - console.log(`// MONO_WASM: ${methodFullName || methodName}:${traceOffset.toString(16)} generated, blob follows //`); + console.log(`// MONO_WASM: ${methodFullName || traceName} generated, blob follows //`); let s = "", j = 0; try { // We may have thrown an uncaught exception while inside a block, @@ -1194,7 +1215,10 @@ export function jiterpreter_dump_stats(b?: boolean, concise?: boolean) { console.log(`// ${keys[i]}: ${abortCounts[keys[i]]} abort(s)`); } - if ((typeof (globalThis.setTimeout) === "function") && (b !== undefined)) + for (const k in counters.simdFallback) + console.log(`// simd ${k}: ${counters.simdFallback[k]} fallback insn(s)`); + + if ((typeof(globalThis.setTimeout) === "function") && (b !== undefined)) setTimeout( () => jiterpreter_dump_stats(b), 15000 diff --git a/src/mono/wasm/runtime/wasm-simd-feature-detect.wasm b/src/mono/wasm/runtime/wasm-simd-feature-detect.wasm new file mode 100644 index 
0000000000000000000000000000000000000000..5d7c49d0bcbda0301cd143711052253b05534c49 GIT binary patch literal 39 ucmZQbEY4+QU|?WmVN76PU}j=uVCP_DDM>9ZVPN3mWMpS>WcVv6#SH*n1qBiS literal 0 HcmV?d00001 diff --git a/src/mono/wasm/runtime/wasm-simd-feature-detect.wat b/src/mono/wasm/runtime/wasm-simd-feature-detect.wat new file mode 100644 index 0000000000000..8cd56adf584e1 --- /dev/null +++ b/src/mono/wasm/runtime/wasm-simd-feature-detect.wat @@ -0,0 +1,6 @@ +(module + (func $test (export "test") + (i32x4.splat (i32.const 0)) + drop + ) +) diff --git a/src/mono/wasm/wasm.proj b/src/mono/wasm/wasm.proj index cb76cb0d19e02..5bb5fb38c0ec0 100644 --- a/src/mono/wasm/wasm.proj +++ b/src/mono/wasm/wasm.proj @@ -25,7 +25,7 @@ $([MSBuild]::NormalizeDirectory('$(PkgMicrosoft_NETCore_Runtime_ICU_Transport)', 'runtimes', 'browser-wasm', 'native', 'lib')) $([MSBuild]::NormalizeDirectory('$(PkgMicrosoft_NETCore_Runtime_ICU_Transport)', 'runtimes', 'browser-wasm-threads', 'native', 'lib')) - false + true true false emcc @@ -279,7 +279,7 @@ <_EmccLinkFlags Include="-s INITIAL_MEMORY=$(EmccInitialHeapSize)" /> <_EmccLinkFlags Include="-s STACK_SIZE=$(EmccStackSize)" /> - <_EmccCommonFlags Condition="'$(WasmEnableSIMD)' == 'true'" Include="-msimd128" /> + <_EmccCommonFlags Include="-msimd128" /> <_EmccCommonFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-s USE_PTHREADS=1" /> <_EmccLinkFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-Wno-pthreads-mem-growth" /> <_EmccLinkFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-s PTHREAD_POOL_SIZE=0" /> @@ -539,9 +539,9 @@ - + Date: Fri, 5 May 2023 17:26:29 -0700 Subject: [PATCH 2/4] Repair merge damage --- src/mono/wasm/runtime/jiterpreter-trace-generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts index 3011f2548f7a1..cceebbd61785c 100644 --- 
a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts @@ -2,7 +2,7 @@ // The .NET Foundation licenses this file to you under the MIT license. import { mono_assert, MonoMethod } from "./types"; -import { Module } from "./imports"; +import { Module } from "./globals"; import { NativePointer } from "./types/emscripten"; import { getU16, getI16, From 2ee9c69a4a94935bf7007765df57bb452d1a69f3 Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Fri, 5 May 2023 17:55:33 -0700 Subject: [PATCH 3/4] Cleanup --- src/mono/wasm/runtime/CMakeLists.txt | 1 + src/mono/wasm/runtime/jiterpreter-trace-generator.ts | 10 ++-------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/src/mono/wasm/runtime/CMakeLists.txt b/src/mono/wasm/runtime/CMakeLists.txt index 6e2ac177da001..1a39d1520c873 100644 --- a/src/mono/wasm/runtime/CMakeLists.txt +++ b/src/mono/wasm/runtime/CMakeLists.txt @@ -40,6 +40,7 @@ set_target_properties(dotnet PROPERTIES set(ignoreMeWasmOptFlags "${CONFIGURATION_WASM_OPT_FLAGS}") set(ignoreMeWasmOptAdditionalFlags "${WASM_OPT_ADDITIONAL_FLAGS}") +set(ignoreMeEmsdkPath "${EMSDK_PATH}") if(CMAKE_BUILD_TYPE STREQUAL "Release") add_custom_command(TARGET dotnet diff --git a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts index cceebbd61785c..cdce074e9845b 100644 --- a/src/mono/wasm/runtime/jiterpreter-trace-generator.ts +++ b/src/mono/wasm/runtime/jiterpreter-trace-generator.ts @@ -3353,8 +3353,6 @@ function emit_simd ( } // Fall back to a mix of non-vectorized wasm and the interpreter's implementation of the opcodes - // The ideal way to call the intrinsic implementations would be statically via the import table, - // but we don't have a way to add entries to the import table at compile time yet. 
switch (opcode) { case MintOpcode.MINT_SIMD_V128_LDC: { if (builder.options.enableSimd && getIsWasmSimdSupported()) { @@ -3426,7 +3424,7 @@ function emit_simd ( return true; } default: - console.log(`emit_simd failed for ${opname}`); + console.log(`MONO_WASM: jiterpreter emit_simd failed for ${opname}`); return false; } } @@ -3444,6 +3442,7 @@ function append_simd_2_load (builder: WasmBuilder, ip: MintOpcodePtr, loadOp?: W function append_simd_3_load (builder: WasmBuilder, ip: MintOpcodePtr) { builder.local("pLocals"); append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); + // FIXME: Can rhs be a scalar? We handle shifts separately already append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load); } @@ -3470,7 +3469,6 @@ const simdShiftTable = new Set([ SimdIntrinsic3.V128_I8_URIGHT_SHIFT, ]); -// eslint-disable-next-line @typescript-eslint/no-unused-vars function append_stloc_simd_zero (builder: WasmBuilder, offset: number) { builder.local("pLocals"); builder.appendSimd(WasmSimdOpcode.v128_const); @@ -3478,8 +3476,6 @@ function append_stloc_simd_zero (builder: WasmBuilder, offset: number) { append_stloc_tail(builder, offset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_store); } -// FIXME: One of the custom implementations causes xharness to break - function emit_simd_2 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic2) : boolean { const simple = cwraps.mono_jiterp_get_simd_opcode(1, index); if (simple) { @@ -3586,8 +3582,6 @@ function emit_simd_4 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrin return true; } - return false; - switch (index) { case SimdIntrinsic4.V128_CONDITIONAL_SELECT: builder.local("pLocals"); From 77f87b56b810390389072372f7b4f0e90360e8cc Mon Sep 17 00:00:00 2001 From: Katelyn Gadd Date: Fri, 5 May 2023 19:29:58 -0700 Subject: [PATCH 4/4] Guard interp simd support behind runtime options --- src/mono/mono/mini/interp/transform-simd.c | 4 ++-- 
src/mono/mono/utils/options-def.h | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/mono/mono/mini/interp/transform-simd.c b/src/mono/mono/mini/interp/transform-simd.c index c441753d10e34..bb7c2699ffbe6 100644 --- a/src/mono/mono/mini/interp/transform-simd.c +++ b/src/mono/mono/mini/interp/transform-simd.c @@ -705,12 +705,12 @@ interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodS class_ns = m_class_get_name_space (cmethod->klass); class_name = m_class_get_name (cmethod->klass); - if (!strcmp (class_ns, "System.Runtime.Intrinsics")) { + if (mono_opt_interp_simd_v128 && !strcmp (class_ns, "System.Runtime.Intrinsics")) { if (!strcmp (class_name, "Vector128")) return emit_sri_vector128 (td, cmethod, csignature); else if (!strcmp (class_name, "Vector128`1")) return emit_sri_vector128_t (td, cmethod, csignature); - } else if (!strcmp (class_ns, "System.Runtime.Intrinsics.Wasm")) { + } else if (mono_opt_interp_simd_packedsimd && !strcmp (class_ns, "System.Runtime.Intrinsics.Wasm")) { if (!strcmp (class_name, "PackedSimd")) { gboolean res = emit_sri_packedsimd (td, cmethod, csignature); #if HOST_BROWSER diff --git a/src/mono/mono/utils/options-def.h b/src/mono/mono/utils/options-def.h index 29cf845ca1b44..6d8715c2465ff 100644 --- a/src/mono/mono/utils/options-def.h +++ b/src/mono/mono/utils/options-def.h @@ -60,6 +60,12 @@ DEFINE_BOOL_READONLY(readonly_flag, "readonly-flag", FALSE, "Example") DEFINE_BOOL(wasm_exceptions, "wasm-exceptions", FALSE, "Enable codegen for WASM exceptions") DEFINE_BOOL(wasm_gc_safepoints, "wasm-gc-safepoints", FALSE, "Use GC safepoints on WASM") DEFINE_BOOL(aot_lazy_assembly_load, "aot-lazy-assembly-load", FALSE, "Load assemblies referenced by AOT images lazily") +#if HOST_BROWSER +DEFINE_BOOL(interp_simd_v128, "interp-simd-v128", FALSE, "Enable interpreter Vector128 support") +#else +DEFINE_BOOL(interp_simd_v128, "interp-simd-v128", TRUE, "Enable interpreter Vector128 support") +#endif 
+DEFINE_BOOL(interp_simd_packedsimd, "interp-simd-packedsimd", FALSE, "Enable interpreter WASM PackedSimd support") #if HOST_BROWSER