From 95f1c83ec0090115653727e94cfbb90c65ebb855 Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 12:54:39 +0200 Subject: [PATCH 01/15] [ADD] AMX implementation and Sapphire rapids config --- .../X86/Payload/AVX512_AMX_Payload.hpp | 58 ++ .../X86/Platform/SapphireRapidsConfig.hpp | 62 ++ .../X86/Payload/AVX512_AMX_Payload.cpp | 605 ++++++++++++++++++ 3 files changed, 725 insertions(+) create mode 100644 include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp create mode 100644 include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp create mode 100644 src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp diff --git a/include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp b/include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp new file mode 100644 index 00000000..08a789fd --- /dev/null +++ b/include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp @@ -0,0 +1,58 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include + +namespace firestarter::environment::x86::payload { +class AVX512_AMX_Payload final : public X86Payload { +public: + AVX512_AMX_Payload(asmjit::x86::Features const &supportedFeatures) + : X86Payload(supportedFeatures, {asmjit::x86::Features::Id::kAMX_BF16}, + "AVX512_AMX", 8, 32) {} + + int compilePayload( + std::vector> const &proportion, + unsigned instructionCacheSize, + std::list const &dataCacheBufferSize, unsigned ramBufferSize, + unsigned thread, unsigned numberOfLines, bool dumpRegisters, + bool errorDetection) override; + std::list getAvailableInstructions() const override; + void init(unsigned long long *memoryAddr, + unsigned long long bufferSize) override; + + firestarter::environment::payload::Payload *clone() const override { + return new AVX512_AMX_Payload(this->supportedFeatures()); + }; + +private: + const std::map instructionFlops = { + {"REG", 32}, {"L1_L", 32}, {"L1_BROADCAST", 16}, {"L1_S", 16}, + {"L1_LS", 16}, {"L2_L", 32}, {"L2_S", 16}, {"L2_LS", 16}, + {"L3_L", 32}, {"L3_S", 16}, {"L3_LS", 16}, {"L3_P", 16}, + {"RAM_L", 32}, {"RAM_S", 16}, {"RAM_LS", 16}, {"RAM_P", 16}, + {"AMX", 512}}; + + const std::map instructionMemory = { + {"RAM_L", 64}, {"RAM_S", 128}, {"RAM_LS", 128}, {"RAM_P", 64}}; +}; +} // namespace firestarter::environment::x86::payload diff --git a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp new file mode 100644 index 00000000..68dbb23a --- /dev/null +++ b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp @@ -0,0 +1,62 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#pragma once + +#include +#include + +namespace firestarter::environment::x86::platform { +class SapphireRapidsConfig final : public X86PlatformConfig { + +public: + SapphireRapidsConfig(asmjit::x86::Features const &supportedFeatures, + unsigned family, unsigned model, unsigned threads) + : X86PlatformConfig("SKL_XEONEP", 6, {85}, {1, 2}, 0, + {32768, 1048576, 1441792}, 1048576000, 1536, family, + model, threads, + new payload::AVX512_AMX_Payload(supportedFeatures)) {} + + std::vector> + getDefaultPayloadSettings() const override { +/* return std::vector>({{"RAM_S", 3}, + {"RAM_P", 1}, + {"L3_S", 1}, + {"L3_P", 1}, + {"L2_S", 4}, + {"L2_L", 70}, + {"L1_S", 0}, + {"L1_L", 40}, + {"REG", 140}, + {"AMX", 4}});*/ + return std::vector>({{"RAM_S", 0}, + {"RAM_P", 0}, + {"L3_S", 0}, + {"L3_P", 0}, + {"L2_S", 0}, + {"L2_L", 0}, + {"L1_S", 0}, + {"L1_L", 0}, + {"REG", 140}, + {"AMX", 0}}); + } +}; +} // namespace firestarter::environment::x86::platform diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp new file mode 100644 index 00000000..96027e25 --- /dev/null +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -0,0 +1,605 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2020 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ +#include /* Definition of ARCH_* constants */ +#include +#include +#include + +#define XFEATURE_XTILECFG 17 +#define XFEATURE_XTILEDATA 18 +#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG) +#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA) +#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA) + +#define ARCH_GET_XCOMP_PERM 0x1022 +#define ARCH_REQ_XCOMP_PERM 0x1023 + +#define MAX 1024 +#define MAX_ROWS 16 +#define MAX_COLS 64 +#define STRIDE 1 + +#define NUMBER_RANDOM 4 +#define MAX_BITS 7 +#define MIN_BITS 4 + +using namespace firestarter::environment::x86::payload; +using namespace asmjit; +using namespace asmjit::x86; + +// Define struct that is used as config and loaded through ldtilecfg() +typedef struct __tile_config +{ + uint8_t palette_id; + uint8_t start_row; + uint8_t reserved_0[14]; + uint16_t colsb[16]; + uint8_t rows[16]; +} __tilecfg; + +void create_AMX_config(__tilecfg *tileinfo); +void request_permission(); +void init_buffer_int8_rand(uintptr_t buf1, uintptr_t buf2); + +int AVX512_AMX_Payload::compilePayload( + std::vector> const &proportion, + unsigned instructionCacheSize, + std::list const &dataCacheBufferSize, unsigned ramBufferSize, + unsigned thread, unsigned numberOfLines, bool dumpRegisters, + bool errorDetection) { + + // Compute the sequence of instruction groups and the number of its repetions + // to reach the desired size + auto sequence = this->generateSequence(proportion); + auto repetitions = + this->getNumberOfSequenceRepetitions(sequence, numberOfLines / thread); + + // compute count of flops and memory access for performance report + unsigned flops = 0; + unsigned bytes = 0; + + for (const auto &item : sequence) { + auto it = this->instructionFlops.find(item); + + if (it == this->instructionFlops.end()) { + workerLog::error() << "Instruction group " << item << " undefined in " + << name() << "."; + return EXIT_FAILURE; + } + + flops += it->second; + + it = this->instructionMemory.find(item); + + if (it != this->instructionMemory.end()) { + bytes += it->second; + } + } + + this->_flops = repetitions * flops; + this->_bytes = repetitions * bytes; + this->_instructions = repetitions * sequence.size() * 4 + 6; + + // calculate the buffer sizes + auto l1i_cache_size = instructionCacheSize / thread; + auto dataCacheBufferSizeIterator = dataCacheBufferSize.begin(); + auto l1_size = *dataCacheBufferSizeIterator / thread; + std::advance(dataCacheBufferSizeIterator, 1); + auto l2_size = *dataCacheBufferSizeIterator / thread; + std::advance(dataCacheBufferSizeIterator, 1); + auto l3_size = *dataCacheBufferSizeIterator / thread; + auto ram_size = ramBufferSize / thread; + + // calculate the reset counters for the buffers + auto l2_loop_count = + getL2LoopCount(sequence, numberOfLines, l2_size * thread, thread); + auto l3_loop_count = + getL3LoopCount(sequence, numberOfLines, l3_size * thread, thread); + auto ram_loop_count = + getRAMLoopCount(sequence, numberOfLines, ram_size * thread, thread); + + CodeHolder code; + code.init(this->rt.environment()); + + if (nullptr != this->loadFunction) { + this->rt.release(&this->loadFunction); + } + + Builder cb(&code); + cb.addValidationOptions( + BaseEmitter::ValidationOptions::kValidationOptionAssembler | + BaseEmitter::ValidationOptions::kValidationOptionIntermediate); + + auto pointer_reg = rax; + auto l1_addr = rbx; + auto l2_addr = rcx; + auto l3_addr = r8; + auto ram_addr = r9; + auto l2_count_reg = r10; + auto l3_count_reg = r11; + auto ram_count_reg = r12; + auto temp_reg = r13; + auto temp_reg2 = rbp; + auto offset_reg = r14; + auto addrHigh_reg = r15; + auto iter_reg = mm0; + auto shift_reg = std::vector({rdi, rsi, rdx}); + auto shift_reg32 = std::vector({edi, esi, edx}); + auto nr_shift_regs = 3; + auto mul_regs = 3; + auto add_regs = 24; + auto alt_dst_regs = 5; + auto ram_reg = zmm30; + + FuncDetail func; + func.init(FuncSignatureT( + CallConv::kIdHost), + this->rt.environment()); + + FuncFrame frame; + frame.init(func); + + // make zmm registers dirty + for (int i = 0; i < 32; i++) { + frame.addDirtyRegs(Zmm(i)); + } + for (int i = 0; i < 8; i++) { + frame.addDirtyRegs(Mm(i)); + } + // make all other used registers dirty except RAX + frame.addDirtyRegs(l1_addr, l2_addr, l3_addr, ram_addr, l2_count_reg, + l3_count_reg, ram_count_reg, temp_reg, offset_reg, + addrHigh_reg, iter_reg, ram_addr); + for (const auto ® : shift_reg) { + frame.addDirtyRegs(reg); + } + + FuncArgsAssignment args(&func); + // FIXME: asmjit assigment to mm0 does not seem to be supported + args.assignAll(pointer_reg, addrHigh_reg, temp_reg); + args.updateFuncFrame(frame); + frame.finalize(); + + cb.emitProlog(frame); + cb.emitArgsAssignment(frame, args); + + // FIXME: movq from temp_reg to iter_reg + cb.movq(iter_reg, temp_reg); + + // stop right away if low load is selected + auto FunctionExit = cb.newLabel(); + + cb.mov(temp_reg, ptr_64(addrHigh_reg)); + cb.test(temp_reg, temp_reg); + cb.jz(FunctionExit); + + cb.mov(offset_reg, + Imm(64)); // increment after each cache/memory access + // Initialize registers for shift operations + for (auto const ® : shift_reg32) { + cb.mov(reg, Imm(0xAAAAAAAA)); + } + + + // Init AMX registers and config + __tilecfg tile_data = {0}; + request_permission(); + create_AMX_config(&tile_data); // Create tilecfg and fill it + + static bool init = true; + uintptr_t src1, src2; + uint64_t src3; + unsigned int aligned_alloc_size = static_cast(MAX*sizeof(__bfloat16)); + if(aligned_alloc_size % 1024){ // aligned_alloc expects size to be multiple of alignment (aka 1024) + aligned_alloc_size = aligned_alloc_size + (1024 - (aligned_alloc_size % 1024)); + } + src1 = (uintptr_t) aligned_alloc(1024, aligned_alloc_size); + src2 = (uintptr_t) aligned_alloc(1024, aligned_alloc_size); + src3 = (uint64_t) aligned_alloc(1024, aligned_alloc_size); + if(((void*)src1 == nullptr) || (void*)src2 == nullptr || (void*)src3 == nullptr){ // uintptr_t garantuees we can cast it to void* and back + std::cout << "[ERROR]: Allocation of source and target buffer for AMX failed. Aborting...\n"; + exit(1); + } + + //Init buffers + init_buffer_int8_rand(src1, src2); + memset((void*) src3, 0, aligned_alloc_size); + + cb.tileloaddt1(tmm6, zmmword_ptr(src1)); + cb.tileloaddt1(tmm7, zmmword_ptr(src2)); // Ensure no overflows through loading x and -x in src2 + + cb.tileloaddt1(tmm0, zmmword_ptr(src3)); // Preload with 0 + cb.tileloaddt1(tmm1, zmmword_ptr(src3)); + cb.tileloaddt1(tmm2, zmmword_ptr(src3)); + cb.tileloaddt1(tmm3, zmmword_ptr(src3)); + cb.tileloaddt1(tmm4, zmmword_ptr(src3)); + cb.tileloaddt1(tmm5, zmmword_ptr(src3)); + + // Initialize AVX512-Registers for FMA Operations + cb.vmovapd(zmm0, zmmword_ptr(pointer_reg)); + cb.vmovapd(zmm1, zmmword_ptr(pointer_reg, 64)); + cb.vmovapd(zmm2, zmmword_ptr(pointer_reg, 128)); + auto add_start = mul_regs; + auto add_end = mul_regs + add_regs - 1; + auto trans_start = add_regs + mul_regs; + auto trans_end = add_regs + mul_regs + alt_dst_regs - 1; + for (int i = add_start; i <= trans_end; i++) { + cb.vmovapd(Zmm(i), zmmword_ptr(pointer_reg, 256 + i * 64)); + } + cb.mov(l1_addr, pointer_reg); // address for L1-buffer + cb.mov(l2_addr, pointer_reg); + cb.add(l2_addr, Imm(l1_size)); // address for L2-buffer + cb.mov(l3_addr, pointer_reg); + cb.add(l3_addr, Imm(l2_size)); // address for L3-buffer + cb.mov(ram_addr, pointer_reg); + cb.add(ram_addr, Imm(l3_size)); // address for RAM-buffer + cb.mov(l2_count_reg, Imm(l2_loop_count)); + workerLog::trace() << "reset counter for L2-buffer with " + << l2_loop_count + << " cache line accesses per loop (" + << l2_size/1024 + << ") KiB"; + cb.mov(l3_count_reg, Imm(l3_loop_count)); + workerLog::trace() << "reset counter for L3-buffer with " + << l3_loop_count + << " cache line accesses per loop (" + << l3_size/1024 + << ") KiB"; + cb.mov(ram_count_reg, Imm(ram_loop_count)); + workerLog::trace() << "reset counter for RAM-buffer with " + << ram_loop_count + << " cache line accesses per loop (" + << ram_size/1024 + << ") KiB"; + + cb.align(kAlignCode, 64); + + auto Loop = cb.newLabel(); + cb.bind(Loop); + + auto shift_pos = 0; + bool left = false; + auto add_dest = add_start + 1; + auto mov_dst = trans_start; + auto mov_src = mov_dst + 1; + unsigned l1_offset = 0; + int counter=0; + +#define L1_INCREMENT() \ + l1_offset += 64; \ + if (l1_offset < l1_size * 0.5) { \ + cb.add(l1_addr, offset_reg); \ + } else { \ + l1_offset = 0; \ + cb.mov(l1_addr, pointer_reg); \ + } + +#define L2_INCREMENT() cb.add(l2_addr, offset_reg) + +#define L3_INCREMENT() cb.add(l3_addr, offset_reg) + +#define RAM_INCREMENT() cb.add(ram_addr, offset_reg) + +int amx = 0; +int reg = 0; + + for (unsigned count = 0; count < repetitions; count++) { + for (const auto &item : sequence) { + if (item == "REG") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + cb.vfmadd231pd(Zmm(mov_dst), zmm2, zmm1); + cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs], + temp_reg); + mov_dst++; + reg++; + } else if (item == "L1_L") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l1_addr, 64)); + L1_INCREMENT(); + } else if (item == "L1_BROADCAST") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + cb.vbroadcastsd(Zmm(add_dest), ptr_64(l1_addr, 64)); + L1_INCREMENT(); + } else if (item == "L1_S") { + cb.vmovapd(zmmword_ptr(l1_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + L1_INCREMENT(); + } else if (item == "L1_LS") { + cb.vmovapd(zmmword_ptr(l1_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 128)); + L1_INCREMENT(); + } else if (item == "L2_L") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l2_addr, 64)); + L2_INCREMENT(); + } else if (item == "L2_S") { + cb.vmovapd(zmmword_ptr(l2_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + L2_INCREMENT(); + } else if (item == "L2_LS") { + cb.vmovapd(zmmword_ptr(l2_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l2_addr, 128)); + L2_INCREMENT(); + } else if (item == "L3_L") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l3_addr, 64)); + L3_INCREMENT(); + } else if (item == "L3_S") { + cb.vmovapd(zmmword_ptr(l3_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + L3_INCREMENT(); + } else if (item == "L3_LS") { + cb.vmovapd(zmmword_ptr(l3_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l3_addr, 128)); + L3_INCREMENT(); + } else if (item == "L3_P") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 64)); + cb.prefetcht2(ptr(l3_addr)); + L3_INCREMENT(); + } else if (item == "RAM_L") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + cb.vfmadd231pd(ram_reg, zmm1, zmmword_ptr(ram_addr, 64)); + RAM_INCREMENT(); + } else if (item == "RAM_S") { + cb.vmovapd(zmmword_ptr(ram_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); + RAM_INCREMENT(); + } else if (item == "RAM_LS") { + cb.vmovapd(zmmword_ptr(ram_addr, 64), Zmm(add_dest)); + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(ram_addr, 128)); + RAM_INCREMENT(); + } else if (item == "RAM_P") { + cb.vfmadd231pd(Zmm(add_dest), zmm0, zmmword_ptr(l1_addr, 64)); + cb.prefetcht2(ptr(ram_addr)); + RAM_INCREMENT(); + } else if (item == "AMX") { + cb.tdpbssd(Tmm(counter%6), tmm6, tmm7); // TODO: If asmJit supports bf16 operations, change this to bf16 and init buffer for bf16 + counter++; + amx++; + } else { + workerLog::error() << "Instruction group " << item << " not found in " + << this->name() << "."; + return EXIT_FAILURE; + } + + if (left) { + cb.shr(shift_reg32[shift_pos], Imm(1)); + } else { + cb.shl(shift_reg32[shift_pos], Imm(1)); + } + add_dest++; + if (add_dest > add_end) { + add_dest = add_start; + } + if (mov_dst > trans_end) { + mov_dst = trans_start; + } + mov_src++; + if (mov_src > trans_end) { + mov_src = trans_start; + } + shift_pos++; + if (shift_pos == nr_shift_regs) { + shift_pos = 0; + left = !left; + } + } + } + + printf("[DIAGNOSTIC] Counted %d AVX instructions and %d AMX instructions\n", reg, amx); + + cb.movq(temp_reg, iter_reg); // restore iteration counter + if (this->getRAMSequenceCount(sequence) > 0) { + // reset RAM counter + auto NoRamReset = cb.newLabel(); + + cb.sub(ram_count_reg, Imm(1)); + cb.jnz(NoRamReset); + cb.mov(ram_count_reg, Imm(ram_loop_count)); + cb.mov(ram_addr, pointer_reg); + cb.add(ram_addr, Imm(l3_size)); + cb.bind(NoRamReset); + // adds always two instruction + this->_instructions += 2; + } + cb.inc(temp_reg); // increment iteration counter + if (this->getL2SequenceCount(sequence) > 0) { + // reset L2-Cache counter + auto NoL2Reset = cb.newLabel(); + + cb.sub(l2_count_reg, Imm(1)); + cb.jnz(NoL2Reset); + cb.mov(l2_count_reg, Imm(l2_loop_count)); + cb.mov(l2_addr, pointer_reg); + cb.add(l2_addr, Imm(l1_size)); + cb.bind(NoL2Reset); + // adds always two instruction + this->_instructions += 2; + } + cb.movq(iter_reg, temp_reg); // store iteration counter + if (this->getL3SequenceCount(sequence) > 0) { + // reset L3-Cache counter + auto NoL3Reset = cb.newLabel(); + + cb.sub(l3_count_reg, Imm(1)); + cb.jnz(NoL3Reset); + cb.mov(l3_count_reg, Imm(l3_loop_count)); + cb.mov(l3_addr, pointer_reg); + cb.add(l3_addr, Imm(l2_size)); + cb.bind(NoL3Reset); + // adds always two instruction + this->_instructions += 2; + } + cb.mov(l1_addr, pointer_reg); + + if (dumpRegisters) { + auto SkipRegistersDump = cb.newLabel(); + + cb.test(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); + cb.jnz(SkipRegistersDump); + + // dump all the ymm register + for (int i = 0; i < (int)this->registerCount(); i++) { + cb.vmovapd( + zmmword_ptr(pointer_reg, -64 - this->registerSize() * 8 * (i + 1)), + Zmm(i)); + } + + // set read flag + cb.mov(ptr_64(pointer_reg, -8), Imm(firestarter::DumpVariable::Wait)); + + cb.bind(SkipRegistersDump); + } + + if (errorDetection) { + this->emitErrorDetectionCode( + cb, iter_reg, addrHigh_reg, pointer_reg, temp_reg, temp_reg2); + } + + cb.test(ptr_64(addrHigh_reg), Imm(LOAD_HIGH)); + cb.jnz(Loop); + + cb.bind(FunctionExit); + + cb.movq(rax, iter_reg); + + cb.emitEpilog(frame); + + cb.finalize(); + + // String sb; + // cb.dump(sb); + + Error err = this->rt.add(&this->loadFunction, &code); + if (err) { + workerLog::error() << "Asmjit adding Assembler to JitRuntime failed in " + << __FILE__ << " at " << __LINE__; + return EXIT_FAILURE; + } + + // skip if we could not determine cache size + if (l1i_cache_size != 0) { + auto loopSize = code.labelOffset(FunctionExit) - code.labelOffset(Loop); + auto instructionCachePercentage = 100 * loopSize / l1i_cache_size; + + if (loopSize > l1i_cache_size) { + workerLog::warn() << "Work-loop is bigger than the L1i-Cache."; + } + + workerLog::trace() << "Using " << loopSize << " of " << l1i_cache_size + << " Bytes (" << instructionCachePercentage + << "%) from the L1i-Cache for the work-loop."; + workerLog::trace() << "Sequence size: " << sequence.size(); + workerLog::trace() << "Repetition count: " << repetitions; + } + + return EXIT_SUCCESS; +} + +std::list AVX512_AMX_Payload::getAvailableInstructions() const { + std::list instructions; + + transform(this->instructionFlops.begin(), this->instructionFlops.end(), + back_inserter(instructions), + [](const auto &item) { return item.first; }); + + return instructions; +} + +void AVX512_AMX_Payload::init(unsigned long long *memoryAddr, + unsigned long long bufferSize) { + X86Payload::init(memoryAddr, bufferSize, 0.27948995982e-4, 0.27948995982e-4); +} + +void create_AMX_config(__tilecfg *tileinfo){ + // Create tile_cfg, fill it and return + + int i; + tileinfo->palette_id = 1; + tileinfo->start_row = 0; + + + for (i = 0; i < 8; ++i) + { + tileinfo->colsb[i] = MAX_COLS; + tileinfo->rows[i] = MAX_ROWS; + } + + _tile_loadconfig(tileinfo); +} + + +void request_permission(){ + + long rc; + unsigned long bitmask; + rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + + if(rc){ + printf("XTILE_DATA request failed: %ld", rc); + } + + rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask); + if (rc){ + printf("prctl(ARCH_GET_XCOMP_PERM) error: %ld", rc); + } + if (bitmask & XFEATURE_MASK_XTILE){ + //printf("ARCH_REQ_XCOMP_PERM XTILE_DATA successful.\n"); + } + else{ + printf("[ERROR] ARCH_REQ_XCOMP_PERM XTILE_DATA unsuccessful!\n"); + } + + +} + +void init_buffer_int8_rand(uintptr_t src1, uintptr_t src2){ + + // Initialize buffer with random values + // Multiplication always produces either random_init^2 or (-1) * random_init^2 + // Accumulation operation always on (random_init^2 + -random_init^2) = 0 ensures stable values + + int8_t *buf1 = (int8_t*) src1; + int8_t *buf2 = (int8_t*) src2; + int rows, colsb; + + // TODO: Change MAX_ROWS/MAXC_COLS from constant to maximum size check by asmJit + // Currently not supported by asmJit + // Alternative: Manually parse CPUID + rows = MAX_ROWS; + colsb = MAX_COLS; + + for(int i = 0; i Date: Fri, 20 Sep 2024 14:01:13 +0200 Subject: [PATCH 02/15] [FIX] update asmjit calls --- .../Environment/X86/Payload/AVX512_AMX_Payload.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp b/include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp index 08a789fd..7e0ad072 100644 --- a/include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.hpp @@ -26,8 +26,8 @@ namespace firestarter::environment::x86::payload { class AVX512_AMX_Payload final : public X86Payload { public: - AVX512_AMX_Payload(asmjit::x86::Features const &supportedFeatures) - : X86Payload(supportedFeatures, {asmjit::x86::Features::Id::kAMX_BF16}, + AVX512_AMX_Payload(asmjit::CpuFeatures const &supportedFeatures) + : X86Payload(supportedFeatures, {asmjit::CpuFeatures::X86::kAMX_BF16}, "AVX512_AMX", 8, 32) {} int compilePayload( From 2b9adf9b3feb90526d2120d66d55b23ff04bf5c7 Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:02:11 +0200 Subject: [PATCH 03/15] [FIX] include Sapphire Rapids config --- include/firestarter/Environment/X86/X86Environment.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/firestarter/Environment/X86/X86Environment.hpp b/include/firestarter/Environment/X86/X86Environment.hpp index 11ad940e..198e5b10 100644 --- a/include/firestarter/Environment/X86/X86Environment.hpp +++ b/include/firestarter/Environment/X86/X86Environment.hpp @@ -24,6 +24,7 @@ #include #include +#include #include #include #include From a91ea44db2b063cdbe4ba5b79b6df43304822362 Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:05:45 +0200 Subject: [PATCH 04/15] [FIX] asmjit call in sapphire rapids config --- .../Environment/X86/Platform/SapphireRapidsConfig.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp index 68dbb23a..ac812bcb 100644 --- a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp @@ -28,7 +28,7 @@ namespace firestarter::environment::x86::platform { class SapphireRapidsConfig final : public X86PlatformConfig { public: - SapphireRapidsConfig(asmjit::x86::Features const &supportedFeatures, + SapphireRapidsConfig(asmjit::CpuFeatures const &supportedFeatures, unsigned family, unsigned model, unsigned threads) : X86PlatformConfig("SKL_XEONEP", 6, {85}, {1, 2}, 0, {32768, 1048576, 1441792}, 1048576000, 1536, family, From e2a87314693b12066fb1173f49ea44ccab8a1496 Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:10:29 +0200 Subject: [PATCH 05/15] [FIX] add new files to cmake --- src/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6136bb35..a750a882 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -15,6 +15,7 @@ SET(FIRESTARTER_FILES firestarter/Environment/X86/Payload/X86Payload.cpp firestarter/Environment/X86/Payload/AVX512Payload.cpp + firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp firestarter/Environment/X86/Payload/FMA4Payload.cpp firestarter/Environment/X86/Payload/FMAPayload.cpp firestarter/Environment/X86/Payload/ZENFMAPayload.cpp From 529bb7f48bb7cbaecfe71140a94e992b0a04f83f Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:15:09 +0200 Subject: [PATCH 06/15] [FIX] adapted workload to new asmjit api --- .../Environment/X86/Payload/AVX512_AMX_Payload.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp index 96027e25..4fba16fc 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -124,9 +124,9 @@ int AVX512_AMX_Payload::compilePayload( } Builder cb(&code); - cb.addValidationOptions( - BaseEmitter::ValidationOptions::kValidationOptionAssembler | - BaseEmitter::ValidationOptions::kValidationOptionIntermediate); + cb.addDiagnosticOptions( + asmjit::DiagnosticOptions::kValidateAssembler | + asmjit::DiagnosticOptions::kValidateIntermediate ); auto pointer_reg = rax; auto l1_addr = rbx; @@ -152,7 +152,7 @@ int AVX512_AMX_Payload::compilePayload( FuncDetail func; func.init(FuncSignatureT( - CallConv::kIdHost), + CallConv::kCDecl), this->rt.environment()); FuncFrame frame; @@ -271,7 +271,7 @@ int AVX512_AMX_Payload::compilePayload( << ram_size/1024 << ") KiB"; - cb.align(kAlignCode, 64); + cb.align(AlignMode::kCode, 64); auto Loop = cb.newLabel(); cb.bind(Loop); From b3b94f84fd79be9215b5415f636e2310aa5f999c Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:16:10 +0200 Subject: [PATCH 07/15] [FIX] adapted workload to new asmjit api --- src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp index 4fba16fc..0a270438 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -152,7 +152,7 @@ int AVX512_AMX_Payload::compilePayload( FuncDetail func; func.init(FuncSignatureT( - CallConv::kCDecl), + CallConvId::kCDecl), this->rt.environment()); FuncFrame frame; From 8e8e82d28042b6a564875a5d47a13cc7cb336b1e Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:19:50 +0200 Subject: [PATCH 08/15] [FIX] add missing compiler flags --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c94e0144..b74681be 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ git_submodule_update() if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") else() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -O2 -fdata-sections -ffunction-sections") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mamx-int8 -mamx-tile -Wall -Wextra -O2 -fdata-sections -ffunction-sections") endif() if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") From 892f2346ecb2886267592d3697a9c23cbdaec0e3 Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:25:51 +0200 Subject: [PATCH 09/15] [FIX] register SApphire Rapids config --- include/firestarter/Environment/X86/X86Environment.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/firestarter/Environment/X86/X86Environment.hpp b/include/firestarter/Environment/X86/X86Environment.hpp index 198e5b10..05155611 100644 --- a/include/firestarter/Environment/X86/X86Environment.hpp +++ b/include/firestarter/Environment/X86/X86Environment.hpp @@ -89,7 +89,8 @@ class X86Environment final : public Environment { REGISTER(HaswellEPConfig), REGISTER(SandyBridgeConfig), REGISTER(SandyBridgeEPConfig), REGISTER(NehalemConfig), REGISTER(NehalemEPConfig), REGISTER(BulldozerConfig), - REGISTER(NaplesConfig), REGISTER(RomeConfig)}; + REGISTER(NaplesConfig), REGISTER(RomeConfig), + REGISTER(SapphireRapidsConfig)}; std::list platformConfigs; @@ -97,6 +98,7 @@ class X86Environment final : public Environment { const std::list> fallbackPlatformConfigsCtor = { + REGISTER(SapphireRapidsConfig), // AMX + AVX512 REGISTER(SkylakeSPConfig), // AVX512 REGISTER(BulldozerConfig), // FMA4 REGISTER(HaswellConfig), // FMA From 5c5f935c048c5c83d5d044ab407da5aa8c92ace2 Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 14:30:19 +0200 Subject: [PATCH 10/15] [REMOVED] unneded prints --- src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp index 0a270438..4890568c 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -404,8 +404,6 @@ int reg = 0; } } } - - printf("[DIAGNOSTIC] Counted %d AVX instructions and %d AMX instructions\n", reg, amx); cb.movq(temp_reg, iter_reg); // restore iteration counter if (this->getRAMSequenceCount(sequence) > 0) { From edf806cee6abb3c1066be61da248021bb7566b4b Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Fri, 20 Sep 2024 15:59:03 +0200 Subject: [PATCH 11/15] [ADD] use bf16 --- .../X86/Platform/SapphireRapidsConfig.hpp | 2 +- .../X86/Payload/AVX512_AMX_Payload.cpp | 33 ++++++++----------- 2 files changed, 14 insertions(+), 21 deletions(-) diff --git a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp index ac812bcb..6e6bf3ba 100644 --- a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp @@ -56,7 +56,7 @@ class SapphireRapidsConfig final : public X86PlatformConfig { {"L1_S", 0}, {"L1_L", 0}, {"REG", 140}, - {"AMX", 0}}); + {"AMX", 1}}); } }; } // namespace firestarter::environment::x86::platform diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp index 4890568c..c3692a92 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -57,7 +57,7 @@ typedef struct __tile_config void create_AMX_config(__tilecfg *tileinfo); void request_permission(); -void init_buffer_int8_rand(uintptr_t buf1, uintptr_t buf2); +void init_buffer_rand(uintptr_t buf1, uintptr_t buf2); int AVX512_AMX_Payload::compilePayload( std::vector> const &proportion, @@ -221,7 +221,7 @@ int AVX512_AMX_Payload::compilePayload( } //Init buffers - init_buffer_int8_rand(src1, src2); + init_buffer_rand(src1, src2); memset((void*) src3, 0, aligned_alloc_size); cb.tileloaddt1(tmm6, zmmword_ptr(src1)); @@ -299,9 +299,6 @@ int AVX512_AMX_Payload::compilePayload( #define RAM_INCREMENT() cb.add(ram_addr, offset_reg) -int amx = 0; -int reg = 0; - for (unsigned count = 0; count < repetitions; count++) { for (const auto &item : sequence) { if (item == "REG") { @@ -310,7 +307,6 @@ int reg = 0; cb.xor_(shift_reg[(shift_pos + nr_shift_regs - 1) % nr_shift_regs], temp_reg); mov_dst++; - reg++; } else if (item == "L1_L") { cb.vfmadd231pd(Zmm(add_dest), zmm0, zmm2); cb.vfmadd231pd(Zmm(add_dest), zmm1, zmmword_ptr(l1_addr, 64)); @@ -372,9 +368,8 @@ int reg = 0; cb.prefetcht2(ptr(ram_addr)); RAM_INCREMENT(); } else if (item == "AMX") { - cb.tdpbssd(Tmm(counter%6), tmm6, tmm7); // TODO: If asmJit supports bf16 operations, change this to bf16 and init buffer for bf16 + cb.tdpbf16ps(Tmm(counter%6), tmm6, tmm7); counter++; - amx++; } else { workerLog::error() << "Instruction group " << item << " not found in " << this->name() << "."; @@ -570,34 +565,32 @@ void request_permission(){ } -void init_buffer_int8_rand(uintptr_t src1, uintptr_t src2){ +void init_buffer_bf16_rand(uintptr_t src1, uintptr_t src2){ // Initialize buffer with random values - // Multiplication always produces either random_init^2 or (-1) * random_init^2 - // Accumulation operation always on (random_init^2 + -random_init^2) = 0 ensures stable values + // Multiplication always produces either 1 or -1 + // Accumulation operation always on (1 + -1) = 0 ensures stable values - int8_t *buf1 = (int8_t*) src1; - int8_t *buf2 = (int8_t*) src2; - int rows, colsb; + __bfloat16 *buf1 = (__bfloat16*) src1; + __bfloat16 *buf2 = (__bfloat16*) src2; // TODO: Change MAX_ROWS/MAXC_COLS from constant to maximum size check by asmJit // Currently not supported by asmJit // Alternative: Manually parse CPUID - rows = MAX_ROWS; - colsb = MAX_COLS; for(int i = 0; i Date: Fri, 20 Sep 2024 16:00:46 +0200 Subject: [PATCH 12/15] [FIX] typo --- src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp index c3692a92..ab51d9ea 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -565,7 +565,7 @@ void request_permission(){ } -void init_buffer_bf16_rand(uintptr_t src1, uintptr_t src2){ +void init_buffer_rand(uintptr_t src1, uintptr_t src2){ // Initialize buffer with random values // Multiplication always produces either 1 or -1 From 7caf6cb6466705a8afa66b3fecf95293b087ae4d Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Mon, 23 Sep 2024 15:02:39 +0200 Subject: [PATCH 13/15] [ADD] limit init value --- src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp index ab51d9ea..2b91c46e 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -579,7 +579,7 @@ void init_buffer_rand(uintptr_t src1, uintptr_t src2){ // Alternative: Manually parse CPUID for(int i = 0; i Date: Mon, 23 Sep 2024 15:05:24 +0200 Subject: [PATCH 14/15] [REMOVED] unnecessary defines --- .../Environment/X86/Payload/AVX512_AMX_Payload.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp index 2b91c46e..f078b214 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512_AMX_Payload.cpp @@ -35,11 +35,7 @@ #define MAX 1024 #define MAX_ROWS 16 #define MAX_COLS 64 -#define STRIDE 1 -#define NUMBER_RANDOM 4 -#define MAX_BITS 7 -#define MIN_BITS 4 using namespace firestarter::environment::x86::payload; using namespace asmjit; From 0427834456d112ecb94eb3ac838b32cbddd22e29 Mon Sep 17 00:00:00 2001 From: Cyrill Burth Date: Mon, 23 Sep 2024 15:32:30 +0200 Subject: [PATCH 15/15] [ADD] use AVX512 config with AMX --- .../X86/Platform/SapphireRapidsConfig.hpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp index 6e6bf3ba..25e2b708 100644 --- a/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp +++ b/include/firestarter/Environment/X86/Platform/SapphireRapidsConfig.hpp @@ -37,7 +37,7 @@ class SapphireRapidsConfig final : public X86PlatformConfig { std::vector> getDefaultPayloadSettings() const override { -/* return std::vector>({{"RAM_S", 3}, + return std::vector>({{"RAM_S", 3}, {"RAM_P", 1}, {"L3_S", 1}, {"L3_P", 1}, @@ -46,16 +46,6 @@ class SapphireRapidsConfig final : public X86PlatformConfig { {"L1_S", 0}, {"L1_L", 40}, {"REG", 140}, - {"AMX", 4}});*/ - return std::vector>({{"RAM_S", 0}, - {"RAM_P", 0}, - {"L3_S", 0}, - {"L3_P", 0}, - {"L2_S", 0}, - {"L2_L", 0}, - {"L1_S", 0}, - {"L1_L", 0}, - {"REG", 140}, {"AMX", 1}}); } };