From bc1fb1771d5f45be5c1228805653d15bf4ccac85 Mon Sep 17 00:00:00 2001 From: Maximilian Date: Fri, 15 Nov 2024 01:30:17 +0100 Subject: [PATCH] Changed s_rdma_qp_ctx to transport aes-key, compression- and dpi-bit to the roce_stack --- hw/hdl/network/rdma/rdma_mux_retrans.sv | 53 ++++++++++++++++++- hw/hdl/network/rdma/roce_stack.sv | 35 +++++++++--- hw/hdl/shell/cnfg_slave_avx.sv | 36 +++++++++---- .../wr_hdl/template_gen/lynx_pkg_tmplt.txt | 14 +++++ sw/include/cDefs.hpp | 4 +- sw/src/bThread.cpp | 26 +++++++-- 6 files changed, 143 insertions(+), 25 deletions(-) diff --git a/hw/hdl/network/rdma/rdma_mux_retrans.sv b/hw/hdl/network/rdma/rdma_mux_retrans.sv index 740c1c57..3442ee77 100644 --- a/hw/hdl/network/rdma/rdma_mux_retrans.sv +++ b/hw/hdl/network/rdma/rdma_mux_retrans.sv @@ -156,7 +156,7 @@ logic [0:0] state_C, state_N; logic rd_C, rd_N; logic actv_C, actv_N; -logic [LEN_BITS-BEAT_LOG_BITS:0] cnt_C, cnt_N; +logic [LEN_BITS-BEAT_LOG_BITS:0] cnt_C, cnt_N, cnt_ddr_wr; logic tr_done; @@ -274,6 +274,22 @@ always_comb begin: DP endcase end +// Counting the outgoing data transmissions to the retrans buffer +always_ff @ (posedge aclk) begin + + if(aresetn == 1'b0) begin + cnt_ddr_wr <= 1'b0; + end else begin + if(s_req_net.valid) begin + // Once a new command comes in, set the transmission counter to the length transmitted via the command interface + cnt_ddr_wr <= s_req_net.data.len[LEN_BITS-1:0]/64; + end else begin + // Decrement the counter with every successfull write to the retrans-memory + cnt_ddr_wr <= (axis_ddr_wr.tvalid & axis_ddr_wr.tready) ? (cnt_ddr_wr-1) : cnt_ddr_wr; + end + end +end + // Mux always_comb begin if(state_C == ST_MUX) begin @@ -320,12 +336,45 @@ assign axis_net.tlast = actv_C ? (rd_C ? s_axis_user_rsp.tlast : s_axis_user_req assign axis_ddr_wr.tdata = s_axis_user_req.tdata; assign axis_ddr_wr.tkeep = s_axis_user_req.tkeep; -assign axis_ddr_wr.tlast = s_axis_user_req.tlast; +assign axis_ddr_wr.tlast = (cnt_ddr_wr == 1); // // DEBUG // +/* ila_retrans inst_ila_retrans ( + .clk(aclk), + .probe0(s_req_net.valid), + .probe1(s_req_net.data), // 128 + .probe2(s_req_net.ready), + .probe3(s_axis_user_req.tvalid), + .probe4(s_axis_user_req.tdata), // 512 + .probe5(s_axis_user_req.tkeep), // 64 + .probe6(s_axis_user_req.tready), + .probe7(s_axis_user_req.tlast), + .probe8(m_axis_net.tvalid), + .probe9(m_axis_net.tdata), // 512 + .probe10(m_axis_net.tkeep), // 64 + .probe11(m_axis_net.tready), + .probe12(m_axis_net.tlast), + .probe13(m_req_ddr_wr.valid), + .probe14(m_req_ddr_wr.data), // 128 + .probe15(m_req_ddr_wr.ready), + .probe16(m_axis_ddr.tvalid), + .probe17(m_axis_ddr.tdata), // 512 + .probe18(m_axis_ddr.tkeep), // 64 + .probe19(m_axis_ddr.tready), + .probe20(m_axis_ddr.tlast), + .probe21(seq_snk_valid), + .probe22(seq_snk_ready), + .probe23(rd_snk), + .probe24(actv_snk), + .probe25(cnt_C), // 26 + .probe26(state_C), + .probe27(cnt_ddr_wr), // 26 + .probe28(tr_done) +); */ + /* create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_retrans set_property -dict [list CONFIG.C_PROBE29_WIDTH {22} CONFIG.C_PROBE23_WIDTH {28} CONFIG.C_NUM_OF_PROBES {35} CONFIG.Component_Name {ila_retrans} CONFIG.C_EN_STRG_QUAL {1} CONFIG.C_PROBE34_MU_CNT {2} CONFIG.C_PROBE33_MU_CNT {2} CONFIG.C_PROBE32_MU_CNT {2} CONFIG.C_PROBE31_MU_CNT {2} CONFIG.C_PROBE30_MU_CNT {2} CONFIG.C_PROBE29_MU_CNT {2} CONFIG.C_PROBE28_MU_CNT {2} CONFIG.C_PROBE27_MU_CNT {2} CONFIG.C_PROBE26_MU_CNT {2} CONFIG.C_PROBE25_MU_CNT {2} CONFIG.C_PROBE24_MU_CNT {2} CONFIG.C_PROBE23_MU_CNT {2} CONFIG.C_PROBE22_MU_CNT {2} CONFIG.C_PROBE21_MU_CNT {2} CONFIG.C_PROBE20_MU_CNT {2} CONFIG.C_PROBE19_MU_CNT {2} CONFIG.C_PROBE18_MU_CNT {2} CONFIG.C_PROBE17_MU_CNT {2} CONFIG.C_PROBE16_MU_CNT {2} CONFIG.C_PROBE15_MU_CNT {2} CONFIG.C_PROBE14_MU_CNT {2} CONFIG.C_PROBE13_MU_CNT {2} CONFIG.C_PROBE12_MU_CNT {2} CONFIG.C_PROBE11_MU_CNT {2} CONFIG.C_PROBE10_MU_CNT {2} CONFIG.C_PROBE9_MU_CNT {2} CONFIG.C_PROBE8_MU_CNT {2} CONFIG.C_PROBE7_MU_CNT {2} CONFIG.C_PROBE6_MU_CNT {2} CONFIG.C_PROBE5_MU_CNT {2} CONFIG.C_PROBE4_MU_CNT {2} CONFIG.C_PROBE3_MU_CNT {2} CONFIG.C_PROBE2_MU_CNT {2} CONFIG.C_PROBE1_MU_CNT {2} CONFIG.C_PROBE0_MU_CNT {2} CONFIG.ALL_PROBE_SAME_MU_CNT {2}] [get_ips ila_retrans] diff --git a/hw/hdl/network/rdma/roce_stack.sv b/hw/hdl/network/rdma/roce_stack.sv index f265f037..89c08e73 100644 --- a/hw/hdl/network/rdma/roce_stack.sv +++ b/hw/hdl/network/rdma/roce_stack.sv @@ -216,6 +216,27 @@ assign m_rdma_wr_req.valid = rdma_wr_req.valid; assign m_rdma_wr_req.data = rdma_wr_req.data; assign rdma_wr_req.ready = m_rdma_wr_req.ready; + +// +// QP_CTX Remapping +// + +metaIntf #(.STYPE(rdma_qp_ctx_old_t)) s_rdma_qp_interface_old (); + +// Signals for ready and valid are assigned right away +assign s_rdma_qp_interface.ready = s_rdma_qp_interface_old.ready; +assign s_rdma_qp_interface_old.valid = s_rdma_qp_interface.valid; + +// Data remapping +assign s_rdma_qp_interface_old.data.vaddr = s_rdma_qp_interface.data.vaddr; +assign s_rdma_qp_interface_old.data.r_key = s_rdma_qp_interface.data.r_key; +assign s_rdma_qp_interface_old.data.local_psn = s_rdma_qp_interface.data.local_psn; +assign s_rdma_qp_interface_old.data.remote_psn = s_rdma_qp_interface.data.remote_psn; +assign s_rdma_qp_interface_old.data.qp_num = s_rdma_qp_interface.data.qp_num; +assign s_rdma_qp_interface_old.data.new_state = s_rdma_qp_interface.data.new_state; + + + // // RoCE stack // @@ -278,7 +299,7 @@ assign rdma_wr_req.ready = m_rdma_wr_req.ready; .probe9(m_axis_tx.tlast), .probe10(s_rdma_qp_interface.valid), .probe11(s_rdma_qp_interface.ready), - .probe12(s_rdma_qp_interface.data), // 184 + .probe12(s_rdma_qp_interface.data), // 314 .probe13(s_rdma_conn_interface.valid), .probe14(s_rdma_conn_interface.ready), .probe15(s_rdma_conn_interface.data), // 184 @@ -369,9 +390,9 @@ rocev2_ip rocev2_inst( .s_axis_mem_read_data_TLAST(axis_rdma_rd.tlast), // QP intf - .s_axis_qp_interface_TVALID(s_rdma_qp_interface.valid), - .s_axis_qp_interface_TREADY(s_rdma_qp_interface.ready), - .s_axis_qp_interface_TDATA(s_rdma_qp_interface.data), + .s_axis_qp_interface_TVALID(s_rdma_qp_interface_old.valid), + .s_axis_qp_interface_TREADY(s_rdma_qp_interface_old.ready), + .s_axis_qp_interface_TDATA(s_rdma_qp_interface_old.data), .s_axis_qp_conn_interface_TVALID(s_rdma_conn_interface.valid), .s_axis_qp_conn_interface_TREADY(s_rdma_conn_interface.ready), .s_axis_qp_conn_interface_TDATA(s_rdma_conn_interface.data), @@ -446,9 +467,9 @@ rocev2_ip rocev2_inst( .s_axis_mem_read_data_TLAST(axis_rdma_rd.tlast), // QP intf - .s_axis_qp_interface_V_TVALID(s_rdma_qp_interface.valid), - .s_axis_qp_interface_V_TREADY(s_rdma_qp_interface.ready), - .s_axis_qp_interface_V_TDATA(s_rdma_qp_interface.data), + .s_axis_qp_interface_V_TVALID(s_rdma_qp_interface_old.valid), + .s_axis_qp_interface_V_TREADY(s_rdma_qp_interface_old.ready), + .s_axis_qp_interface_V_TDATA(s_rdma_qp_interface_old.data), .s_axis_qp_conn_interface_V_TVALID(s_rdma_conn_interface.valid), .s_axis_qp_conn_interface_V_TREADY(s_rdma_conn_interface.ready), .s_axis_qp_conn_interface_V_TDATA(s_rdma_conn_interface.data), diff --git a/hw/hdl/shell/cnfg_slave_avx.sv b/hw/hdl/shell/cnfg_slave_avx.sv index 19e621a4..e70a8efd 100644 --- a/hw/hdl/shell/cnfg_slave_avx.sv +++ b/hw/hdl/shell/cnfg_slave_avx.sv @@ -354,7 +354,7 @@ localparam integer SYNC_STAT_REG = 8; localparam integer NET_ARP_REG = 9; // RDMA // 49 (RW) : Write QP context -localparam integer RDMA_CTX_REG = 10; +localparam integer RDMA_CTX_REG_1 = 10; // 50 (RW) : Write QP connection localparam integer RDMA_CONN_REG = 11; // TCP @@ -364,6 +364,9 @@ localparam integer TCP_OPEN_PORT_STAT_REG = 13; localparam integer TCP_OPEN_CONN_REG = 14; localparam integer TCP_OPEN_CONN_STAT_REG = 15; +// Add another register for RDMA CTX to cover more than 256 bits +localparam integer RDMA_CTX_REG_2 = 16; + // 64 (RO) : Status DMA completion localparam integer STAT_DMA_REG = 2**PID_BITS; // @@ -636,10 +639,16 @@ always_ff @(posedge aclk) begin `endif `ifdef EN_RDMA - RDMA_CTX_REG: // Context + RDMA_CTX_REG_1: // Context + for (int i = 0; i < AVX_DATA_BITS/8; i++) begin + if(s_axim_ctrl.wstrb[i]) begin + slv_reg[RDMA_CTX_REG_1][(i*8)+:8] <= s_axim_ctrl.wdata[(i*8)+:8]; + end + end + RDMA_CTX_REG_2: for (int i = 0; i < AVX_DATA_BITS/8; i++) begin if(s_axim_ctrl.wstrb[i]) begin - slv_reg[RDMA_CTX_REG][(i*8)+:8] <= s_axim_ctrl.wdata[(i*8)+:8]; + slv_reg[RDMA_CTX_REG_2][(i*8)+:8] <= s_axim_ctrl.wdata[(i*8)+:8]; m_rdma_qp_interface.valid <= 1'b1; end end @@ -747,8 +756,10 @@ always_ff @(posedge aclk) begin `endif `ifdef EN_RDMA - [RDMA_CTX_REG:RDMA_CTX_REG]: - axi_rdata[0] <= m_rdma_qp_interface.ready; + [RDMA_CTX_REG_1:RDMA_CTX_REG_1]: + axi_rdata <= slv_reg[RDMA_CTX_REG_1]; + [RDMA_CTX_REG_2:RDMA_CTX_REG_2]: + axi_rdata[0] <= m_rdma_qp_interface.ready; [RDMA_CONN_REG:RDMA_CONN_REG]: axi_rdata[0] <= m_rdma_conn_interface.ready; `endif @@ -1225,13 +1236,16 @@ ram_tp_nc #( .b_data_out(b_data_out_rdma_wr) ); -// RDMA qp interface +// RDMA qp interface - add the new bits for AES-key, compression- and DPI-requirements assign m_rdma_qp_interface.data.new_state = 0; -assign m_rdma_qp_interface.data.qp_num = slv_reg[RDMA_CTX_REG][0+:24]; // qpn -assign m_rdma_qp_interface.data.r_key = slv_reg[RDMA_CTX_REG][32+:32]; // r_key -assign m_rdma_qp_interface.data.local_psn = slv_reg[RDMA_CTX_REG][64+:24]; -assign m_rdma_qp_interface.data.remote_psn = slv_reg[RDMA_CTX_REG][64+24+:24]; // psns -assign m_rdma_qp_interface.data.vaddr = slv_reg[RDMA_CTX_REG][128+:VADDR_BITS]; // vaddr +assign m_rdma_qp_interface.data.qp_num = slv_reg[RDMA_CTX_REG_1][0+:24]; // qpn +assign m_rdma_qp_interface.data.r_key = slv_reg[RDMA_CTX_REG_1][32+:32]; // r_key +assign m_rdma_qp_interface.data.local_psn = slv_reg[RDMA_CTX_REG_1][64+:24]; +assign m_rdma_qp_interface.data.remote_psn = slv_reg[RDMA_CTX_REG_1][64+24+:24]; // psns +assign m_rdma_qp_interface.data.vaddr = slv_reg[RDMA_CTX_REG_1][128+:VADDR_BITS]; // vaddr +assign m_rdma_qp_interface.data.aes_key = slv_reg[RDMA_CTX_REG_2][0+:128]; // aes_key +assign m_rdma_qp_interface.data.compression_enabled = slv_reg[RDMA_CTX_REG_2][128+:1]; // compression-bit +assign m_rdma_qp_interface.data.dpi_enabled = slv_reg[RDMA_CTX_REG_2][129+:1]; // dpi-bit // RDMA connection interface assign m_rdma_conn_interface.data.local_qpn = slv_reg[RDMA_CONN_REG][0+:16]; diff --git a/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt b/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt index 8d2bfd71..e291e6ae 100644 --- a/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt +++ b/scripts/wr_hdl/template_gen/lynx_pkg_tmplt.txt @@ -536,6 +536,7 @@ package lynxTypes; // RDMA // + // Signal used for establishing a new QP. Send first, before rdma_qp_conn_t. Now also includes information about the Balboa-capabilities. typedef struct packed { logic [47:0] vaddr; logic [31:0] r_key; @@ -543,8 +544,21 @@ package lynxTypes; logic [23:0] remote_psn; logic [23:0] qp_num; logic [31:0] new_state; + logic [127:0] aes_key; + logic compression_enabled; + logic dpi_enabled; } rdma_qp_ctx_t; + typedef struct packed { + logic [47:0] vaddr; + logic [31:0] r_key; + logic [23:0] local_psn; + logic [23:0] remote_psn; + logic [23:0] qp_num; + logic [31:0] new_state; + } rdma_qp_ctx_old_t; + + // Signal used for establishing a new connection. Send after the rdma_qp_ctx_t. Adds the network-specific information required for a QP. typedef struct packed { logic [15:0] remote_udp_port; logic [127:0] remote_ip_address; diff --git a/sw/include/cDefs.hpp b/sw/include/cDefs.hpp index d2fa9e02..737a7543 100644 --- a/sw/include/cDefs.hpp +++ b/sw/include/cDefs.hpp @@ -198,6 +198,7 @@ enum class CoyoteAlloc { /* AVX regs */ // Control regs that get memory-mapped for controlling operations of the FPGA // These are the ones used for AVX-systems. Why is there a difference between AVX and legacy systems? +// Added a second RDMA_CTX_REG to cover the full width (which is larger than 256 bits) enum class CnfgAvxRegs : uint32_t { CTRL_REG = 0, ISR_REG = 1, @@ -209,12 +210,13 @@ enum class CnfgAvxRegs : uint32_t { SYNC_CTRL_REG = 7, SYNC_STAT_REG = 8, NET_ARP_REG = 9, - RDMA_CTX_REG = 10, + RDMA_CTX_REG_1 = 10, RDMA_CONN_REG = 11, TCP_OPEN_PORT_REG = 12, TCP_OPEN_PORT_STAT_REG = 13, TCP_OPEN_CONN_REG = 14, TCP_OPEN_CONN_STAT_REG = 15, + RDMA_CTX_REG_2 = 16, STAT_DMA_REG = 64 }; diff --git a/sw/src/bThread.cpp b/sw/src/bThread.cpp index b1b95cdb..e9943d6f 100644 --- a/sw/src/bThread.cpp +++ b/sw/src/bThread.cpp @@ -1018,7 +1018,7 @@ bool bThread::doArpLookup(uint32_t ip_addr) { */ bool bThread::writeQpContext(uint32_t port) { // Basic idea: Get information from the previously created qp-struct and write it to configuration memory - uint64_t offs[3]; + uint64_t offs[6]; if(fcnfg.en_rdma) { // Write QP context - QPN, rkey, local and remote PSN, vaddr offs[0] = ((static_cast(qpair->local.qpn) & 0xffffff) << qpContextQpnOffs) | @@ -1029,6 +1029,21 @@ bool bThread::writeQpContext(uint32_t port) { offs[2] = ((static_cast((uint64_t)qpair->remote.vaddr) & 0xffffffffffff) << qpContextVaddrOffs); + // Splitting up the 128 Bit AES-key to write it into two distinct 64-bit parts + uint64_t aes_key_high = static_cast(qpair->local.aes_key >> 64); + uint64_t aes_key_low = static_cast(qpair->local.aes_key & 0xFFFFFFFFFFFFFFFF); + + offs[3] = aes_key_high; + offs[4] = aes_key_low; + + // Writing the compression- and dpi-bits in the 2 LSBs of the sixth FPGA slave-register + const uint8_t bool1Offset = 0; + const uint8_t bool2Offset = 1; + + offs[5] = 0; + offs[5] |= (qpair->local.compression_enabled ? 1ULL : 0ULL) << bool1Offset; // Compression-bit is stored in LSB + offs[5] |= (qpair->local.dpi_enabled ? 1ULL : 0ULL) << bool2Offset; // DPI-Bit is stored in the next bit after the LSB + # ifdef VERBOSE std::cout << "bThread: Called writeQpContext on a RDMA-enabled design." << std::endl; std::cout << " - bThread - offs[0] " << offs[0] << std::endl; @@ -1039,10 +1054,13 @@ bool bThread::writeQpContext(uint32_t port) { // Write this information obtained from the QP-struct into configuration memory / registers #ifdef EN_AVX if(fcnfg.en_avx) { - if(_mm256_extract_epi32(cnfg_reg_avx[static_cast(CnfgAvxRegs::RDMA_CTX_REG)], 0)) - cnfg_reg_avx[static_cast(CnfgAvxRegs::RDMA_CTX_REG)] = _mm256_set_epi64x(0, offs[2], offs[1], offs[0]); - else + if(_mm256_extract_epi32(cnfg_reg_avx[static_cast(CnfgAvxRegs::RDMA_CTX_REG)], 0)) { + // Write to the upper and the lower register to transmit the data at over 256 bits + cnfg_reg_avx[static_cast(CnfgAvxRegs::RDMA_CTX_REG_1)] = _mm256_set_epi64x(0, offs[2], offs[1], offs[0]); + cnfg_reg_avx[static_cast