From 2bc3350951808c74c5a710791409c32feefd972e Mon Sep 17 00:00:00 2001
From: Navaneeth-Kunhi Purayil
Date: Fri, 3 Jan 2025 14:12:35 +0100
Subject: [PATCH] [spatz_vrf] added better conflict handling between FPU and VLSU

Replace the single-entry spill register between the FPU and the VRF with a
fall-through FIFO (fifo_v3, depth FpuBufDepth) and forward its occupancy to
the VRF. When an FPU write conflicts with a VLSU write on a bank, the VLSU
keeps priority until the FPU buffer holds at least two entries; from then on
the FPU write wins the bank.

The occupancy forwarded to the VRF is saturated when the buffer is full:
the usage signal is only $clog2(FpuBufDepth) bits wide, so a full FIFO of
depth 4 would otherwise read as 0 and hand priority back to the VLSU exactly
when the FPU is stalled on a full buffer.

The priority threshold in spatz_vrf.sv is hard-coded to 2, so FpuBufDepth
must be larger than 2 for the FPU-priority path to be reachable.

NOTE(review): with FALL_THROUGH enabled, fifo_v3's empty_o presumably depends
combinationally on push_i, and push_i here depends on buf_empty through
vrf_buf_en. Confirm this does not form a combinational loop.

Performance achieved for kernels:
1) axpy_4096 : 36.5%
2) dotp_4096 : 48.3%
3) matmul_64x64x64 : 97.8%
---
 hw/ip/spatz/src/spatz.sv      | 61 +++++++++++++----------
 hw/ip/spatz/src/spatz_vlsu.sv |  4 +-
 hw/ip/spatz/src/spatz_vrf.sv  | 93 +++++++++++++++++++++++++----------
 3 files changed, 105 insertions(+), 53 deletions(-)

diff --git a/hw/ip/spatz/src/spatz.sv b/hw/ip/spatz/src/spatz.sv
index 503a2ba..d9164f4 100644
--- a/hw/ip/spatz/src/spatz.sv
+++ b/hw/ip/spatz/src/spatz.sv
@@ -74,6 +74,9 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
   // Number of ports of the vector register file
   localparam int unsigned NrWritePorts = 4;
   localparam int unsigned NrReadPorts = 8;
+
+  // FPU buffer size (need at least a depth of 2 to hide conflicts)
+  localparam int unsigned FpuBufDepth = 4;
 
   /////////////
   // Signals //
@@ -98,9 +101,10 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
 
   // Signals for buffering of FPU
   logic vrf_buf_en;
-  logic vrf_vfu_wvalid;
-  logic vrf_buf_ready;
-  logic vrf_buf_valid;
+  logic vrf_vfu_wvalid;
+
+  logic buf_full, buf_empty;
+  logic [$clog2(FpuBufDepth)-1:0] buf_usage;
 
   // Buffer structure to track data information for writes from FPU to VRF
   typedef struct packed {
@@ -222,11 +226,12 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
     .rst_ni (rst_ni ),
     .testmode_i(testmode_i),
     // Write Ports
-    .waddr_i (vrf_waddr_buf ),
-    .wdata_i (vrf_wdata_buf ),
-    .we_i (vrf_we ),
-    .wbe_i (vrf_wbe_buf ),
-    .wvalid_o (vrf_wvalid),
+    .waddr_i (vrf_waddr_buf ),
+    .wdata_i (vrf_wdata_buf ),
+    .we_i (vrf_we ),
+    .wbe_i (vrf_wbe_buf ),
+    .wvalid_o (vrf_wvalid ),
+    .fpu_buf_usage_i (buf_full ? {$clog2(FpuBufDepth){1'b1}} : buf_usage),
     // Read Ports
     .raddr_i (vrf_raddr ),
     .re_i (vrf_re ),
@@ -326,24 +331,30 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
   );
 
 `ifdef BUF_FPU
-  // To add one cycle latency of buffering to ensure that conflicts that arise
-  // with the VLSU interfaces can be hidden
-  assign vrf_buf_en = sb_we[VFU_VD_WD] && (!vrf_wvalid[VFU_VD_WD] || (vrf_wvalid[VFU_VD_WD] && vrf_buf_valid));
-  spill_register #(
-    .T (vrf_buf_t)
+  // Buffering of FPU writes to VRF to hide the conflicts and achieve high FPU utilizations
+  assign vrf_buf_en = sb_we[VFU_VD_WD] && (!vrf_wvalid[VFU_VD_WD] || (vrf_wvalid[VFU_VD_WD] && !buf_empty));
+  fifo_v3 #(
+    .FALL_THROUGH (1'b1 ),
+    .dtype (vrf_buf_t ),
+    .DEPTH (FpuBufDepth )
   ) i_vfu_buf (
-    .clk_i (clk_i),
-    .rst_ni (rst_ni),
-
-    .valid_i (vrf_buf_en ),
-    .ready_o (vrf_buf_ready ),
-    .data_i ({vrf_wdata[VFU_VD_WD], vrf_waddr[VFU_VD_WD], vrf_wbe[VFU_VD_WD], sb_id[SB_VFU_VD_WD], vfu_rsp, vfu_rsp_valid}),
-
-    .valid_o (vrf_buf_valid ),
-    .ready_i (vrf_wvalid[VFU_VD_WD] ),
-    .data_o (vrf_buf_data )
+    .clk_i (clk_i),
+    .rst_ni (rst_ni),
+    .flush_i (1'b0),
+    .testmode_i (1'b0),
+
+    .full_o (buf_full),
+    .empty_o (buf_empty),
+    .usage_o (buf_usage),
+
+    .data_i ({vrf_wdata[VFU_VD_WD], vrf_waddr[VFU_VD_WD], vrf_wbe[VFU_VD_WD], sb_id[SB_VFU_VD_WD], vfu_rsp, vfu_rsp_valid}),
+    .push_i (vrf_buf_en && !buf_full),
+
+    .data_o (vrf_buf_data),
+    .pop_i (vrf_wvalid[VFU_VD_WD] && !buf_empty)
   );
-  assign vrf_vfu_wvalid = sb_we[VFU_VD_WD] && vrf_buf_ready;
+  assign vrf_vfu_wvalid = sb_we[VFU_VD_WD] && !buf_full;
+
 `endif
 
   always_comb begin
@@ -357,7 +368,7 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
     vfu_rsp_buf_valid = vfu_rsp_valid;
     // If buffer is used and has valid data, use the buffered data
 `ifdef BUF_FPU
-    if (vrf_buf_valid) begin
+    if (!buf_empty) begin
       sb_we_buf [VFU_VD_WD] = 1'b1;
       vrf_wdata_buf[VFU_VD_WD] = vrf_buf_data.wdata;
       vrf_waddr_buf[VFU_VD_WD] = vrf_buf_data.waddr;
diff --git a/hw/ip/spatz/src/spatz_vlsu.sv b/hw/ip/spatz/src/spatz_vlsu.sv
index 3b49c84..db57a0d 100644
--- a/hw/ip/spatz/src/spatz_vlsu.sv
+++ b/hw/ip/spatz/src/spatz_vlsu.sv
@@ -660,7 +660,9 @@ module spatz_vlsu
   // Ack when the vector store finishes, or when the vector load commits to the VRF.
   // With more than an interface, we need to wait until all the interfaces commit to the VRF.
   assign vlsu_rsp_o = &vrf_commit_intf_valid && |vrf_req_valid_q ? vrf_req_q[0].rsp : '{id: commit_insn_q.id, default: '0};
-  assign vlsu_rsp_valid_o = &vrf_commit_intf_valid && |vrf_req_valid_q ? |vrf_req_ready_q : vlsu_finished_req && !commit_insn_q.is_load;
+
+  // TODO : Check if this is the same and fix if required
+  assign vlsu_rsp_valid_o = spatz_mem_finished_o; //&vrf_commit_intf_valid && |vrf_req_valid_q ? |vrf_req_ready_q : vlsu_finished_req && !commit_insn_q.is_load;
 
   //////////////
   // Counters //
diff --git a/hw/ip/spatz/src/spatz_vrf.sv b/hw/ip/spatz/src/spatz_vrf.sv
index 126c2fd..4c132a0 100644
--- a/hw/ip/spatz/src/spatz_vrf.sv
+++ b/hw/ip/spatz/src/spatz_vrf.sv
@@ -10,7 +10,8 @@
 module spatz_vrf
   import spatz_pkg::*; #(
     parameter int unsigned NrReadPorts = 5,
-    parameter int unsigned NrWritePorts = 3
+    parameter int unsigned NrWritePorts = 3,
+    parameter int unsigned FpuBufDepth = 4
   ) (
     input logic clk_i,
     input logic rst_ni,
@@ -21,6 +22,7 @@ module spatz_vrf
     input logic [NrWritePorts-1:0] we_i,
     input vrf_be_t [NrWritePorts-1:0] wbe_i,
     output logic [NrWritePorts-1:0] wvalid_o,
+    input logic [$clog2(FpuBufDepth)-1:0] fpu_buf_usage_i,
     // Read ports
     input vrf_addr_t [NrReadPorts-1:0] raddr_i,
     input logic [NrReadPorts-1:0] re_i,
@@ -64,6 +66,10 @@ module spatz_vrf
   logic [NrVRFBanks-1:0] we;
   vrf_be_t [NrVRFBanks-1:0] wbe;
 
+  // Signals to handle conflicts between FPU and VLSU interfaces
+  logic [NrVRFBanks-1:0] w_vlsu_vfu_conflict;
+  logic [NrVRFBanks-1:0] w_vfu;
+
   // Read signals
   vregfile_addr_t [NrVRFBanks-1:0][NrReadPortsPerBank-1:0] raddr;
   vrf_data_t [NrVRFBanks-1:0][NrReadPortsPerBank-1:0] rdata;
@@ -92,32 +98,65 @@ module spatz_vrf
     // second priority has the LSU, and third priority has the slide unit.
     for (int unsigned bank = 0; bank < NrVRFBanks; bank++) begin
       // Bank write port 0 - Priority: vd (0) -> lsu (round-robin) <-> sld (round-robin)
-`ifdef BUF_FPU
-      // At the moment it is as if the VLSU ports have higher priority than the FPU.
-      if (write_request[bank][VLSU_VD_WD0]) begin
-        waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD0]);
-        wdata[bank] = wdata_i[VLSU_VD_WD0];
-        we[bank] = 1'b1;
-        wbe[bank] = wbe_i[VLSU_VD_WD0];
-        wvalid_o[VLSU_VD_WD0] = 1'b1;
-      end else if (write_request[bank][VLSU_VD_WD1]) begin
-        waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD1]);
-        wdata[bank] = wdata_i[VLSU_VD_WD1];
-        we[bank] = 1'b1;
-        wbe[bank] = wbe_i[VLSU_VD_WD1];
-        wvalid_o[VLSU_VD_WD1] = 1'b1;
-      end else if (write_request[bank][VFU_VD_WD]) begin
-        waddr[bank] = f_vreg(waddr_i[VFU_VD_WD]);
-        wdata[bank] = wdata_i[VFU_VD_WD];
-        we[bank] = 1'b1;
-        wbe[bank] = wbe_i[VFU_VD_WD];
-        wvalid_o[VFU_VD_WD] = 1'b1;
-      end else if (write_request[bank][VSLDU_VD_WD]) begin
-        waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]);
-        wdata[bank] = wdata_i[VSLDU_VD_WD];
-        we[bank] = 1'b1;
-        wbe[bank] = wbe_i[VSLDU_VD_WD];
-        wvalid_o[VSLDU_VD_WD] = 1'b1;
+`ifdef BUF_FPU
+      // Check if there is a conflict between FPU and the VLSU interfaces
+      w_vlsu_vfu_conflict[bank] = (write_request[bank][VLSU_VD_WD0] | write_request[bank][VLSU_VD_WD1]) & write_request[bank][VFU_VD_WD];
+      // If 2 conflicts (once with VLSU0 and VLSU1 each) encountered by FPU, then prioritize FPU
+      w_vfu[bank] = w_vlsu_vfu_conflict[bank] && (fpu_buf_usage_i >= 2'b10);
+      if (~w_vfu[bank]) begin
+        // Prioritize VLSU interfaces
+        if (write_request[bank][VLSU_VD_WD0]) begin
+          waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD0]);
+          wdata[bank] = wdata_i[VLSU_VD_WD0];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VLSU_VD_WD0];
+          wvalid_o[VLSU_VD_WD0] = 1'b1;
+        end else if (write_request[bank][VLSU_VD_WD1]) begin
+          waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD1]);
+          wdata[bank] = wdata_i[VLSU_VD_WD1];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VLSU_VD_WD1];
+          wvalid_o[VLSU_VD_WD1] = 1'b1;
+        end else if (write_request[bank][VFU_VD_WD]) begin
+          waddr[bank] = f_vreg(waddr_i[VFU_VD_WD]);
+          wdata[bank] = wdata_i[VFU_VD_WD];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VFU_VD_WD];
+          wvalid_o[VFU_VD_WD] = 1'b1;
+        end else if (write_request[bank][VSLDU_VD_WD]) begin
+          waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]);
+          wdata[bank] = wdata_i[VSLDU_VD_WD];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VSLDU_VD_WD];
+          wvalid_o[VSLDU_VD_WD] = 1'b1;
+        end
+      end else begin
+        // Prioritize FPU
+        if (write_request[bank][VFU_VD_WD]) begin
+          waddr[bank] = f_vreg(waddr_i[VFU_VD_WD]);
+          wdata[bank] = wdata_i[VFU_VD_WD];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VFU_VD_WD];
+          wvalid_o[VFU_VD_WD] = 1'b1;
+        end else if (write_request[bank][VLSU_VD_WD0]) begin
+          waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD0]);
+          wdata[bank] = wdata_i[VLSU_VD_WD0];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VLSU_VD_WD0];
+          wvalid_o[VLSU_VD_WD0] = 1'b1;
+        end else if (write_request[bank][VLSU_VD_WD1]) begin
+          waddr[bank] = f_vreg(waddr_i[VLSU_VD_WD1]);
+          wdata[bank] = wdata_i[VLSU_VD_WD1];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VLSU_VD_WD1];
+          wvalid_o[VLSU_VD_WD1] = 1'b1;
+        end else if (write_request[bank][VSLDU_VD_WD]) begin
+          waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]);
+          wdata[bank] = wdata_i[VSLDU_VD_WD];
+          we[bank] = 1'b1;
+          wbe[bank] = wbe_i[VSLDU_VD_WD];
+          wvalid_o[VSLDU_VD_WD] = 1'b1;
+        end
       end
 `else
       if (write_request[bank][VFU_VD_WD]) begin