From 45d0696b6229629c0d68ea7f21789b9568424b1f Mon Sep 17 00:00:00 2001 From: Cosmin Apreutesei Date: Wed, 30 Mar 2016 20:29:40 +0000 Subject: [PATCH 001/209] stub module and readme --- src/apps/mellanox/README.src.md | 4 ++++ src/apps/mellanox/connectx4.lua | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 src/apps/mellanox/README.src.md create mode 100644 src/apps/mellanox/connectx4.lua diff --git a/src/apps/mellanox/README.src.md b/src/apps/mellanox/README.src.md new file mode 100644 index 0000000000..65ec01f352 --- /dev/null +++ b/src/apps/mellanox/README.src.md @@ -0,0 +1,4 @@ +# Mellanox ConnectX-4 Ethernet Controller App + +## MCX4 (apps.mellanox.connectx4) + diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua new file mode 100644 index 0000000000..38b3c0a603 --- /dev/null +++ b/src/apps/mellanox/connectx4.lua @@ -0,0 +1,18 @@ + +--- Device driver for the Mellanox ConnectX-4 series Ethernet controller. + +module(...,package.seeall) + +local ffi = require "ffi" +local C = ffi.C +local lib = require("core.lib") +local pci = require("lib.hardware.pci") +local register = require("lib.hardware.register") +local index_set = require("lib.index_set") +local macaddress = require("lib.macaddress") +local mib = require("lib.ipc.shmem.mib") +local timer = require("core.timer") + +function selftest() + print'hello' +end From 8720a606a3e3110d5705a9e4679f53734c1d3466 Mon Sep 17 00:00:00 2001 From: Cosmin Apreutesei Date: Thu, 31 Mar 2016 02:14:37 +0300 Subject: [PATCH 002/209] pci init --- src/apps/mellanox/connectx4.lua | 50 ++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 38b3c0a603..43f5b6ebb7 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -13,6 +13,54 @@ local macaddress = require("lib.macaddress") local mib = require("lib.ipc.shmem.mib") local timer = 
require("core.timer") +ConnectX4 = {} +ConnectX4.__index = ConnectX4 + +function ConnectX4:new(arg) + local conf = config.parse_app_arg(arg) + local self = setmetatable({}, self) + + local pciaddress = conf.pciaddress + + pci.unbind_device_from_linux(pciaddress) + pci.set_bus_master(pciaddress, true) + local base, fd = pci.map_pci_memory(pciaddress, 0) + + --[[ + register.define(config_registers_desc, self.r, self.base) + register.define(transmit_registers_desc, self.r, self.base) + register.define(receive_registers_desc, self.r, self.base) + register.define_array(packet_filter_desc, self.r, self.base) + register.define(statistics_registers_desc, self.s, self.base) + register.define_array(queue_statistics_registers_desc, self.qs, self.base) + self.txpackets = ffi.new("struct packet *[?]", num_descriptors) + self.rxpackets = ffi.new("struct packet *[?]", num_descriptors) + return self:init() + ]] + + return self +end + + function selftest() - print'hello' + local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX40") or lib.getenv("SNABB_PCI0") + local pcidev2 = lib.getenv("SNABB_PCI_CONNECTX41") or lib.getenv("SNABB_PCI1") + if not pcidev1 + or pci.device_info(pcidev1).driver ~= 'apps.mellanox.connectx4' + or not pcidev2 + or pci.device_info(pcidev2).driver ~= 'apps.mellanox.connectx4' + then + print("SNABB_PCI_CONNECTX4[0|1]/SNABB_PCI[0|1] not set or not suitable.") + os.exit(engine.test_skipped_code) + end + + local device_info_1 = pci.device_info(pcidev1) + local device_info_2 = pci.device_info(pcidev2) + + print(device_info_1.model) + + local app1 = ConnectX4:new{pciaddress = pcidev1} + local app2 = ConnectX4:new{pciaddress = pcidev2} + + end From cd3a103ed84f68a744d3c7f7125d8f6ca960c8fa Mon Sep 17 00:00:00 2001 From: Cosmin Apreutesei Date: Thu, 31 Mar 2016 03:10:53 +0300 Subject: [PATCH 003/209] vendor/dev id --- src/lib/hardware/pci.lua | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index 
b1cc789f28..d057a27a5c 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -82,6 +82,9 @@ local cards = { ["0x1924"] = { ["0x0903"] = {model = 'SFN7122F', driver = 'apps.solarflare.solarflare'} }, + ["0x15b3"] = { + ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx4'} + }, } -- Return the name of the Lua module that implements support for this device. @@ -153,7 +156,7 @@ function root_check () end -- Return the canonical (abbreviated) representation of the PCI address. --- +-- -- example: canonical("0000:01:00.0") -> "01:00.0" function canonical (address) return address:gsub("^0000:", "") From dfbd705c03a13fb830d4be546f340a2177b64734 Mon Sep 17 00:00:00 2001 From: Cosmin Apreutesei Date: Thu, 31 Mar 2016 18:35:02 +0300 Subject: [PATCH 004/209] firmware revision register --- src/apps/mellanox/connectx4.lua | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 43f5b6ebb7..4dfcbe0f44 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -12,19 +12,36 @@ local index_set = require("lib.index_set") local macaddress = require("lib.macaddress") local mib = require("lib.ipc.shmem.mib") local timer = require("core.timer") +local bits, bitset = lib.bits, lib.bitset +local band, bor, lshift = bit.band, bit.bor, bit.lshift ConnectX4 = {} ConnectX4.__index = ConnectX4 +local init_segment_desc = [[ +fw_rev 0x0000 - RO Firmware Revision +]] + function ConnectX4:new(arg) - local conf = config.parse_app_arg(arg) local self = setmetatable({}, self) - - local pciaddress = conf.pciaddress + local conf = config.parse_app_arg(arg) + local pciaddress = pci.qualified(conf.pciaddress) pci.unbind_device_from_linux(pciaddress) pci.set_bus_master(pciaddress, true) local base, fd = pci.map_pci_memory(pciaddress, 0) + local r = {} --config registers + + register.define(init_segment_desc, r, base) + + 
print(r.fw_rev()) + + function self:stop() + if not base then return end + pci.set_bus_master(pciaddress, false) + pci.close_pci_resource(fd, base) + base, fd = nil + end --[[ register.define(config_registers_desc, self.r, self.base) @@ -41,7 +58,6 @@ function ConnectX4:new(arg) return self end - function selftest() local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX40") or lib.getenv("SNABB_PCI0") local pcidev2 = lib.getenv("SNABB_PCI_CONNECTX41") or lib.getenv("SNABB_PCI1") @@ -57,10 +73,11 @@ function selftest() local device_info_1 = pci.device_info(pcidev1) local device_info_2 = pci.device_info(pcidev2) - print(device_info_1.model) - local app1 = ConnectX4:new{pciaddress = pcidev1} local app2 = ConnectX4:new{pciaddress = pcidev2} + engine.main({duration = 1, report={showlinks=true, showapps=false}}) + app1:stop() + app2:stop() end From 34929f1b01a339fe4ed11bc33322b6374039a4d4 Mon Sep 17 00:00:00 2001 From: Cosmin Apreutesei Date: Thu, 31 Mar 2016 18:37:12 +0300 Subject: [PATCH 005/209] print firmware revision maj and min --- src/apps/mellanox/connectx4.lua | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 4dfcbe0f44..d990ac2856 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -13,7 +13,7 @@ local macaddress = require("lib.macaddress") local mib = require("lib.ipc.shmem.mib") local timer = require("core.timer") local bits, bitset = lib.bits, lib.bitset -local band, bor, lshift = bit.band, bit.bor, bit.lshift +local band, bor, shl, shr = bit.band, bit.bor, bit.lshift, bit.rshift ConnectX4 = {} ConnectX4.__index = ConnectX4 @@ -34,7 +34,8 @@ function ConnectX4:new(arg) register.define(init_segment_desc, r, base) - print(r.fw_rev()) + local rev = r.fw_rev() + print(band(rev, 0xffff), shr(rev, 16)) function self:stop() if not base then return end From fe8065ff13e2ba923e6206944e0b1241d7c0f01e Mon Sep 17 00:00:00 2001 From: Cosmin Apreutesei 
Date: Fri, 13 May 2016 13:43:51 +0300 Subject: [PATCH 006/209] init procedure until QUERY_HCA_CAP --- src/apps/mellanox/connectx4.lua | 644 ++++++++++++++++++++++++++++++-- 1 file changed, 623 insertions(+), 21 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index d990ac2856..e3f621aa13 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -1,4 +1,4 @@ - +--go@ git up --- Device driver for the Mellanox ConnectX-4 series Ethernet controller. module(...,package.seeall) @@ -13,14 +13,563 @@ local macaddress = require("lib.macaddress") local mib = require("lib.ipc.shmem.mib") local timer = require("core.timer") local bits, bitset = lib.bits, lib.bitset -local band, bor, shl, shr = bit.band, bit.bor, bit.lshift, bit.rshift +local floor = math.floor +local cast = ffi.cast +local band, bor, shl, shr, bswap, bnot = + bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot ConnectX4 = {} ConnectX4.__index = ConnectX4 -local init_segment_desc = [[ -fw_rev 0x0000 - RO Firmware Revision -]] +--utils + +--alloc DMA memory in 4K-sized chunks and return an uint32 pointer +local function alloc_pages(pages) + local ptr, phy = memory.dma_alloc(4096 * pages) + assert(band(phy, 0xfff) == 0) --the phy address must be 4K-aligned + return cast('uint32_t*', ptr), phy +end + +--get an big-endian uint32 value from an uint32 pointer at a byte offset +function getint(addr, ofs) + local ofs = ofs/4 + assert(ofs == floor(ofs)) + return bswap(addr[ofs]) +end + +--set a big-endian uint32 value into an uint32 pointer at a byte offset +function setint(addr, ofs, val) + local ofs = ofs/4 + assert(ofs == floor(ofs)) + addr[ofs] = bswap(val) +end + +--extract a bit range from a value +local function getbits(val, bit2, bit1) + local mask = shl(2^(bit2-bit1+1)-1, bit1) + return shr(band(val, mask), bit1) +end + +--extract a bit range from a pointer +local function ptrbits(ptr, bit2, bit1) + local addr = cast('uint64_t', 
ptr) + return tonumber(getbits(addr, bit2, bit1)) +end + +--fit a value into a bit range and return the resulting value +local function setbits1(bit2, bit1, val) + local mask = shl(2^(bit2-bit1+1)-1, bit1) + return band(shl(val, bit1), mask) +end + +--set multiple bit ranges and return the resulting value +local function setbits(...) --bit2, bit1, val, ... + local endval = 0 + for i = 1, select('#', ...), 3 do + local bit2, bit1, val = select(i, ...) + endval = bor(endval, setbits1(bit2, bit1, val or 0)) + end + return endval +end + +--get the value of a bit at a certain a bit offset from a base address +local function getbit(addr, bit) + local i = math.floor(bit / 32) + local j = bit % 32 + return getbits(getint(addr, i * 4), j, j) +end + +--init segment (section 4.3) + +local init_seg = {} +init_seg.__index = init_seg + +function init_seg:getbits(ofs, bit2, bit1) + return getbits(getint(self.ptr, ofs), bit2, bit1) +end + +function init_seg:setbits(ofs, ...) + setint(self.ptr, ofs, setbits(...)) +end + +function init_seg:init(ptr) + return setmetatable({ptr = cast('uint32_t*', ptr)}, self) +end + +function init_seg:fw_rev() --maj, min, subminor + return + self:getbits(0, 15, 0), + self:getbits(0, 31, 16), + self:getbits(4, 15, 0) +end + +function init_seg:cmd_interface_rev() + return self:getbits(4, 31, 16) +end + +function init_seg:cmdq_phy_addr(addr) + if addr then + --must write the MSB of the addr first + self:setbits(0x10, 31, 0, ptrbits(addr, 63, 32)) + --also resets nic_interface and log_cmdq_* + self:setbits(0x14, 31, 12, ptrbits(addr, 31, 12)) + else + return cast('void*', + cast('uint64_t', self:getbits(0x10, 31, 0) * 2^32 + + cast('uint64_t', self:getbits(0x14, 31, 12)) * 2^12)) + end +end + +function init_seg:nic_interface(mode) + self:setbits(0x14, 9, 8, mode) +end + +function init_seg:log_cmdq_size() + return self:getbits(0x14, 7, 4) +end + +function init_seg:log_cmdq_stride() + return self:getbits(0x14, 3, 0) +end + +function 
init_seg:ring_doorbell(i) + self:setbits(0x18, i, i, 1) +end + +function init_seg:ready(i, val) + return self:getbits(0x1fc, 31, 31) == 0 +end + +function init_seg:nic_interface_supported() + return self:getbits(0x1fc, 26, 24) == 0 +end + +function init_seg:internal_timer() + return + self:getbits(0x1000, 31, 0) * 2^32 + + self:getbits(0x1004, 31, 0) +end + +function init_seg:clear_int() + self:setbits(0x100c, 0, 0, 1) +end + +function init_seg:health_syndrome() + return self:getbits(0x1010, 31, 24) +end + +--command queue (section 7.14.1) + +local cmdq = {} +cmdq.__index = cmdq + +--init cmds +local QUERY_HCA_CAP = 0x100 +local QUERY_ADAPTER = 0x101 +local INIT_HCA = 0x102 +local TEARDOWN_HCA = 0x103 +local ENABLE_HCA = 0x104 +local DISABLE_HCA = 0x105 +local QUERY_PAGES = 0x107 +local MANAGE_PAGES = 0x108 +local SET_HCA_CAP = 0x109 +local QUERY_ISSI = 0x10A +local SET_ISSI = 0x10B +local SET_DRIVER_VERSION = 0x10D + +function cmdq:new(init_seg) + local ptr, phy = alloc_pages(1) + local ib_ptr, ib_phy = alloc_pages(1) + local ob_ptr, ob_phy = alloc_pages(1) + return setmetatable({ + ptr = ptr, + phy = phy, + ib_ptr = ib_ptr, + ob_ptr = ob_ptr, + init_seg = init_seg, + size = init_seg:log_cmdq_size(), + stride = init_seg:log_cmdq_stride(), + }, self) +end + +function cmdq:getbits(ofs, bit2, bit1) + return getbits(getint(self.ptr, ofs), bit2, bit1) +end + +function cmdq:setbits(ofs, bit2, bit1, val) + setint(self.ptr, ofs, setbits(bit2, bit1, val)) +end + +function cmdq:setinbits(ofs, ...) --bit1, bit2, val, ... + assert(band(ofs, 3) == 0) --offset must be 4-byte aligned + if ofs <= 16 - 4 then --inline + self:setbits(0x10 + ofs, ...) 
+ else --input mailbox + assert(ofs <= 16 - 4 + 4096) + setint(self.ib_ptr, ofs, setbits(...)) + end +end + +function cmdq:getoutbits(ofs, bit2, bit1) + if ofs <= 16 - 4 then --inline + return self:getbits(0x20 + ofs, bit2, bit1) + else --output mailbox + assert(ofs <= 16 - 4 + 4096) + return getbits(getint(self.ob_ptr, ofs), bit2, bit1) + end +end + +function cmdq:getoutaddr(ofs) + local ofs = (0x20 + ofs) / 4 + assert(ofs == math.floor(ofs)) + return self.ptr + ofs +end + +function cmdq:getbit(ofs, bit) + return getbit(self:getoutaddr(ofs), bit) +end + +local errors = { + 'signature error', + 'token error', + 'bad block number', + 'bad output pointer. pointer not aligned to mailbox size', + 'bad input pointer. pointer not aligned to mailbox size', + 'internal error', + 'input len error. input length less than 0x8', + 'output len error. output length less than 0x8', + 'reserved not zero', + 'bad command type', +} +local function checkz(z) + if z == 0 then return end + error('command error: '..(errors[z] or z)) +end + +function cmdq:post(last_in_ofs, last_out_ofs) + local in_sz = last_in_ofs + 4 + local out_sz = last_out_ofs + 4 + + self:setbits(0x00, 31, 24, 0x7) --type + + self:setbits(0x04, 31, 0, in_sz) --input_length + self:setbits(0x38, 31, 0, out_sz) --output_length + + self:setbits(0x08, 31, 0, ptrbits(self.ib_addr, 63, 32)) + self:setbits(0x0C, 31, 9, ptrbits(self.ib_addr, 31, 9)) + + self:setbits(0x30, 31, 0, ptrbits(self.ob_addr, 63, 32)) + self:setbits(0x34, 31, 9, ptrbits(self.ob_addr, 31, 9)) + + self:setbits(0x3C, 0, 0, 1) --set ownership + + self.init_seg:ring_doorbell(0) --post command + + --poll for command completion + while self:getbits(0x3C, 0, 0) == 1 do + C.usleep(1000) + end + + local token = self:getbits(0x3C, 31, 24) + local signature = self:getbits(0x3C, 23, 16) + local status = self:getbits(0x3C, 7, 1) + + checkz(status) + + return signature, token +end + +--see 12.2 Return Status Summary +function cmdq:checkstatus() + local status = 
self:getoutbits(0x00, 31, 24) + local syndrome = self:getoutbits(0x04, 31, 0) + if status == 0 then return end + error(string.format('status: 0x%x, syndrome: %d', status, syndrome)) +end + +function cmdq:enable_hca() + self:setinbits(0x00, 31, 16, ENABLE_HCA) + self:post(0x0C, 0x08) +end + +function cmdq:disable_hca() + self:setinbits(0x00, 31, 16, DISABLE_HCA) + self:post(0x0C, 0x08) +end + +function cmdq:query_issi() + self:setinbits(0x00, 31, 16, QUERY_ISSI) + self:post(0x0C, 0x6C) + self:checkstatus() + local cur_issi = self:getoutbits(0x08, 15, 0) + local t = {} + for i=0,80-1 do + t[i] = self:getbit(0x20, i) == 1 or nil + end + return { + cur_issi = cur_issi, + sup_issi = t, + } +end + +function cmdq:set_issi(issi) + self:setinbits(0x00, 31, 16, SET_ISSI) + self:setinbits(0x08, 15, 0, issi) + self:post(0x0C, 0x0C) + self:checkstatus() +end + +function cmdq:dump_issi(issi) + print(' cur_issi ', issi.cur_issi) + print(' sup_issi ') + for i=0,79 do + if issi.sup_issi[i] then + print(string.format( + ' %02d ', i)) + end + end +end + +local codes = { + boot = 1, + init = 2, + regular = 3, +} +function cmdq:query_pages(which) + self:setinbits(0x00, 31, 16, QUERY_PAGES) + self:setinbits(0x04, 15, 0, codes[which]) + self:post(0x0C, 0x0C) + self:checkstatus() + return self:getoutbits(0x0C, 31, 0) +end + +function cmdq:alloc_pages(addr, num_pages) + self:setinbits(0x00, 31, 16, MANAGE_PAGES) + self:setinbits(0x04, 15, 0, 1) --alloc + self:setinbits(0x0C, 31, 0, num_pages) + local addr = cast('char*', addr) + for i=0, num_pages-1 do + self:setinbits(0x10 + i*8, 31, 0, ptrbits(addr + 4096*i, 63, 32)) + self:setinbits(0x14 + i*8, 31, 12, ptrbits(addr + 4096*i, 31, 12)) + end + self:post(0x10 + num_pages*8, 0x0C) + self:checkstatus() +end + +local what_codes = { + max = 0, + cur = 1, +} +local which_codes = { + general = 0, + offload = 1, + flow_table = 7, +} +function cmdq:query_hca_cap(what, which) + self:setinbits(0x00, 31, 16, QUERY_HCA_CAP) + self:setinbits(0x04, + 
15, 1, assert(which_codes[which]), + 0, 0, assert(what_codes[what])) + self:post(0x0C, 0x100C - 3000) + self:checkstatus() + local caps = {} + if which_caps == 'general' then + caps.log_max_cq_sz = self:getoutbits(0x18, 23, 16) + caps.log_max_cq = self:getoutbits(0x18, 4, 0) + caps.log_max_eq_sz = self:getoutbits(0x1C, 31, 24) + caps.log_max_mkey = self:getoutbits(0x1C, 21, 16) + caps.log_max_eq = self:getoutbits(0x1C, 3, 0) + caps.max_indirection = self:getoutbits(0x20, 31, 24) + caps.log_max_mrw_sz = self:getoutbits(0x20, 22, 16) + caps.log_max_klm_list_size = self:getoutbits(0x20, 5, 0) + caps.end_pad = self:getoutbits(0x2C, 31, 31) + caps.start_pad = self:getoutbits(0x2C, 28, 28) + caps.cache_line_128byte = self:getoutbits(0x2C, 27, 27) + caps.vport_counters = self:getoutbits(0x30, 30, 30) + caps.vport_group_manager = self:getoutbits(0x34, 31, 31) + caps.nic_flow_table = self:getoutbits(0x34, 25, 25) + caps.port_type = self:getoutbits(0x34, 9, 8) + caps.num_ports = self:getoutbits(0x34, 7, 0) + caps.log_max_msg = self:getoutbits(0x38, 28, 24) + caps.max_tc = self:getoutbits(0x38, 19, 16) + caps.cqe_version = self:getoutbits(0x3C, 3, 0) + caps.cmdif_checksum = self:getoutbits(0x40, 15, 14) + caps.wq_signature = self:getoutbits(0x40, 11, 11) + caps.sctr_data_cqe = self:getoutbits(0x40, 10, 10) + caps.eth_net_offloads = self:getoutbits(0x40, 3, 3) + caps.cq_oi = self:getoutbits(0x44, 31, 31) + caps.cq_resize = self:getoutbits(0x44, 30, 30) + caps.cq_moderation = self:getoutbits(0x44, 29, 29) + caps.cq_eq_remap = self:getoutbits(0x44, 25, 25) + caps.scqe_break_moderation = self:getoutbits(0x44, 21, 21) + caps.cq_period_start_from_cqe = self:getoutbits(0x44, 20, 20) + caps.imaicl = self:getoutbits(0x44, 14, 14) + caps.xrc = self:getoutbits(0x44, 3, 3) + caps.ud = self:getoutbits(0x44, 2, 2) + caps.uc = self:getoutbits(0x44, 1, 1) + caps.rc = self:getoutbits(0x44, 0, 0) + caps.uar_sz = self:getoutbits(0x48, 21, 16) + caps.log_pg_sz = self:getoutbits(0x48, 7, 0) + 
caps.bf = self:getoutbits(0x4C, 31, 31) + caps.driver_version = self:getoutbits(0x4C, 30, 30) + caps.pad_tx_eth_packet = self:getoutbits(0x4C, 29, 29) + caps.log_bf_reg_size = self:getoutbits(0x4C, 20, 16) + caps.log_max_transport_domain = self:getoutbits(0x64, 28, 24) + caps.log_max_pd = self:getoutbits(0x64, 20, 16) + caps.max_flow_counter = self:getoutbits(0x68, 15, 0) + caps.log_max_rq = self:getoutbits(0x6C, 28, 24) + caps.log_max_sq = self:getoutbits(0x6C, 20, 16) + caps.log_max_tir = self:getoutbits(0x6C, 12, 8) + caps.log_max_tis = self:getoutbits(0x6C, 4, 0) + caps.basic_cyclic_rcv_wqe = self:getoutbits(0x70, 31, 31) + caps.log_max_rmp = self:getoutbits(0x70, 28, 24) + caps.log_max_rqt = self:getoutbits(0x70, 20, 16) + caps.log_max_rqt_size = self:getoutbits(0x70, 12, 8) + caps.log_max_tis_per_sq = self:getoutbits(0x70, 4, 0) + caps.log_max_stride_sz_rq = self:getoutbits(0x74, 28, 24) + caps.log_min_stride_sz_rq = self:getoutbits(0x74, 20, 16) + caps.log_max_stride_sz_sq = self:getoutbits(0x74, 12, 8) + caps.log_min_stride_sz_sq = self:getoutbits(0x74, 4, 0) + caps.log_max_wq_sz = self:getoutbits(0x78, 4, 0) + caps.log_max_vlan_list = self:getoutbits(0x7C, 20, 16) + caps.log_max_current_mc_list = self:getoutbits(0x7C, 12, 8) + caps.log_max_current_uc_list = self:getoutbits(0x7C, 4, 0) + caps.log_max_l2_table = self:getoutbits(0x90, 28, 24) + caps.log_uar_page_sz = self:getoutbits(0x90, 15, 0) + caps.device_frequency_mhz = self:getoutbits(0x98, 31, 0) + elseif which_caps == 'offload' then + --TODO + elseif which_caps == 'flow_table' then + --TODO + end + return caps +end + +function cmdq:set_hca_cap(which, caps) + self:setinbits(0x00, 31, 16, SET_HCA_CAP) + self:setinbits(0x04, 15, 1, assert(which_codes[which])) + if which_caps == 'general' then + self:setinbits(0x18, + 23, 16, caps.log_max_cq_sz, + 4, 0, caps.log_max_cq) + self:setinbits(0x1C, + 31, 24, caps.log_max_eq_sz, + 21, 16, caps.log_max_mkey, + 3, 0, caps.log_max_eq) + self:setinbits(0x20, + 31, 
24, caps.max_indirection, + 22, 16, caps.log_max_mrw_sz, + 5, 0, caps.log_max_klm_list_size) + self:setinbits(0x2C, + 31, 31, caps.end_pad, + 28, 28, caps.start_pad, + 27, 27, caps.cache_line_128byte) + self:setinbits(0x30, + 30, 30, caps.vport_counters) + self:setinbits(0x34, + 31, 31, caps.vport_group_manager, + 25, 25, caps.nic_flow_table, + 9, 8, caps.port_type, + 7, 0, caps.num_ports) + self:setinbits(0x38, + 28, 24, caps.log_max_msg, + 19, 16, caps.max_tc) + self:setinbits(0x3C, + 3, 0, caps.cqe_version) + self:setinbits(0x40, + 15, 14, caps.cmdif_checksum, + 11, 11, caps.wq_signature, + 10, 10, caps.sctr_data_cqe, + 3, 3, caps.eth_net_offloads) + self:setinbits(0x44, + 31, 31, caps.cq_oi, + 30, 30, caps.cq_resize, + 29, 29, caps.cq_moderation, + 25, 25, caps.cq_eq_remap, + 21, 21, caps.scqe_break_moderation, + 20, 20, caps.cq_period_start_from_cqe, + 14, 14, caps.imaicl, + 3, 3, caps.xrc, + 2, 2, caps.ud, + 1, 1, caps.uc, + 0, 0, caps.rc) + self:setinbits(0x48, + 21, 16, caps.uar_sz, + 7, 0, caps.log_pg_sz) + self:setinbits(0x4C, + 31, 31, caps.bf, + 30, 30, caps.driver_version, + 29, 29, caps.pad_tx_eth_packet, + 20, 16, caps.log_bf_reg_size) + self:setinbits(0x64, + 28, 24, caps.log_max_transport_domain, + 20, 16, caps.log_max_pd) + self:setinbits(0x68, + 15, 0, caps.max_flow_counter) + self:setinbits(0x6C, + 28, 24, caps.log_max_rq, + 20, 16, caps.log_max_sq, + 12, 8, caps.log_max_tir, + 4, 0, caps.log_max_tis) + self:setinbits(0x70, + 31, 31, caps.basic_cyclic_rcv_wqe, + 28, 24, caps.log_max_rmp, + 20, 16, caps.log_max_rqt, + 12, 8, caps.log_max_rqt_size, + 4, 0, caps.log_max_tis_per_sq) + self:setinbits(0x74, + 28, 24, caps.log_max_stride_sz_rq, + 20, 16, caps.log_min_stride_sz_rq, + 12, 8, caps.log_max_stride_sz_sq, + 4, 0, caps.log_min_stride_sz_sq) + self:setinbits(0x78, + 4, 0, caps.log_max_wq_sz) + self:setinbits(0x7C, + 20, 16, caps.log_max_vlan_list, + 12, 8, caps.log_max_current_mc_list, + 4, 0, caps.log_max_current_uc_list) + 
self:setinbits(0x90, + 28, 24, caps.log_max_l2_table, + 15, 0, caps.log_uar_page_sz) + self:setinbits(0x98, + 31, 0, caps.device_frequency_mhz) + elseif which_caps == 'offload' then + self:setinbits(0x00, + 31, 31, caps.csum_cap, + 30, 30, caps.vlan_cap, + 29, 29, caps.lro_cap, + 28, 28, caps.lro_psh_flag, + 27, 27, caps.lro_time_stamp, + 26, 25, caps.lro_max_msg_sz_mode, + 23, 23, caps.self_lb_en_modifiable, + 22, 22, caps.self_lb_mc, + 21, 21, caps.self_lb_uc, + 20, 16, caps.max_lso_cap, + 13, 12, caps.wqe_inline_mode, + 11, 8, caps.rss_ind_tbl_cap) + self:setinbits(0x08, + 15, 0, caps.lro_min_mss_size) + for i = 1, 4 do + self:setinbits(0x30 + (i-1)*4, 31, 0, caps.lro_timer_supported_periods[i]) + end + elseif which_caps == 'flow_table' then + --TODO + end + self:post(0x100C, 0x0C) + self:checkstatus() +end + +function init_seg:dump() + print('fw_rev ', self:fw_rev()) + print('cmd_interface_rev ', self:cmd_interface_rev()) + print('cmdq_phy_addr ', self:cmdq_phy_addr()) + print('log_cmdq_size ', self:log_cmdq_size()) + print('log_cmdq_stride ', self:log_cmdq_stride()) + print('ready ', self:ready()) + print('nic_interface_supported ', self:nic_interface_supported()) + print('internal_timer ', self:internal_timer()) + print('health_syndrome ', self:health_syndrome()) +end function ConnectX4:new(arg) local self = setmetatable({}, self) @@ -30,36 +579,89 @@ function ConnectX4:new(arg) pci.unbind_device_from_linux(pciaddress) pci.set_bus_master(pciaddress, true) local base, fd = pci.map_pci_memory(pciaddress, 0) - local r = {} --config registers - register.define(init_segment_desc, r, base) + local init_seg = init_seg:init(base) + + --allocate and set the command queue which also initializes the nic + local cmdq = cmdq:new(init_seg) + + --8.2 HCA Driver Start-up + + init_seg:cmdq_phy_addr(cmdq.phy) - local rev = r.fw_rev() - print(band(rev, 0xffff), shr(rev, 16)) + --wait until the nic is ready + while not init_seg:ready() do + C.usleep(1000) + end + + 
init_seg:dump() + + cmdq:enable_hca() + + local issi = cmdq:query_issi() + cmdq:dump_issi(issi) + + cmdq:set_issi(0) + + local boot_pages = cmdq:query_pages'boot' + print("query_pages'boot' ", boot_pages) + assert(boot_pages > 0) + + local bp_ptr, bp_phy = memory.dma_alloc(4096 * boot_pages) + assert(band(bp_phy, 0xfff) == 0) --the phy address must be 4K-aligned + cmdq:alloc_pages(bp_phy, boot_pages) + + local t = cmdq:query_hca_cap('cur', 'general') + print'query_hca_cap:' + for k,v in pairs(t) do + print('', k, v) + end + --[[ + cmdq:set_hca_cap() + cmdq:query_pages() + cmdq:manage_pages() + cmdq:init_hca() + cmdq:set_driver_version() + cmdq:create_eq() + cmdq:query_vport_state() + cmdq:modify_vport_context() + ]] function self:stop() if not base then return end + if cmdq then + cmdq:disable_hca() + end pci.set_bus_master(pciaddress, false) pci.close_pci_resource(fd, base) base, fd = nil end - --[[ - register.define(config_registers_desc, self.r, self.base) - register.define(transmit_registers_desc, self.r, self.base) - register.define(receive_registers_desc, self.r, self.base) - register.define_array(packet_filter_desc, self.r, self.base) - register.define(statistics_registers_desc, self.s, self.base) - register.define_array(queue_statistics_registers_desc, self.qs, self.base) - self.txpackets = ffi.new("struct packet *[?]", num_descriptors) - self.rxpackets = ffi.new("struct packet *[?]", num_descriptors) - return self:init() - ]] - return self end function selftest() + io.stdout:setvbuf'no' + + local ptr, phy = alloc_pages(1) + ptr[4] = bswap(1234) + assert(getint(ptr, 16) == 1234) + setint(ptr, 16, 4321) + assert(bswap(ptr[4]) == 4321) + assert(getint(ptr, 16) == 4321) + assert(getbits(0xdeadbeef, 31, 16) == 0xdead) + assert(getbits(0xdeadbeef, 15, 0) == 0xbeef) + assert(ptrbits(ffi.cast('void*', 0xdeadbeef), 15, 0) == 0xbeef) + assert(setbits(0, 0, 1) == 1) + assert(setbits(1, 1, 1) == 2) + assert(setbits(1, 0, 3) == 3) + local x = setbits(31, 16, 0xdead, 
15, 0, 0xbeef) + print(bit.tohex(x), type(x)) + --assert(x == 0xdeadbeef) + ptr[4] = bswap(2) + assert(getbit(ptr, 4 * 4 * 8 + 0) == 0) + assert(getbit(ptr, 4 * 4 * 8 + 1) == 1) + local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX40") or lib.getenv("SNABB_PCI0") local pcidev2 = lib.getenv("SNABB_PCI_CONNECTX41") or lib.getenv("SNABB_PCI1") if not pcidev1 From a6973da68fbe47695687fccfa9a7d0bd477eee58 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sun, 22 May 2016 08:24:22 +0000 Subject: [PATCH 007/209] lib.hardware.pci: Add reset_device() This function can be useful for resetting a device that has persistent state, for example the firmware state on a Mellanox ConnectX-4 device. --- src/lib/hardware/README.md | 5 +++++ src/lib/hardware/pci.lua | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/src/lib/hardware/README.md b/src/lib/hardware/README.md index 537b0b7c35..00a8b94913 100644 --- a/src/lib/hardware/README.md +++ b/src/lib/hardware/README.md @@ -62,6 +62,11 @@ Returns a table containing information about the PCI device by Returns the module name for a suitable device driver (if available) for a device of *model* from *vendor*. +— Function **pci.reset_device** *pciaddress* + +Reset a PCI device (function). Can be useful for returning the device +to a clean initial state. + — Function **pci.unbind_device_from_linux** *pciaddress* Forces Linux to unbind the device identified by *pciaddress* from any diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index d057a27a5c..19aba759e0 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -106,6 +106,18 @@ function is_usable (info) return info.driver and (info.interface == nil or info.status == 'down') end +-- Reset a PCI function. 
+-- See https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-bus-pci +function reset_device (pciaddress) + root_check() + local p = path(pciaddress).."/reset" + if lib.can_write(p) then + lib.writefile(p, "1") + else + error("Cannot write: "..p) + end +end + --- Force Linux to release the device with `pciaddress`. --- The corresponding network interface (e.g. `eth0`) will disappear. function unbind_device_from_linux (pciaddress) From ccdf38e2facccc6d65118994c4a103b093a8a57d Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sun, 22 May 2016 08:35:09 +0000 Subject: [PATCH 008/209] mellanox: Added text/hexdump trace of Linux driver This is useful because some differences that are subtle when comparing source code are obvious when comparing hexdumps. If the card does not respond to a command the way we expect then we can check what we are doing differently to the Linux driver. --- src/apps/mellanox/trace-mlx5_core.txt.gz | Bin 0 -> 66676 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/apps/mellanox/trace-mlx5_core.txt.gz diff --git a/src/apps/mellanox/trace-mlx5_core.txt.gz b/src/apps/mellanox/trace-mlx5_core.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..ca40de3e7440b9586d91e469f4f6b1255eb1ce6a GIT binary patch literal 66676 zcmd3P30#cZ|G%v+i7cfo$$qL5OJq)jTc zYZ)z+_K{|$re^-1^E@;5?Yi#o|9!nYo~QGibDr~gpU>Go=X2g8;X88VQ>#@vLxwxr z?>04Gyx-bI!N}Cs-dxJj#c@b`q^(1BPFXgUrDfc|F!b*98iRo{HKS!dVP^9V)+}?6 z4&8DhX~-nGT;bS&?PiP;mC;+zX1XSrwT4`d7gm4qc+%J*owX7Nk9w^f$SIx_{LE1# zWzQQu%DbKy8*B&eziwz>u`69ye@k=BGsg`2W}So>-@b*{2Kt|~4&8Qdydn|Pr*l%Ezy2G)xd3UlE zb!{1Ie%u|s8~44OSfsXK2Nhqbm^Q^)vJUZWSNUeq$oo;sJ@sbV8#Mk}yRFB)ZhH)e zSx|L3C8Vz<>(D^&fOLJuHb*(<9n5u&alR!bHq9L-N|n}ET&E1Sdr(5823z~@wXI{G z+vX%28PBSH*K9XX;_Jcgu5GF|GFsuv&Z0F{2iY2WFm5aRRy8!4%!lsUwMJ?c;ST$Y23nUy)X*AC%408=y=!;7t7M!sGIx}D zi=zHt$CtLWDMTiuN@VdT9dCPFq4*-`K!4m;`krTWOGZtxg-wQwbBQBWT8dQp+O~of zQP>|e_BbQ%yxtbeF7oyam8v(<1=m*(UDHe5V6@+F3On*fjMS1;quyTIXmyHj**XJP 
z>i)N8ZxWVxXPZ8+cD~gcQ@Gx;@TxLv z_G2}-cV&l|j@#!MNgUU>c`dl4$EhXo(!0V?-v;LttNkS%jr|9<@0wV|Y^t`*eb&=> zrG+!_c0r;>mhFw?ydb$G<-fL8Oxb-UDe0;8hP{EdTVnF=C%w6t*1zN3K(0qjP-8Q7 zi~C_Er9-s#a0!ihR0$QX)%~;dFFcHLPtjkbbE&>D z!8iVZ1+Cv?uz@~ZmfMBl(4q*rGcWqzDP1LrE0m5N=P(o#0aO_LH$z? z)#jZN+NFMI?&F*GyRJvPd|=`%)l5IXr$YCUcB6&eC4G-;RhQjeAE|7%=M+EB*HpW( zJ8@c&NXYe?jTZNw?A{P3qkATQirQ;g-O>3Aa?Fn94+&GQGU)M9HWRZwMqAJQ@Ag#N z)%@L)_;);XnSEN3`=7j+Eq`O7aoXh5ie5AenL5Fo@V;fX(ZMt2y|QYrR2pdPHDWBd zU$vS*{KC3=~p{!I#rrypIx>&@dzjTTC1P4@Zp{}j(6yrscRiPQtS$M=};?8 zc4wV8+EQ!1Y)W?C*m>zrOXI!C>CBF-F8|9F2^pshdsQ~X`S$c=+pnx~we+GH>|E&M z-R1vwSeX0UHGi=j^J;Um+A~C-UX5p6nR!g2b69H6yXNs~uMXBZQ=UDJkBOrXUUd$W zE}GxmS(@N?_PW?~>o*>h?zp0$qRabk?c6Y^y;8Ra}|mG4_pbbu6JXdc7m(kFbMR2M>*N9PY?6cPqIraicN2_o&L-N34>F zQgL5G!qGl*Tv&C*1J_lDa}^Ghq`Ibz?5xR}p0&un!u3YUfoL7kmEqAQ%~@72)?{q$ zZf7!^%Hr~i=Tk{#cA=f_zShge<}Xgp+ntc;b%Hck$i;V&8@bb3UV49;2IXFD&!kW~ z$=A(U;Z<431wFsqT}9ID-L*GFkC6;&4n{WBaT?Zd?3p*$Mf=IkHE zEuGc9^b*tjt#)@5dM(4}n-4U*u8*0t&CvP5{P<%#>{R?;cm6$#K5OE;{vo7>w&DAk z^P^X^vYYo=J$~rndor;zDVtAamuYi+Rx9i$$HkXE6dkhw{#L`<%O3YpK!ajy!KD64N`ua6t}Wmk2cLyjyHna`v1pyFRg~XwiND?FU&>S*wzk-M?^WTjl0utV+Ad#+J{^+J@_h zXBg{EtkN?-61g!sjrf#$M_A9z-dpsh{3ut_k((8NjQrMZY=8fd-3dbD=s&zYrNejR zy^iHKiRZ5=78G2cl(}cASRwzFD&r+nkGx)%ClfYieNy(GyJCftu2h*U(K~J7`^;K4 zZ1K{=yG|?m(kx`_7UhIzyChY%>_v}yp0o?bX>-MFM}74%|44FNXr^%+Tf4Pe-fP4a zrOZ8BWWpw%el2`ODSOWr*)XlIKU%+2ldaRVz8hwe7A0moIY;jV?ZWOf^Y1;LLD81J z+%GSG(pN|?ywf|YSkV6T>uFb%a`zO;hB?6FMJ4^?=>AdAM~_oijy|0{tTFz9i=<4Q z06cyX@X@=*_(v|1vUQtICu^ozjuWamR@b<4;K{{Dd!%H;HlKby;Y!LEk4Ik1=q{8E z6JMW{wdbH%;n=Aw#dE^5TpYw~$2J!3PE$Iq=uLYqQ#TC@tZaEBQ@8MRvOr_}LzlRZ z9v`{#XT`G^-D6ec{~i*Sr9gSO=Y~v}6Pcpvp9Ni>Jm*6vYb1tatEw z)-D^i@=u45MTbsX_+x*pPs-gx7b~28rOI@PD+HeTYkFg$Y1*OFivF~6le9aPEmiQp znC;>mJzp#UdmP<8=|<3}K?Qa4mAe+F>;8R^8M=xwVb0s#3-~Ke9(|+jXn8irVEKVi zu?cN8Z4Zu{sNQr)7#{4?A=a_TY3E(}ne|13v^z_W3#ii1BMmWvzXqlz% zcz#~0+C+nxQ&OdCb@pA}tPpr7%r0iQsMX~-g=eqAS`Mo_X3o2367*nh_=p=-a}GS4 zSDU^sY4g~v7D7hG)81V2x;nkPK#xBCbedkdK!;*fn&HvAeyhd{dffxzA*?_(%6G 
zA>d4F-VbS1My$|{mI z|M+Y2XR%egKxQ(3=!(+Z3DJ&9T7@lQkUT+T=B0^*Ba6L+_ev52pUh5|c^ zhxEd;ssY5Txq$*|A!-|>S8`xFS+#vX9HrQhGT z{BU16#lQSUSl=Fn!33eX(wBAIYb{Sa8GKb}h?s#(-$VJ z_7e7#Nr_h%?^JY|pRr{Wp{s(y2&4yPylLca z!pvh)$Hqa*oj>@)@#(p74>ebh+&pR~11Azvi>I3fl#g0sULc3-YahG`rs0k{hLOf zAqw>^7ZL+5;JiX|8^1?tQO_?M;+5IoX#K4Ak1X8U-Henr@x3=H{zI??(&dT`cg3@?R5iwmwOv5 zRzF)bso-(3Ma2fe;KUJ!&)QKceaMEKhxU?ki;^E2r-)1{h?h}r*0!!VP_f`_*4*@V z{U#Fp|6;_S&quGC*si?1onlBH7-(yHmDIJxuvdJkj=lEva-+uGep&G=Tdq-FXPtSH zYc5o3WFYCF-8r!!DcEYu6cx8ouN3~cK6AIi)!Kb~)XyHM+F-Tw9bul);ndO_hgAJ% z8nhY~-nVaibB&}jN_p3fFDH*r?;4!muD!OT&r!TrZ+(Z!WTy_P7a%D1)B>`r7A_)_{&;c3y<#(iM5p%8n2n-QebyuZSBG+n+rnZ^MC8b zmlnRQPnzTucRp2@=vIE&=8mRffp_t)()jY$oI`QzYWKz4L?nOn&nf8b`qKDk5ZUU{ z&SwiukH1P&Xz;99Aewfpr~IUB#h_aEl7Y&d7+a^^=UO6RyFoTiOxF7!=36lH_o z<84C9=(-Nksj->HR{d&PUdI~S*99|n1vf8qefBE7{FPFXq2r+hr$c^8q6T%62EAfL_7Qs9lC@*?9^2Wh ztKAi4L&!~eWqSOLM}yo_x3Ca@k$KuVKDV3Xh&kaqCpT*4n!BA=yd{?tBN9xLt$XxX zj9jjtlj|~3($ww6fyok$>eo-xVw<_KOD+_wtyPWH%W;vP)4G0Btz3iLHbu?W3zI=T zXGhtDzQX^U3rY)b6VKtXFW80tJ(b{8U=Ok6chxSlU0+(b$Eh)G@;1dQWoLRU*4CZ| z{RC#Hq)BdmM(@W_AdU;eOs&cfw~FXlGn!SWggnRwsu044PW!(@lKo1 zrHVu(Z!cZE14Yo>mb@K{ki9c-q3Jq^`Qbr~ywKDQg@`2_GKI7vrf%0x)9{CiS*vpn z6+99n6Ht^74{K}XkP51mE-nET1Qg{^OWJ!ad`kCKf|9~T@4u#;o7Sx6*r)d9?c7|Y zi3UWiRJVM)qP4-rccL4YHD4}h-V`jYRFu;4Hs$%N&3+D2scttM`_48-_)3u_4KkXr zC`jF!FWaPOD?)_U3sz2z%SKP7?pf-dZ#QXa#xrZTTyKrtjZ+JAoZXnh9|i1gEIC+O zYB5_j$7*RtW7;T`=D)hTXCIn~K0V;mGcRq=l|#=&xE+f~v%1T(^)AoeExUp>qqeo= zDJz&-UOtVBtlh3uF-3B9E1Bz)Xp(+OH)&@q8g-^wVzMMGD|(ot7tZ+a*&e z4P2qKaiIsoxEUyLGqB=jP{GTfubY57p8%*o_TnTW_7F#Rz=}> zg!jD)F4MQ-DMlz_Wwl*qi*JF!i6!vgsrXJwRT+fv`i(ApfAK%9>CGPiD-@kN%fngGDB^QO2 zxhWz}R_9X>ok$W{no(x$Mz)(QA$i@Rkrh$E6th*BN)<)jMqCv$19dHWg*gmA?h75> zi`U#2rxhiw-5_S2L+Ib(kOS&epd!wssFkWG(ezNCQkY{qCiqn9-lgtVcxd%8Jkl$K zz`mvK1qgvM>wyahYRGFbf?69=bNdlM&i8JZXsvzSG12`B(o;r-`=~fnZdX)zxm(MA zU-wI2(d6gR{rr=N8D(w*@wa0OwcXAohXqp3FTGRcBh;hq@D|=L)uY+f?b9aG%0jhW z`m}rAo{}!U{qRY0Sg3{@DO$uPqpZfQVf2#tn?3Os%@d6-?LAJmRZt-_C#XvblHw-! 
zKdRWXtG-3A@Y(zf)e?ct)O{C-v>jpEF8$g)?@mdV+Kw8*xY#ww8lh4W zvC)E)zn(d{Sxs(0V`*0L3d^%^JQ8QR^DhrN&4GWE4ARk4MRR&MW~KwD9p?=ll0Wpw z12xMDLjxj)j1+X5H*-k-_#+RtTTU1r5HWP*X{ULshUCvY@<88m0$)JHu#tiX=j|Sn zFMH&HnPr$JCBL6U8YOona$|H=zIxK|q{xjqRrzOd&-_~7tKitz z!A8-Kk2K#Y@y}e#iECVA;p+Q`YSHjxPw11z1xy<`GPGmytQkt<$A!#PUp?&X?4d^q z#dGG(P?|VStVhjr*x5Bhk17?v3QQu3?YT6YO9Bzy)yLZ-r_k+XDCe_C&p0o9DX)y=+Q&PbC%Cg5+5hVRC_r5 zY~#?Qr;6tgXDCUG6YEn;F73M8Ysxt_PIb%35hsX2`GVJJ^;yKzgPKhwt&!y;OwX~pcmp@*s1f5O`ntJFD{y&4BK*%BQ{d?@zF z<3#e=ex=pp0t86JxxEf2X3Ut>@3yJ6dEBZ$rQY?G7WW%)=0`ZSXBg>>cOz|Lq@*%@ zSr^(JoVt#(Y^s`D>t-{=+gS{cyo`6Q)ZWbl1A2B&U7@VAD8_LXl+3B?JS(i-p||Y} zYbT|EzQ3!65lEuzE&GG9zbe;5h2n1YY(Aaj{H9BU&FXW~f8)!#-0qO;&3?NufO}2n zUzaf0qZIDp7RvjyYMGcsh?{Ku>ARGIsSQuxe<#Z>D>#U}vsm zdv%C6(S+!Fd#dWFyIL7O%0$7t&gXKeB(H;F)?EvF9gHrC^H(cx8mw{NabjbFxQC#0xskZUB#*;Uk4V}zjTB~yB%w2@ zVLB(9Y`E8sEJ9=(#T3Bvl@;t~E7`s(>FW;-TyY%`^*DUyEpxt}1^eA5&Hn3^30Ev# zgU%5pI~5Wu>*hoi3-}K;iW$hJ&m%ytvc|X4H*O+}I1BEywmtjYGFyCjz#`Bys0#iW zyOyg!d>>_gH)i#0^s=lgi#k0J@}d)m(j_*&Now%3lfWNZ8M`^?%?)=vz@#mgJsZAY zJy_|1GKrU9`CmoCnqtxJy0=qi^-6wmQ$e2@RCYjD>W92IvxpUub-;Q*tPQ6YWcu+_9hhLV$K{A|8Gr_d! 
zwb(dmET}=MFBQ`(A0_@r+~`&rSoDEqnp31R3^DzBR*{WBO9H+4{Qhk{De|8kCCd}) z0Txexa2QwW(d`tyR_FWue01vK-ung+-gTkB_J;dWOl2x1XH{W}$8wgzb8y~hW{m63 zg*-g38Ut@Ua)_zEU|0JZ-eo4!`=UO#+l1A)hNHqtO}v$qr{h+ zhbdD{qXZt3KI=TLX1OYfn36D<8*q50bMo+}hN9vv0aq*0n)NuQZQgXYdxbtl$9j-h zliOb#98WDIU8j<&`uBI54t5rlcO8&0nw@-Q@A%BpW-YffHGb#TkI*7(K+mMCIc2;W ziI{ErphI=EnL%Sb?u`Vi=$s2jGkff?JV#JlQIk;DU|wQMI83Aoy@}`Sn)?2CE#D( z%{hgA_4wd?EYC@FG9I(i}Q8*ts#;rnivrAi-n$B&^_))p{82 z*SYB2O8ij-UJg0j944#bDc169;Q$9_mg0GWf_I{RE7!Oc$4@TgFkUBu(Rd}z@`}Z4 z4GQ_wQI=x3kkogyx2s?^dMkZj6*Do*;(F2brV#Mwufq%Z1f2`NKT( zk+hI`71As#fqC4HDTo_8@jJ1dFYSoH8rs1H#3~OtVM^_JZB%X0;>3N5gXz07ZVUB9 zkzS68DW=tXWciMRF8wA~h)G-_OmGBXK!-|D;KQrbNq?3%tHDWm<%zPRD?!#m}fZX!zZ%+YGJt0{6zAOjF=g6 z-@^(tv@OPvU(5M@=kkxxlM|*MTu@}(`YAkQfF$jnnCabM95%9ybW_I+;Wk`*_3=nN|meQkSlibcs!!X>SmT8CT|L`N>|6c*J zd`7t<{967Yo{b3K-n^{?_Yhbg9_GXv+4sf)R@b-}!YX<9phvDZT{PFbO9kttL5>{= zHOS=}LM=Dcb2~!qN09v2G46E{R0X4_V+DEy9`4BtII)KuA+GjDh0uzViW~Z$$WR zLB5s$iH6*<+0Z67w^r?-cx(%Jo6+pGp4L;@;}()QSV=uo^i@VD?8TVc?C|>K^3yquI~B z`$8}-e1zxE&;;&3#A3% z8qmGE9M_D|x~v4s9S_M)TJ!Pi25PI4kWgxPq<=!G^mU@sECi*+68nNUU8^wK;{W>0!*3eV> zJ06oXDGjX*`H=oDElm~YOHMB$Sy$aV3t~#9SL#H$ODzbaH^C?@=#>4R?*EuMcl9J20nI67^T}WhG zcXjntc8QdCwZ+`dqQj~JtRElWS{c16eqeM1DKeN~Em2{-D(P!zyR&&vf9nz>xAcRt zc&Fk3G zWMP>1VqMe0`S$YCW*xKa@1LxJSxU9uS{vtCit-^Zf>yBe5k7PLJ<#ezIlqxy(^w1& zr};vhPzahdGpaW5`IG;hfCNoSD=Z23eSU`m*V;rz z?0MG&R(GFl+C7+GJ{YYFNOZ2Xc@1q1!vaT6IWYXtOe@jA>c~7K{P>jGQ#JoocqqWY zi(LAo?;vQP`ldqmd}e7KOh(K72P>l7v*4z96}!#9db^rRghq}DApf~U|4D$uA%KZ@ zjmKcB1k*S4+TTLwy?*g)vCigtq~tZDo9+>t&s5RB$e|AFHFa;5>#6HI4g zV2Wz?qn~6U?+V`5u^s<77QW$(_ zcD*5>AG>=8Jg-;XYa!#5SFwEz8Z*RA1WuqaVyrA*7MdP-hj{Kr^M^HRnK#jpGc(#4 z8}G(x$)I_P(@BCD8giCD5*GYwa=~nGw9H*@wt(2aH3HJFq3!l=faQM(41I2UC88_6 zdnxtn(qQ`uS{;0?KI4@nv^M-ZSO|Py+6|5CDVCw~M6 zxa}}_@-W&SYvcs*e}R>pumrZ~4Y~5TwC_5^#w*RP=s$(sNc-qsj$-lHnES? 
z!j`KpX|i~-?i{XuOy%m$zlwmN(F0jxDg)oh+v-+%i3v6mI_xmP%yKckK9@q4W&Z>^ zFK?y?uV5FfWMd|Som!a502H13=YxM);AL9fy))ZkdroVsiz%x22*Ls!lp>qZU`-K{?8d4H4*-Ow z`iSC+^U*_acMNxn-FcB+6C1Z;Dmn#(8J}IHjCUiony^;fmU2Yy6G>Pg7`;W7uLNqrxK;S~1k^H+!hC^> z<@x>{sQ3Tk1q2YHU&c`8h7ykW%Y<$%`!pb{M7X^y4=1||X*kOWe)d<94XJlrdGvsz{%l3;GM zu`ZHeg4RuJdUA@e6G@QzNO%zffS3HnjNZ{V+#!A%SfATv&D(2Zs%tSQu2McBg zW4i%n3o_cb4#0-D8=3`-W_2qJ45CKHn9Rae2NMj&e93}Fa_#5HzywJPcD+6VBL#Xe zTuhP%8ae=vPFK{Ye-!6u0Q-?3Kf-%quY=;!NuSi0Jr(tsyc+rD2iDNcy&FJ;$v)D^ zfZ~+D)vfB&Nx-E+F~if)XImp{o4I->+M9)1|Lo5%mEfD7{7i>n4S)aEtzS0o} z(BG6?3g-XbJIIP;3^RqvnDFv^u+jr{`o{rlXs=E*ptPSp^ultswYP{^+7?OrJGj!D z4!&KS7UtGNGu!&}8aa0UDtIe`?}{cO2tqv{W9{pe+A4EQL5hU8V-@v35m3S14uFRK zzC0!Z8GcH*Rzu;j#vVcV`+EV<&g|FFASxpvBY_GoJKo^=)qbKuA^oNhl~q^e9fqz) z#DSLTd-zHuDa5l~ZPlye*LBra9gM#Cc6Irp=o!jd0Hf6;t$;}MISX07D)jw*KL`kO zkNk@qOidscapEf}!G$C}dKD$uizI)5>X=aBW@mJk)Y%(0oFP)xY?z>AZxjT7_G5-K zu%uO371bo0kzLbYKO`lCQ(c9AO4k7@#pA@igx%D${fqS|MPO+&Z&SRd(VA>MIFGS$ z@IpMbff!3o>}4Gwm)cf=57!0Wt=J%5N1oL+8(hu2bVg$jX6cjY)D5kc9%+l(DgLF! zY!}^14~bavAT1F_`a9Ugs`K9Uh*-*XPZFqR%}xmPcReF~m$AWcwgh7e!McmpkWrm( zIJ%nU2lhgAFXG;BBo^=V-BnrV98BtT*S%w26`sRw)G+|q46y609uu4qHbK*8=Q;iIg~VTMd*P0B=7pcy7kh!>20 zJtLQ4%waWwciP8yo6bhtqlvSlpt&r%xe}O!z1(J(*hKH^9r{#X&dH+t!uKFZ!alrRyfN(X<`a0&MZacK)tUOM|V1>eYyv~ zmcxa9b2Mo~cQ8@9Jo+H5#muOxz~>){|NlGSh9+enc#c_VQY=2)CSrBd7)26;CBMsF zi4&MXvIZaPb=TS)oRy547X}1sUY~--ua$JNrhToAxs1YSLZRO|Z3qZ>-mbd$_XVup zMDa6)_RK?bx+bL_84*9f^OWp;#IFS`Cg24FAE1R^vAU{z?_Pz?1kgG?O}wIb5iXpQ z3kf@Z&SLW>y!Pitq3v$&(C=UA{npgQj@uWVtg&7vg)5Cn3a(DtO*m$Hfd!xsf#LCa z_-CkTXiO*_cfLd=P{KL}zQu3KmM!D)AK6q&J zpdVTl_Njh){{*&FSn3ZxC$_^((U-J4EGWqWC9a>{u2C!&WI*XlI6<|^~@Qq3q@&ATh(~{I+VpO`XX(0 z`FB4V;@L&A?poyUanN4{D?g->#l*D~nDRdy5VdhI47_x~xi5Q6_Eae7rw@8=mJ+*o zmoO`UqnM`pOhSU)LnSVOW1~=;AfB`ts!phdFW@u9D!xc~30Ad$Cmvt|TW2J+!;7Hi z*6tqArSApQRBTtM;{j@mDG+>7k(X#H(j8P`XJXA&2czR8z8i&Gr#W0Taxuju$;ECBO&cah@2^M1CYT4j! 
zrsI8+5jWtgB7;1z-RiUg+zx)1Xt>t%5c;&C-Ajnw_k-$+AV7s)nBA+47LD&K7DorF z{IiF0!0Y`5lS&s()Bui`$Dj_JM#vok%9bs>9aFY=6M<`(vT4imm0=1bqO~;=eeXXF zXl1CqWMt&^lf-ky)VO8HEL|9VaXIRTJ`O+@fqhUWY=_y0@Hf`+LFrM)hvK1*4;2;l zc;L9Y5A}FE)K=X=%W=pLOMd>6%N?}NjT6j{L+jjK<_iovwz|Q#H+Nbvjuimrk;nM7 z3HunVaq(P46Y<<}%s&2VIj@6)h$6^I3we+JqjLRAv3@eSRZzbIn!$a6&KlZ2w6@ir z?4yI$ws#(HO?z>P`Thun_gwl+SlgE3_DCGd(Ykmp(q=gJu+au}0&9BcmOn@RE5!rn zAC+_%_x>SHk+A0-RRg0BuR_0)^~W{yBYpT$@=pK()6tF-4P|4q;iv>M`@>wo8f~ma zYII=p00cm}Fj#EAj*wt zD>W&mFm;~zp}is7!T6gRenX3d9r?8=9-$6{hO)Uwij4OKU2<_)Lwo<=4r`D>m=5&A zbl~0Q_B8{tX)PX(ZY`i9rwDr>HSG025?*2{uw2?~zllJYJYS?};!;_@!cb4hW1`m6 z2ttcs^lu7KSIcB%ASaL(G?V_#ng48w0U&Vi$1E;uXs=H+L}r%G!yZ;4U4grNm+0 zHHp=IMdJAY9I4z5CN|KhcMo9)1F6k%*xtCPr1f+(_?51j2xE5`%&~DO`L)7BJYR#c z|K9<&A5O)sqoNALJ~Ua4tHJ;00dm5&FgJqlfA&zV%zDbZ!YjSuNaZbj^@hY(TLoTQ zcrC!*Ha#$xYVfWW*1G?n+68>JhJkt$ylgP?d|X#*tMCBmvE21xvBP^;9oR39`%zk; zF&@N8LSwuK2mJ1x1;J-=g4O}dmOMB5C1#A@e4+ufY9EGkBbb?6da{oZSpLDZO?yr} z4ePm+q><3{$oK#UCTYyZpZ-W#AM4LQ4H(D>P>8VN3^=g$Zg~_~Me@0M28I^BIfbP^u)BiBkd5SwB>w zQ@MsQD1D@T4d*zv;0uRbGvS02xm5aIf4ndHtIU=UPHXb%P()HRT1E8-wZS51By;Re z!V~BcZFkJ!a4J2vwKjLEzVRItjdJf~sr8$1*2R+VY-l@9I%&iLh)8TBlEJ<+<$(ay*|_lQunGEHpU zSvd1pS$8V;NC2ET%%X2u!GEmDZzwpwHOS7G<1EO$nLg2f=*$$;2i_f#SKagBTqJ#q zCdI5Rw=q88c>!7lOV5l7Myq1O)tjWzDmnewIYYE+K0Jz$j#kmV6EqL&!m7G;?$vjR zXqmiO^^G}NHtU@0Uxy|<2@86C(Tv$}TdOGYUj69|g1=jWIpY;L=z2X*-J~>gSmU|EXIUDmXlAz@$11SYYM=<0U9BC^3ey`#-{{}r2K?z z=cC+@_p<78=~9{$IQYlCRm6R_gg=4N_eK1b4AfgI&`DLesSReZN7@=TH^OWn2WA7O zH@9c?f;nk`RUa_dqfzLCs8n?gd2*+?h6<{qp23MBZQ7Dz_n^&rD@Kj2g%kf(GJ2&|r zGtyy;AP`Qw1yV^}S$VgG$UWfHH5NDVu3Bqz00V2rEX8FAKC5woQ!zjlPUel-(gd2} zb+YE=+D>B`g^`3Yn5uXMJb#Il=*xhUgpozUf{-M{^QBrQ#Q2>7AP*Iee+x$Q%kpK0 zdS)8$8>clcf*^(ktOB2Dntx{LjOZE5v}DB8#+V3r%JYH7e>*@oH?*jkt2+kO3dze^(n;WQ2NrJOum-y}lmj+tA~!3z>^>#4$a_wE(NjAJp6@$-4-V zoohJRzP-Mkt~$^nww-8rhI~xU$O-!&T;NW*DFoCxtSHhwup8*LXR&3|7T}%IHnmmY z@*eEy!fw+v zWJ(!)QYIns(1Ya0Je)|Ykn*R<^9AA9gnH(W)&*f()H0inUk7O&qmAW4rT{9I0`oBO 
zKMlB**J$*lQ3YxM0E1jGNtSEL{BH(;8(v*5yA3<>0Zafcj%#T7#{15F`Y;*Bwf9#x zu?AKwh4`j{8niDQ328H~ykDSZhN;r~J%VQdsPR6~l1M@bUb(+LAcxQ!j>a&gIln#g z_nCeM(tFrK(G1{8S37>0NOZz(0UMh~{UlUH!fIIa0*wRg>fSP`bQ(G*fAZ58&^~`0 zfQ)Wu{%a(JwYpc1eoktk%!!_%gfv}Uv@N*CKnisKeTbZ}222Ft)n^ZF_q#|Ty^;8} z0~UMDi~RA`FrE}_1QWCZA_yU{-Fb?j3KD}8!6V^0SmWEMw#pv0#{V{e0G1b?FyKxG zT02jcFCSAZylB5U;Or$A%vca4m>q=n3c}15{MEiSjglmk#JhY+%mFdOC+~p*tHWsB zpb$?p7$+`Z;Xesk+7&U@l;ywDuB_lu}j_qx5a&3M_|s#(}C1iG)f0{T>fWu0l|wm zK}^YhFP;~L_~DauNiS7m9a(q_Ubo5gz4W@*q) zd0|FYB3}0$)bVbY&-E68lWF-xhpA(MH9)_Fxot*(YY#v%QRC_b${vk;sP4OppeaT&l=c?@6^fY{2>Z5qMg6qoc& z--OF`b^&Hl=XqB)ao}$wB5t)?GtdvjK?HlmB$j7w>r-QyJ)%+D!d0sY_}&n)5(-(# z4&`xEAp*19+`#AOaVjd;0eOsz3b{EUsxx3`!>QuP2~RN6DzJ&wt!o9ti-1wWM|T7C zH{!AaKSrG23;Y;)VhHfK&K`0p+b(kXw*(h_Bg(fq;xN$QBq2ru*S~{HHK&6!dx5tj zh7b5bawC*E(|7Z(9k+yfJhFW4`x>)6IMv>d>78$x&J3mw(lAO7jkEm5cI@$fE+r8| zkTE3|;YG8X<-7x@?(!Q7q+S^P+@Iz+E@Dq{sxxK&CA=tScu{cZZn&RG&^h~@W_Q(5 z0f$D1l7lgxHMCYR56^v1iPzYu`eq*v2vH{)I3r73 zE5x4i2+nTu$mCjh059j&K@qPGK8uo`+YilVRjm8FyjV&FcUP5Ot zg<)KzwoMqM)l)bf!==nf?6ZALDxAx;o7vVwAhH>+rhDk;^}G2b1XiD_IbPFtZ(UQ} zmL@DyU~SXWLZb^|`fwnJ!oFzw00@5&^O}nwuuJiXn|xto5V1>Pyk6W13D7H{l5HHk zPWTh(-hjpJ%K%RyM3n<9R^EtmVG`^px^cJ+Rc;uwnUw&MOJF6zWvminNpBoH8%M=b z+>L&jg)>-nW29;E1m?Gs1J9YVFl3L@cjJmHsL>%@}P=UHPEaBy_tTX(Lgg}7)px#V0+xma1?%w9mph>>Q6bGf=O z0yD_;>xse8i?ND*M5zTNO#6{Bt`4}n%oniJ;lKhgz>Uquj0VW!#DYnKU!^zA)FlvS zV=|cT5w5ljIeP(F7dlgkAmP%LP2(b!h0~Q`CociOv511G15BzD==rh_LOqMK(R*dB z?i6H$0**bRC^trP17b9TsNZ`q!Kv|>`k3(Yd}LoF{lVy$4!3arHa8%|#mMrCv)Ovc zV;(tK;NQc^^N2^eDJEhH1_k(D0HQV9Isj48L&bK|FfOB@2TTpRN5S%UK6uWPXTUZB zWpfnQA(Q-oE8-{&DJbL@0eRdQr+6`bCZJA}((#@L?`0O3?e72X;r=a0iGEw)Qv+8O z9+#E`m#`_*o6EA*4X2MVIwmFxSwrhM(ExE6+5H{9y^==g42A~mRuSJsl9+RD2@U3U zU+#xdk6rC8BxNR!u~ohF8m$YAkqU{C1m?@t<;_|v_<@;;n3ql@H^>6375c(Y6yjH- zWW*puF&-@@P8;|BFe>Qg$6~(!!p7EMwBJ*UQAvFjk@r!5A_{P%*(_jx2_w0`i0IY| zY8MM?7vn=}0Rt8Cj5?3boB}K`B>>vEP)`k@bH?guA!=CAG(Hmtu*B)_LOq2r;KvCfP?9U&I%Iltf zo^9|SQ3qm$!b(AXRRUJHA2DftNniyjT&XX?LBn`3!>}2+vpykfVsZ(Z(CLiXbadDd 
z8JY;Kc;13Xi%9D^DhJ8UuJ> zqQN6I`@Hz)bE&l#nctT))H8t-OJ!mFc{W@cyUaq24=sXlk4MUF?%i3Iv~#_;rmu5K(qF9aVa;33?w(80XXmy6Wrxdsx`F66Ah>kK=U;wy9>w| zYU(3N_kfe>z#q4SdULxQ8vTXpDMVroqGSa2zyp7vX;PzFrZ}4Z0EeD4B7t4@D+QP5 z`?t~qhZ|bYV&(X^d3`MtsXZQmK1{c3whllAVGM*!s6!k-W$M6PXvuG;(zCvZ<|Hn*pq14^`a*n5ep=DH29K)(t=zk;3V^-?Ah?xmO7p`RKk zN_QWe%W;e3=mA3-HKn54UuK|Av9@ghv+QAfX~i{d6l8JQl0=a<8|*)~f?cwb{q@K5 zIJ-J6!MY1}neMs+b((h<1GyOBBNTR1_A3(t6CRvHML$*GJh8z*5n1$-C)HtIbnuf7 zqrC2;Q>n%5r5A5;h(}q(PKD=i>h#bVw;HRf5V8&qz8*5=3`SJ3sUnQ!w(S%$HId<= zQ`43PZxJ~iyd}YpJs38h8Ds=j^f8M268#6A;UpVsOi1cwKOHylIehRh@bLp9O)&Qs4lp zV+3Ff9#7g=9zb}EY3_FkPhZ#@hjuqVzN0gDz(j|976C>ZMHTyS>S&uFP7X$lky5V?!(<5d3+7d?oG-9-qW9e7oDbs z)A<;p&$;I|;Ri1C+Sjl_ZF2BUJD?$ildBj7nhiSIins^S(N?4@+D(8qFri7>|5W|( z#D!IO?T3m)G{T4*Cb`tMx}VmYim|yR=wN(>3Ht`?fJRSQGnLm=c@6Ss11KIuwBd_U zuR$7|iw!dQ2LW#tc4Upoc6PCpIEY=lCEygHiNss~`gQ$R8nd89Va!m*I2YP7t()M{ ziV;gy0&mQSLjzujK~Ef5(d7Bihog-l%I^<=vvDtQGPqDr7@+Y2{8E75qTT}hY;4vK zvYPB8q$o^ADlSUrcPG`MZ=4Y@WwlfkOcKuJ_*%swq zRtEln|4HVM=}%+-8BRD;+PzCmDHz?}UnUaILA7W+dRZsnc`FjO=23zz1xm0r(>;R! 
zr~~c7UbUkiagd21RAOcXP%iAp9LAw@nHb$Oszol1S?23AIE|SxVr#=R@er2d6a@0gypO!m>h3_Wa zuik$bNaQeDOY;T94n%8-NYXs%EFN(a&^L}}jRS^aWL3fmzv7YeC6Cz(&*$I65{EXf zfz9zhLl;=#+Lq<~kP+;Axa{?AfL)YIun_=r z77T1ZNlEs;0Kdj)_Lbz0Ai2jh@IWXMp2n}$yrH!@V-AbQfe6k#P-KYm&jK)hIIyka zL~-vQis6Vx1n@U3s~F8XKz$oJ?>0jmU!4%-1h zCYN%FxmUV9tHLe`%uX@rIt6@tT~6sRmev!U=NITow6l5#2^{Tq2a*G@wcT|OXMrPJmpAtN2jB>o zEs+e(YvBj>vJ1*>9VFUBrS(*9 zGZatvrNhb7T2ZjFdB#!2feKDkf3PzTj+(yY{R9I%Ia?zCI%VA=1BlR^q4KKcJPdE; z%WK;yc@?UsI~DAcIJ1mq%cfT36|#L-@IPvSU!Y`3zZDyIMVY?_BANbqx>M``MDiah zn~HwU^5aO|gQC}2l^)oZ=TfdKh6QKt%DPkz@dM!#M(pGW<^XI;0L{*JF-!?0By5A> zxAqtm63mf+5$x`)V9`X2@hMfvbb>h3Fw+UJT)`S&95Tarp1fj}x??J)wKq~yG^TU9 zg8bVXH4?T>_h1zRIr}FBQa|F|`Xwf-542sMByU%J!-x}{8g-=T9C2#ZQO`|DYodzR z-!;B4!9weyyrZ2L!wSr3Dc_bq=a{di7z58O&qiCg2nvJS=mSSP1+SE zzL>N$gEj1=Wur154Sqs?#zUuPtK2vb865Gs4<*Fs{-q-7YT#mXa0R0(&q9XXwQ5xD0omBF<1H2&u#i>JRWnO0XIC4X+k5cwyQ;i{q84G|@{p)D zJJh0C4JA4nU9}2}a&_;)=U9||Vd4s;rEk+E%afGFWuo7M=ChRF`zx^gD zFrjXqllAq+-t@r1%By~b74L5g8kQ*A^;?^%2q+c$MV#=>FI)6_iLwJ0vb=C?B!J_8 zc}tR3O!0d9m768)m5|xsg3cw%&RCQcg=7ChQD$i*1z}O1mbABEQRqvQU9l*{!m(s5 z7mcJ6EJ}AtJM&EvP5hh-Kz9K_w~%Ii_599ek1W4Qoc5EMQ}vU^)wPf8N($_dI-#hX z#j-ZLKf0lCEDf^hyzG*sl?FA%3eXn2DC5ETPT5>!A?xf2m1C@K;pClMq^!A0c|B??H8cnU^Xat4W#Bq>Oa5+w^rPG9xR zE@##Y@BhB{cHG&X>h7vvRd-ca_h4*REHUFZ9eM{}7BLL62bx-^5oDQ9YE<4JRCix8 z`*j4N>f*~H4v>v-tluN-3Xt`j|Lv0526$8zaqCU*p>BXoY2(uxw>Jj7HNH6^np#pp z&b{KlKdreB?M$6Md&z7E5EJOUd|8e|SFAY?H9Ikq>rZPkF%tDlW_%cluP@6==}p&nPES)NBF_7 z;X|}%_AxbSI{;)70NLa58fmVXnpL(5shTQjHM~toLVta;5->J1hIf7}9}oy)lbYME zA`tGU)ttsiA}^WAU?32E1Edj&U8rW>IYct}tj4kewUTkkOc^7I@(qx|NWwJ5rL%L| zbm(Jz17tDeF76SIMaYdEVR0q0Wk*aLVPWC00xVqLyiSTnC^cU)Gr%B7@(s9vAg~K3 z%*EH~9ulFC3eBmh&}wZy97yMo9a@@O;Jj};-Q)8?&W=o$QtU1f2be_a^F0n45=Lq) zjM;i*n3b>c>*=YpII_`{=4LxfiEA-tss%rl-d&+&-su=#qu@8Mi{A!0 z-)4H~!gp)#z=}BY=9Zdqi3@<+)rJ=_ZXGScG4JPG7$-_y7*m(UFV$qH-L(&y)jEfA z8ZfIAgPSda*~sz&?+cc`N0xVyopv`OIW3oYJqCeuMAR_kErPiGl)v^;29GGQQ=*1X zu#$>X{+BEeqI>A_ehmaf4OiqO-PBU+KyuoyD9z16a&BV^boK-eUHJDgF$o-F3AhY( 
zSSbbQ?BPML`0EupT{jVa7NUl!&@6$j>6E`dMj}YJl9|zii-Or<32jjU{X*KDMN6nO z92eNPTwml7Ep|Bu%bHE_-{U+UrYWMoSHd6?z^c!cYoWuNEAPnrLbaFG*1g4*@vX>y)fWRI40Q_E zL{~Mxzna==8in^VmV@`M$SA)UapmKWbLCWmmuqU_TxG}L^j##b#!1`|NZkBJs{SXc zzQEpwX<~9M#_mlL!6A9yH&GKpfDa~oFiH5=B;mD5!pD+?uf>Juyo?L&Lu%zD-VNhp zyn5=pI1G&UkE(F`P({3*fh$~m0Xzhu(n3YY6P=DhCByMmI7~gn92r39iVm-rj1#M* zMCk0T!MRL*UW;H{qGnGL{^l=9$n3!TJfJ(aow)Ndhjn#7rZ?936r`_g`#x?=PoYYr zt_dXGrV(!x%xNnhr_PYroFuV1MWW@oMxtGa%lCeHH7>wh5)So%2zZQ}~EY`y{wKxGcPg|0vvT*tnZ*eqp z4RFDX6LB_vug3-6OsY;ts_rQ7^5*r`kwP?qW5eT-Aygbt8ocNHE^kN1{VUBG`}DxV z_8TZ~NLX(01aL0(IngG)khWsclEX%!3+%hEFWQxK?waUG#`XL4c#X598l0#G3?XlM zO2M-*5{rlw%+Dxkm29|mVFJ?sb?NoY`W@vHmmUv)dGq#)HgDEQnd%=s4|A4!%Q#Kt z{TVV>v5M`zW-W9g*Zhmcp3A@&m%-9hzj)||UU0FZgnV%9jj3Y+zmR)E&0@s=j|*nj zN6j|M=(4OMq4OEf3+#KWU~ao68kezgr;ogbkWydO%EeU1p(i>+14@>VZ|uhbD)Pln z-7$7MTbx!vCfK0c8UJp%jMLM>lQW^laoyJ&G|1ChiF>ahgU0WTU`nL?Uja*UjrI zv3cmdd)?P;Xt}}ofu>zPq*}OmP0jxgEbq#37JVi0(cqug=}VKe+AeEQ|7SNBdAJVG zU#h$ZM5wm(pg4e%WG9NYBbo{*nN))X8h2F=$Mnf~gg>FmASH+?PcvxJ<$9OZoQ;meA64 z@Gqn-h8V%vZ-ITs^+mQx(8+KH-n3;ih~?2`JzQ=H&rfvv1(odbsZ$qfnl>S%#xb@M za~Q$l{o**-8k9FW5yM8M{}M5>gf?|Yf|xd?x%Mj*CuQ7+b_$KY)&831R}g#bOju@c z5hhMdHZ*bK4gHwC?^K6beGTiE0)1Y?;Z4*y=FE1J`<-yjuScdj_cW<{`?B& zKkAyKGIxambyZeU4{=9r&HTch~>pg9~ z+>`(FAbidhh5;DzNA^tJtPHoBG#TZqx@h z77Jczf^`Kpw%r(p&`71Q&!@Ho&1ha1Rma8*z2G5i?0#90p1F|^8&1^Bo3K#;a|aZ- zycrx9deFeLkk*@$)-ALC__b^e)Y#bSW&@Z@))hU6Zn0M9NqQIz5@$<|HyxREozxkf z8j0Gzac)MRS5-a5JBTOShGSHNv8MlF;o&JhC*3hV zjo_>64#!(fbM%9X*R8Og5XX?AZHM&8^%X##aM|JU`1#2f65}>X zq>QGNsHuwZKMi)K$a_KP9lF^yy27K{RiSZbLz>F!si?xE_v<_kYKoZc_pcq}w@vbv zsVjdPnz>YcIG64tU3%$6Gmw)l@n`wK!(_@ggc1!mShT>IcM zC6#dmyQx#W3s9WyX1omH8*rvrnRS@Tm?tXIJ1!C#6KAgm7wl_~t){k;rDpmsNk$9P zz>L2N(AU5Kw$FvQac%Ty1rCNTaKl(5XL69WECp9arW zEU-;F?=3W`v|=Y_Bj14FV>9{*l3wqtV&B$g9xVL9wGCFP(nK-NBTV@!dT1cC@2>w?(rp1DbQTVljg82ptzwmP4ae;j^3R^cGc5D3UOKAzw2Abh(J7Tsl!_Wq0$T0SXilb<6bT; z&}h1$@MtE5{R?VK5Lx-UXbXnXFUEHYyw zuMa|)n)&Y)B)@sB$@=XogSp6bWSZ|-s3F_oG&qvVi;M?YYb5ulkExWB_O<|~Q;?7e 
zDHTcROyUhSQ$C^mco@-u1qla8LOb~;2dcZQ9bm~ef2s1;!dRQa#u_pdXcf1hFK`2i z#ZWX3-8kr)gN;$x)7A9%wB%~YZ;WoB#E|$&&b}rftleC*e_qz~UXFha)mv<8n(q^A z;Y6oxK*_c*ZXQpUsfLg*PDq@1lure*kj5TT(H)YC@IePA!P7WbpKVs z%=#4gzmQQL$eDuq0oEFO#zr~RyfXuYg-Usu&eqw7Fq?#QP%LJol!AYwfDSU!f*Lyo zmK)r{Mgyb>E@H#ui-MOX2OuK=1Lw1wXJB6Qmnv_!>+c14NJV0qVd(He+SizQ;f=$v z{DN2Bqa2&-meX{QNR{>Int$OkIP$+NAlRZ=;R~^s8@z8i;RAS6Fi#7t*-B*`ykN;( z@gJbqrlu3)`Z<*l)6MH-UP@6cp^_+E)PS^;C2MX(;3d6+YQDR3U1;bp&vM^pgF zhm7kuRM=b&N?gedlRe-7X;zFcl zs#57Ek>S?1(~t)b3`!u$`6I4QU&|iLbjE7b>XVJ^`_TQU$sZt(S8tTctAl9!Ta0Q5 zvCg1Kfc+DEK8XYlPw=u&Ok*)6u5OrsKCu)C3%=C*PcfgabrSx#xx6@)vBv`YHe^6Y zuZa$!&}MVSMl8*hER$za1JSkLV!_SGOiofTyF&uWmg!5<)y;RXC}^3%5o8&t24v%4 z%#hpq3IVxEWXoF@(iR|v%(L=nxInUVUC~Lt07=+bQr2_tvK}g*7_LZh&jPXT zixG-NI3g7GMap`B9H&A4*9z{HY!ePBg(n~1+$&LPVuDpKvm)twp*sCUt;xppMLtSA zZUpgRJNBArbQAd|W1~27`|8(4HFqN9=mpg=Mf~f6Gu}dGcrI=;H8US(G|?FwcVX7? zzbpV-lIYw8{eOC6Owi^vB46~5ZZN^qx-Lyp@KprM%W=M|e3P0YY7?I&P?G}FJ9 za&`0jnKLj~!=U`fph22WlV$EQ98~#vt)A%g2q@741Nii&bFxgejePB(XPeVM#TPCj ztO|n+hBz1%r02eb4W@zbDVoh^YDRQ8!nxGpGUa09AXl-Pi#@lL`-h`AnJ%-eD!O_= zS|gaqb|a}YFIhL%R5!hLzyElT75f}<+}V!FTQ@f1k#y1^q^m06Rne&dHG8IXoBj8h zg!x@IF!ikpUZHm)`t^F=+P2(H@UvBG=U7vouWqck)9BmWjfkM3p52r=&BYH+6=YWp z8U()?oT^$A9i>z)2n+9r)kZ7fxqrONemEduSSvJ3(=j{cgYL{mTki@<$mf+1{H5}= zd#Y<(&vXl3W)Yq;JjOEjoS@Nq+|g}{bfv()_nf=ydo6LN5;LW;<)aUZ>udUt%ViQ) zYV_-l1=aY@rKl>M4CqU1TM^UCq#4;zb=P_H7H@Tb4;%2P2ERc-^t(yRsfh!-3xZ4? 
zdLkmrU(1%OoMIE=!YZGaX*P!)j#%~uR#Lz19Gj@V<2)L24JmO8Q{T~rh&n+xb{nQ&7QI+{ zeo(sgiSCPBx`D8;v)1A5fD}rOY6#>n4^LzJ0v>BbIO5mR*9G{G%SI6+tNYttfPvWM zm~Goekk;kkj2GTJ7^5n$64jus6128%xnrLMb)86MR!5H6W4o(31y^rOOHaO;q*Pwp zXJ6xQ4_jv6vYVML_yzGcbaRlNKC^!IKGf_zsxp{x8i~1oyLGke-@YNX+Rq%Jo(iiL zBH*(~yM$lH7%9F1yuHr#8dJ-ol zA0mud1Te{>OzUxxDh*>WS>*GI3B6L{B81xBJZ{qut&it_Vv?t+@Z9Wgn-QiR*V zR34$`Jc;@X19FZPIyj7x_G6@hwfX0AV2BBUm?=m8g?@qkTvS`ZiI7>nV z%t)uRNY@P+r*XAYQnTH+PE+8h==)?~W2={GYb0h?=2GkcjKp*paC;kvGJx)wI)+hy z$f-?WSfjF0+wAhO0cRnVm?jX74MzDqyhxXY4W}`6!cD4b6-+;4uSTqLstgU-dMxCr z<$rhrX1ch6+LR^<;YKkJ|Vmf zK{_JaJQ{qWQ(riM2MWq3>LfaktBSldnT;=>aEEwm{(7PM0uoV^FsTFeId#$0^Ls#P{SdHAqI3-02-C5hHM>Rb4&$-1_YrU2^qS- z2>1fh`%Kd4M%6rpq|qS*95L89U?*Q_^T5by6!3XT!e<9btfwS=9-@pA;_)97*k#ts zV`8i99HYm?ge?QCO*$%LeGG_s5^XXvQ6yw?Nyz*!R3`{RZzaFqhEU_N9fm@GHqv;g7SGQN{fRjO54YR96C! zed`;^IGUk?naz4Z8cdQ!_L&GHtLB{;BrBPzkd^ga&sIjrK;sysXzkVr+@iG{0v|2@ ztgz0NF|AlZBBGxQwGQh_E0~JV#y|q#f|4X-UR~3DkB5heIX-wO@JPn#z1k^0@N>K* zfwz(*Lg`%S)Q2Py-szNv60YGsw0cbd@X!(xss3ZOW@g8FCZDUqc6C2}b0o^Gm{S{6 zBSupw1$7bQ4HV4rsD|D`TC4?V32PC|3*(zBLj{wARw}Q#$0E^9-{-53f4+49WJ+9s z%6Pt`N5ge!wcEfK&_saMZa)AsT0T)K?>GpV?*jmiI1H$5T160WgA+$=+WUdk09z0U zU{dHI`o~}^Janl*Ig|1UFb&Yjy%Nk;%nMBj zQ(e=fjetfuCpP-lNnUo@*;}h-%h?W3&l*P|AQsX#iC1UCoCAu6AOJ&(^z)zV9gyNO zfdA7v`|#6XN?K10LYQ_zUtl<2FW3+J+|(~PY?Hu-U}*DUwnojo<3BFY5JEFCwEo%6 zO>g>O@sV3Do83>#{0YUtR%izg%WJn;+A~e*#!K-)djTwr;mzYwa z3O-aQ?0--Ivct<0XwrdM?FY2~Jw-p4X+7GPM57w(9tIWoap2>#8(;A3^A}`G#-|e@ zt>lJI9)ouGV0J~#d+=PQ{!N>Zj|0G7- zJcjcx5RZfJ`wV=*hA{IRY&qwf%)+MI{7oNhkIY}OjKkB6cl}}B#=N`QbY075X77?NQitJ(lhdHfg4R@ z1wzhK8M`j9Z$<7BH1K&-->bhKm`2|9XRE6V*TJXdYjGuY^@3AP8I;Y00n6 zOHh;%iNI4xV1ytKwI$vG+9#(QGahBzuG@{W9i+=5`W~Aor+&>lE17S$I zD_6}$CAu+tCVluGA1ii-6q^!DvY}e*vgp@o?`^1z=dN-V9G=Q*Vt})3Kq73Q=HWbQ zm(C_Yi+=jeScJIbq2vI(zf~=yH`=Pgl78<-%`L(Sh8`bo`MiciH`5c|5o(=>D|r>t zTx^+S^Gqg$B;XK6d3#SH{;(A`iQrfSyeXj@#W-bS>%I=AXG8AUhM2Ww8uD)!>)n-J zI;!9DXTzC0(o5slO0*o-rnFinwf`Q|#Lc6bX(|$L;Sh&Zq*hb5*~)#}LzE?#+xM-v 
zkUEkXNJm?@|9!}v8~e92$4?3POg)Em1U$2pI`SHOHWYH_!T#<3E8%sc;E2{@IGXpa zCou_5dR=8{7c2_$O^%CHva!}}b4!<}t^?pzRZ`+2VV{YcKK=&<>)Z16=O{1Ibc}8= zjh&;|O=WC#N2u0j;jgwy3Y@wL^zpntbvapPIx4W&^0NhY;PlBCWfY^xua;YFUM!uP zSQR{Yu#ww8v2NhFPU@Wm>TUgAYXlVY>+IH;7v5&#Y3s|nozngSvF)EIFH!%uPTnb(+S1BX`bqp+*ouuPTV4@Q z!P7Qlb~{~wzi#_ZWg#CBC#Dh9R@JFB*(m0Ks{5a9jIZ=wQig zaVp~}@$T_Kc!gw)&2)NP2wpn<;h6fa0nJ25z@cph>N=v`WttANsK^Gn>VIFb`Z2W8 zBqe^_U(QOKSI$a>H?8z_-@}}XCte0vEJ%YK2g_&G#qS8owY>SV{722^n-DP9Ci-YN8!c;D z_ySl7LLxASPgJTF!7)IG9QZ%3?#AN2A7_f%5^t5K4d){D&d1!rvTn(-zb_AIGWuMu zAMyVKSpI4CdnGGT*2DZ$cs}<0_?t+x*zgkejXfBHwj95Q%h$B_UG|!tNyZ>RGLlA; z_AS)_?MMk8>(`t!1dlG{0I`6p-#Pt)CM>CuEy+=2~`a#{FGB!jyt1fcSq^uR z_S%r)+-bhtzz4YgX9XuZ9|x3}L)}kr#y$XA;0i1(T`z|&y=%o&{GX7tJ_)m2AXt-RWQ`j~s<|vCF%Fdp_ z3antAe}#Olf;Vw;ghcT3S$H4A2OB?zjLQJo*}u`)-_rUh)AtS2{*B=z_Ws(J;Daj> z;Q*ia4-}&zBm&ts7#W}4BtYOzz67Eehw(Su;A}g|au|jr7Sfi((5JS^>1hj zZR2B#`F8^o`3l4k3J?%KC_n&qx=?@s+`wtX%z98MI#SmUg8)mEgF>+Y0l*?xhvY8I zXh=c=&pOZ`Scfe8WLiv-foZW0ycoDAU|$yWIL2)SG6)u#1N zMQ16o@WPyfh5;olyn}@7g~A?AN5VjW=5b4k&QQTe3^g+qi6>NWplmrSI*m=LgN-px z1IF(a%tbt(>bqR`a<$+zL4@*p4h2_yaij5(nfp;G)hD{U7GD(B_AoP7q@g(dfbZ~r zvr8N&C;PlZ0uHr2cX+DwghTECdt{HH^`YBs8%y6lA3Oc_qyhVlHOxb!S@l_d&8HS` z6Kw0VOR^KlyuVb!Ais{A)bHY9pavCFcTDleoM40ym#;4~^+L zXL*j>-Eru$k=f~)RR7{abwkrpD@)HJrSnhc^xzNuC*DrujwsRW+*VLpI(K)UK0zY9 zDV1>5GEUB>pk1SRZEUTUK>CTY2rL;&Cm+nS=UR272Gn!*(Z$+_Q6|PcNZR!74Vg)95bKMXEhrbbG5OU?&~Vls+hB! 
zwX@o$H>ojH6{NIpY9eRyemH-RFULSz>%BJ@-ttTMMk~rz>^P`jyIxz<%6eR##%6PDgKJF0?Yf5)a2yUojmYd zEcW?itM|amiQDS#l=0>%_eN!3b;T*28~D)PQxiHgxtYZ|l|jj>U{$x?fsjm-vXUE* zMfP1C9O98&FR<=UCF!{o&js2W7V1epl?-s#Sr*A_ACi49_3>gK0Rf|~iGz*3uU)GQ zC)y}4lxxQ+Nz6^}p3sn1Fn1gt5}Fv+C72Q7C)p;RPdaK#^@a8J3~1R{b9t;(Q45f+ znkm;`FLE^8F*GJ8w7X?4mD7eOAgl05(p^h3aI0kc(MRFOju5+VkGT3|A2}*+;=TnW{+*qpr76Z2Hjpb8}k;=1%qK&rMnlO^%Ed`fc0U6rhl5Io4Y< z+%_2V_-KXhfn(}@tB!F5i`?9wej&WE~Im`fD08gV_>u*fN` za`JVSq_<U>bvr#iu;atj6|MmzF#sW zax&nJb6ZWyd9e|*K*g|^!#spZzNg6{`lX7+4l(D93I1tTSI;RbZ4`=$)n!wQ_F6YrPhhoylf*RAaJ-4a}r$NG{>KGc5ct|}*DlDeW~aJb?Ix1m)oxo~#hy}b;+W8;QqCHC9-CTEvSL>`yo>1}ORHZ=}!Y>KT}!#Wyr zLT^@w_1cw-krR69sf4~OEpMuwo9wwlv!V%OdXrrrj`}w*jmqUI_skfGd{3I_3Nl%>xT|6OvOjQpVZz;Z`Ikdn8WUd8DO#i( z>q40{4bCP$Fa33e2{di;rU6AC6*#ra=ME9YURFIR&3kxXfOvNL<|u2Xm`^{p)}eaIYF_%^`fx2v3#X|J zT(`TBtK~e^8n5C{K4xS7k-CAAx;LNg+ueSjU1VX+1qFs}oPMNFZ~tD63H~2^%u4~6 z`Hz3Qe8~`()3HluoDiI;WAqU0(X%`A`-n+-R5293m=DNJm<8RL{UorBPMr%ik&w zukp3C7z|oR$LUAyRZLmfaYMs6GdgJjNN0x?XkJVd&19ME1AhMc02ndWexH0&I{n-0 z;B(`%&$&=)FMaAe`W3~-3x|jV56&j~4NctwUB$cZ#J&FG^o;1V78P_f|JhR5;jUp^ z5S^r6L(^rtG-8d}^Oq6s4f~Iqjd9BId-wLsf;|-s5!EN8qpdk))40C;)HE0u3|;>G zlbpS!FyWy@orh_vqdG*3OOw8Fd=!;0t)EmnFqg^Qi=V9x^9;x;k{iUX7FmvxFwaxfZZaGK zO3rJILp)?S`VYu({8}M*I!5}S%3#=9t;%EUMPX}s(B;&yzgTWCdsLHnD4eDZ=!{-e zI_%QfLZ?~jz<@5Zuu2D?xYf(8sw5yT56X7wPaa=`{Xn|eOV_w)=UaZYy!qBOR?nP+J5+maDbFGv!)Fl2fF8~sxI_8`zacPY7 z=4Wx60>V7UOKG4q8++1Fjv9Dh=^(7CBUUW8%*$ zJ0y$fw0E%UDVm>B?cG^cLpRcH@*uc=sx6qaZc?_IMpde;hZtuxC0RcDwxW*p&B{fT zcH#0IDR#W4{m%}+3|sp=PWm^-DLq4}0*)BzyRsZmq*3X>ry~|D_t*WsxCcdYr!|S} z-zE{39vM2ng-*_X z3!NP=u zh+~}5nczcL1CRdIMGLu?BcNjP{nvVQ18~1!%H?x*5tT`lH0`N%zkI#9LveozqxO{+ zS63jBac1Uu!}-A#ok}hn0 zN|01#5-cnO>FmtrqoGaHU4}#1?MIE|`ud`#l9Mw` zEUYR@rTm}xpNR3O=~ef?dh4mw<-&f;D^WX2)5p6CpM@Vw+i$gpqeQx3W&5)OEd?Hh zj*doYLMf@^?+>0~aa}3e5t8TWWauYZ?MF<|EUaTKvz%HY(mPz>uaMD9``*=_MXT7152~z$mXy{p-fOjAcdylMMIc^!xWhu_a7V1; z1vca9$op=4Va7Vxam#J*{he&asn`|2v?iZfgxquQk$nGgrCI@NzC2z&c(r4h9cm4~ 
z%4r>sbR~O~)BuSC9gIVkO@2yGa6TF0)sadb3E+Ggk8~D5J zeVfR()wq8DM&lRx{+ng?Gu|t=WY{Oer)3SU_AV_~x1_-lK5;X_jrJ^I}w`$lD>T`OO{Xt==JL&v@KoLYR?ulZvt zEdvzNQa->{)W`yI&>49{YXU-7RusPG4y};k7+Jm!3nxQ7;@tUFc!RW7|K)JL>OMYp z7T=z|KXSc>K!s1{&<7q^eQmzSYkb$YeC86q(TYhlin2d)Ig4dEPLij7@;76?0CFCG zDe!B4f>B=SN^GS=Z@C=IGYCksdUIl=L5KI}&4Dcn59}?@tEgg57wn%Lx4Tr}W+0P> z45Y6ZZ!4OMWC)7>yL2?H<}CvGp#FGiixCKpvg4#5fww7;D{#=58CD^Ba*8b1X?F>J z(#!rRz_sN)(euqm_B~< zN9AYHdu@T-?F;MJ$+Oe9gVgUv=?-CQAyD{usL%7@5a#J?+7J{6suhj>*rWG-G%)$I z>TMG~Ut?(!V|g0M_Ky}`=JSJ=Je)1?bsUN>0KNI})PWU(RTRyUbxEa96x2ks0jBy$ zsAd)|z3DvFrLTn$YE7f6c!WFMy^+N~u4Stm(K9@yc~*v_EFvyCRajTD^5v8Ko}kz3 z>RjVm4tpG};u_T@hV-)U5O#Pa_qs%!w?BEi)SC!dj+8%^#Yle`4)dJ&FzTvDJ6tA` zYALKs4D@*}a&IUlLE3wbhHNEdY^P6|S$gE;?Mx7@j)HX;Yr=>S!M?Tf?qCY*wMPz1 zR7}%On$PL7jn?G$o|KS5G|m1OB3q@S7eyXY2{EXGsB@o%j>elma>8hC;U)|b_{Y;C zS{I`gX8jT!u`zA5Fy=#ZXx=4a{wjHnOE9Fcr;A9@n%V@vwT{@GxH4VmcKIbbTV{97 zuiEC(C{AX|h z&dQfQEICdrVq)S^2Qr1%gG-R8+U%;0mb&EGp&iU{QY%#J+Ben`pU$K6i=q4jL{Gyt z^uv~#u*&;%Ni)BM_-ti>QGL07_-xrl_8K|7I$}bkl{~&Q`PjD<$^EdQqHiuM$g2%p z7&mOSz2QOJXDcDRyN{PrpF)B2J@0kCx=O=qphS%4!YC{$L+<-Csma9v**wJGmpUKj z@b&R{?gkm$!>eNux2#0QMxKz0(z3X!r}{x1`MIpl4lT-D&2uICny=-0O)I zTQ>x6x~$KY78kANqbtXeqU(ISCB)ZeY31^z@F!|SUyhGe9*bwx2-bLZ)c2uwx75+H z?y(9RxrjvCo4j?m-n_q}L%g=rOF!h=d2(U}E0IZwv1ap3`64*3b&Ok0>(O!P#tvHx zmfJ_#T-2izT-5Kj?`Mc{O`qzIkW`$VPUKd9EHU=V`D{dTwsw`IK+BA&LY_pSXrZgb zV_8qu2-tM#kZ=!rd0wk3bm-ZI&Fw>+_Y}586o&Rpo@WRRneYqoDYoI_p=VW8eh}VM z<5YTWR{mat#Pt-pbv5fN`{=4um7SNEXO@M#?w(9mHhr#XE6(0!;@u!D9O0Ykt>0J_ z9xkcw9ZE-nuqKKW-l zwbsk%!($$Mf|jOm=nNTk)H0MP1SXFRGHTtuFHRd|+HkymS5iGe|LUg9$fKQXiG~e5 z)4IEAO(loI^`A!cG;*8IHH2GfH;z1IOYJZ4>S;V7AmOU}N7}ks<8UkYs4&CR)VbVM z&t+u?%AI=x`!}VG88!6OpZp**up+dLq2WrJiSP}-s^^(2?maAvd)m{LT$PkoKa=cT zAkqGzziNMjSwmge*x9M|BS|TZwiO5K4<{Y3q}vm3^Dg4Zy9+9#77a|5tL}c-<=Xrz z-Rq%o)86ST5y@u;#*)4DQu;mp9*Ym0p%0uknHBoS&#nRulh#Et{EICy)cz*zn-ag=l-#h>-}BDDLMl;Uj24$vp`jlWG}_eUz)3b zOE6)6S8`?DJ2Txs<>8mwqJ}K0WEQ4(^!%^-;CJM8y*#SqcC$a3{vfu?n=1H9Hl&vO 
z{o!#=HtqG(XSx&WFL>4(28})3w85#JDIgejY+Ae>5}zpTXn8<*dJ9@B)Be+{@ej9` ze%>ftPClEQ(})tABA?ky-ydPVuI)S6A6h zWf#+>xxWTASB4EP6y2$@#w=qahj zl__E2Tnf|NI`8BHE-62K^GizDWi?qfra}sN$105m$91+Vb)LT29bjtZQK`|8zRouO z?2g1eJ<-2;SI}s(#22~;q>Xnbri2~YMU^K%5Ju`Eon9wMO;(|^yo~$!fSC8W#U&Lg zrm|hBH+ye3Xiqu)<_3oxEC&9LnuCt2Oq# zAYP|i)D>MD*B00zR-oQqYxho$g_g7k%%7R4gQ)C7e}uYqH++&v{kS#OP9R0EB*ZYHv`W21)# zqRRqu54v383{^PZA6n?EC;VEd64ORn@YE{}QQ}lOR{7Nt4?H+W1h$trOd*krTvQZOEy!Uhuqqg@&`^ugpvY@3_mk%>dGG z)y*lK^4@FfPQUx{CeGUOD~%f{tL?nvy{A4ZjNp_htVL6jfZ2Ve z21{vjSl9D+(e&4iO=T3M4-__BeTR&l%SD;9FgXn7jl$Y08R^Q)IzSs)cx*t_TSE;P zT|wXsmG|}(P+8I!UG}agno8Jf?NhJ2IhB!mac!?s!q|Lyefq5wqzRzB)?xdcX6RFM zWRvq$orWjaM~zcvvtKN?j*>!UVqK+f?w<*97Lz&8u>qABmuW8Bph+{NFVY=R=rDU# zR)M-|TtQkSt}Q@8I%!XsJXexsZtvzn@vi7(Zl3PY!deZlzHkMvI(=`?WSa?_dJc&t J?bnwt_&?hgD^vgg literal 0 HcmV?d00001 From 276f4fa96663f9b23a7845e5f3b2d8f84ed968f2 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sun, 22 May 2016 08:36:50 +0000 Subject: [PATCH 009/209] connectx4: Progress on initialization routine Commands are being successfully executed towards the card. The full initialization procedure is not in place yet. Support for commands that span multiple input/output pages needs to be implemented. Current expected behavior when running the selftest is to successfully execute the commands ENABLE_HCA, QUERY_ISSI, QUERY_PAGES, MANAGE_PAGES, and then to fail in QUERY_HCA_CAP (likely because it has multipage output). --- src/apps/mellanox/connectx4.lua | 319 ++++++++++++++++++++++---------- 1 file changed, 220 insertions(+), 99 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index e3f621aa13..7d8d898cba 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -1,6 +1,55 @@ --go@ git up --- Device driver for the Mellanox ConnectX-4 series Ethernet controller. 
+-- This driver is written using these main reference sources: +-- +-- PRM: Mellanox Adapter Programmer's Reference Manual +-- This document will be made available on Mellanox's website. +-- Has not happened yet (as of 2016-05-24). +-- +-- mlx5_core: Linux kernel driver for ConnectX-4. This has been +-- developed by Mellanox. +-- +-- Hexdumps: The Linux kernel driver has the capability to run in +-- debug mode and to output hexdumps showing the exact +-- interactions with the card. This driver has a similar +-- capability. This makes it possible to directly compare +-- driver behavior directly via hexdumps i.e. independently of +-- the source code. + +-- Implementation notes: +-- +-- RESET: This driver performs a PCIe reset of the device prior to +-- initialization. This is instead of performing the software +-- deinitialization procedure. The main reason for this is +-- simplicity and keeping the code minimal. +-- +-- Relatedly, reloading the mlx5_core driver in Linux 4.4.8 +-- does not seem to consistently succeed in reinitializing the +-- device. This may be due to bugs in the driver and/or firmware. +-- Skipping the soft-reset would seem to reduce our driver's +-- exposure to such problems. +-- +-- In the future we could consider implementing the software +-- reset if this is found to be important for some purpose. +-- +-- SIGNATURE: +-- Command signatures fields: Are they useful? Are they used? +-- +-- Usefulness - command signature is an 8-bit value calculated +-- with a simple xor. What does this protect and how effective +-- is it? Curious because PCIe is already performing a more +-- robust checksum. Perhaps the signature is designed to catch +-- driver bugs? Or host memory corruption? Enquiring minds +-- would like to know... +-- +-- Used - the Linux driver has code for signatures but seems to +-- hard-code this as disabled at least in certain instances. +-- Likewise the card is accepting at least some commands from +-- this driver without signatures. 
It seems potentially futile +-- to calculate and include command signatures if they are not +-- actually being verified by the device. + module(...,package.seeall) local ffi = require "ffi" @@ -18,51 +67,47 @@ local cast = ffi.cast local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot +local debug = true + ConnectX4 = {} ConnectX4.__index = ConnectX4 --utils ---alloc DMA memory in 4K-sized chunks and return an uint32 pointer local function alloc_pages(pages) local ptr, phy = memory.dma_alloc(4096 * pages) assert(band(phy, 0xfff) == 0) --the phy address must be 4K-aligned return cast('uint32_t*', ptr), phy end ---get an big-endian uint32 value from an uint32 pointer at a byte offset function getint(addr, ofs) local ofs = ofs/4 assert(ofs == floor(ofs)) return bswap(addr[ofs]) end ---set a big-endian uint32 value into an uint32 pointer at a byte offset function setint(addr, ofs, val) local ofs = ofs/4 assert(ofs == floor(ofs)) addr[ofs] = bswap(val) end ---extract a bit range from a value local function getbits(val, bit2, bit1) local mask = shl(2^(bit2-bit1+1)-1, bit1) return shr(band(val, mask), bit1) end ---extract a bit range from a pointer local function ptrbits(ptr, bit2, bit1) local addr = cast('uint64_t', ptr) return tonumber(getbits(addr, bit2, bit1)) end ---fit a value into a bit range and return the resulting value local function setbits1(bit2, bit1, val) local mask = shl(2^(bit2-bit1+1)-1, bit1) - return band(shl(val, bit1), mask) + local bits = band(shl(val, bit1), mask) + return bits end ---set multiple bit ranges and return the resulting value local function setbits(...) --bit2, bit1, val, ... local endval = 0 for i = 1, select('#', ...), 3 do @@ -72,12 +117,6 @@ local function setbits(...) --bit2, bit1, val, ... 
return endval end ---get the value of a bit at a certain a bit offset from a base address -local function getbit(addr, bit) - local i = math.floor(bit / 32) - local j = bit % 32 - return getbits(getint(addr, i * 4), j, j) -end --init segment (section 4.3) @@ -109,6 +148,7 @@ end function init_seg:cmdq_phy_addr(addr) if addr then + print("addr", addr) --must write the MSB of the addr first self:setbits(0x10, 31, 0, ptrbits(addr, 63, 32)) --also resets nic_interface and log_cmdq_* @@ -174,9 +214,22 @@ local QUERY_PAGES = 0x107 local MANAGE_PAGES = 0x108 local SET_HCA_CAP = 0x109 local QUERY_ISSI = 0x10A +--local QUERY_ISSI = 0x010A +--local QUERY_ISSI = 0x0A01 local SET_ISSI = 0x10B local SET_DRIVER_VERSION = 0x10D +-- bytewise xor function used for signature calcuation. +local function xor8 (ptr, len) + local u8 = ffi.cast("uint8_t*", ptr) + local acc = 0 + for i = 0, len-1 do + acc = bit.bxor(acc, u8[i]) + end + return acc +end + +-- Create a command queue with dedicated/reusable DMA memory. function cmdq:new(init_seg) local ptr, phy = alloc_pages(1) local ib_ptr, ib_phy = alloc_pages(1) @@ -185,13 +238,23 @@ function cmdq:new(init_seg) ptr = ptr, phy = phy, ib_ptr = ib_ptr, + ib_phy = ib_phy, ob_ptr = ob_ptr, + ob_phy = ob_phy, init_seg = init_seg, size = init_seg:log_cmdq_size(), stride = init_seg:log_cmdq_stride(), }, self) end +-- Reset all data structures to zero values. +-- This is to prevent leakage from one command to the next. 
+function cmdq:reset() + ffi.fill(self.ptr, 4096, 0x00) + ffi.fill(self.ib_ptr, 4096, 0x00) + ffi.fill(self.ob_ptr, 4096, 0x00) +end + function cmdq:getbits(ofs, bit2, bit1) return getbits(getint(self.ptr, ofs), bit2, bit1) end @@ -219,55 +282,95 @@ function cmdq:getoutbits(ofs, bit2, bit1) end end -function cmdq:getoutaddr(ofs) - local ofs = (0x20 + ofs) / 4 - assert(ofs == math.floor(ofs)) - return self.ptr + ofs -end - -function cmdq:getbit(ofs, bit) - return getbit(self:getoutaddr(ofs), bit) -end - -local errors = { - 'signature error', - 'token error', - 'bad block number', - 'bad output pointer. pointer not aligned to mailbox size', - 'bad input pointer. pointer not aligned to mailbox size', - 'internal error', - 'input len error. input length less than 0x8', - 'output len error. output length less than 0x8', - 'reserved not zero', - 'bad command type', +-- "Command delivery status" error codes. +local delivery_errors = { + [0x00] = 'no errors', + [0x01] = 'signature error', + [0x02] = 'token error', + [0x03] = 'bad block number', + [0x04] = 'bad output pointer. pointer not aligned to mailbox size', + [0x05] = 'bad input pointer. pointer not aligned to mailbox size', + [0x06] = 'internal error', + [0x07] = 'input len error. input length less than 0x8', + [0x08] = 'output len error. output length less than 0x8', + [0x09] = 'reserved not zero', + [0x10] = 'bad command type', + -- Note: Suspicious to jump from 0x09 to 0x10 here i.e. skipping 0x0A - 0x0F. + -- This is consistent with both the PRM and the Linux mlx5_core driver. } + local function checkz(z) if z == 0 then return end - error('command error: '..(errors[z] or z)) -end + error('command error: '..(delivery_errors[z] or z)) +end + +-- Command error code meanings. +-- Note: This information is missing from the PRM. Can compare with Linux mlx5_core. 
+local command_errors = { + -- General: + [0x01] = 'INTERNAL_ERR: internal error', + [0x02] = 'BAD_OP: Operation/command not supported or opcode modifier not supported', + [0x03] = 'BAD_PARAM: parameter not supported; parameter out of range; reserved not equal 0', + [0x04] = 'BAD_SYS_STATE: System was not enabled or bad system state', + [0x05] = 'BAD_RESOURCE: Attempt to access reserved or unallocated resource, or resource in inappropriate status. for example., not existing CQ when creating QP', + [0x06] = 'RESOURCE_BUSY: Requested resource is currently executing a command. No change in any resource status or state i.e. command just not executed.', + [0x08] = 'EXCEED_LIM: Required capability exceeds device limits', + [0x09] = 'BAD_RES_STATE: Resource is not in the appropriate state or ownership', + [0x0F] = 'NO_RESOURCES: Command was not executed because lack of resources (for example ICM pages). This is unrecoverable situation from driver point of view', + [0x50] = 'BAD_INPUT_LEN: Bad command input len', + [0x51] = 'BAD_OUTPUT_LEN: Bad command output len', + -- QP/RQ/SQ/TIP: + [0x10] = 'BAD_RESOURCE_STATE: Attempt to modify a Resource (RQ/SQ/TIP/QPs) which is not in the presumed state', + -- MAD: + [0x30] = 'BAD_PKT: Bad management packet (silently discarded)', + -- CQ: + [0x40] = 'BAD_SIZE: More outstanding CQEs in CQ than new CQ size', +} function cmdq:post(last_in_ofs, last_out_ofs) local in_sz = last_in_ofs + 4 local out_sz = last_out_ofs + 4 + print("in_sz", in_sz, "out_sz", out_sz) self:setbits(0x00, 31, 24, 0x7) --type - self:setbits(0x04, 31, 0, in_sz) --input_length self:setbits(0x38, 31, 0, out_sz) --output_length - self:setbits(0x08, 31, 0, ptrbits(self.ib_addr, 63, 32)) - self:setbits(0x0C, 31, 9, ptrbits(self.ib_addr, 31, 9)) + self:setbits(0x08, 31, 0, ptrbits(self.ib_phy, 63, 32)) + self:setbits(0x0C, 31, 0, ptrbits(self.ib_phy, 31, 0)) - self:setbits(0x30, 31, 0, ptrbits(self.ob_addr, 63, 32)) - self:setbits(0x34, 31, 9, ptrbits(self.ob_addr, 31, 
9)) + self:setbits(0x30, 31, 0, ptrbits(self.ob_phy, 63, 32)) + self:setbits(0x34, 31, 0, ptrbits(self.ob_phy, 31, 0)) self:setbits(0x3C, 0, 0, 1) --set ownership + if debug then + local dumpoffset = 0 + print("command INPUT:") + dumpoffset = hexdump(self.ptr, 0, 0x40, dumpoffset) + if in_sz > 16 then + print("command block:") + dumpoffset = hexdump(self.ib_ptr, 0, (in_sz-16), dumpoffset) + end + end + self.init_seg:ring_doorbell(0) --post command --poll for command completion while self:getbits(0x3C, 0, 0) == 1 do - C.usleep(1000) + C.usleep(100000) + end + + if debug then + local dumpoffset = 0 + print("command OUTPUT:") + dumpoffset = hexdump(self.ptr, 0, 0x40, dumpoffset) + print("command block:") + if out_sz > 16 then + sz = out_sz-16 + dumpoffset = 0x10 + dumpoffset = hexdump(self.ob_ptr, 0, sz, dumpoffset) + end end local token = self:getbits(0x3C, 31, 24) @@ -279,32 +382,43 @@ function cmdq:post(last_in_ofs, last_out_ofs) return signature, token end ---see 12.2 Return Status Summary +-- see 12.2 Return Status Summary function cmdq:checkstatus() local status = self:getoutbits(0x00, 31, 24) local syndrome = self:getoutbits(0x04, 31, 0) if status == 0 then return end - error(string.format('status: 0x%x, syndrome: %d', status, syndrome)) + error(string.format('status: 0x%x (%s), syndrome: %d', + status, command_errors[status], syndrome)) end -function cmdq:enable_hca() - self:setinbits(0x00, 31, 16, ENABLE_HCA) - self:post(0x0C, 0x08) +function cmdq:prepare (command) + print("Execute: " .. 
command) + self:reset() end -function cmdq:disable_hca() - self:setinbits(0x00, 31, 16, DISABLE_HCA) +function cmdq:enable_hca() + self:prepare("ENABLE_HCA") + self:setinbits(0x00, 31, 16, ENABLE_HCA) self:post(0x0C, 0x08) end function cmdq:query_issi() + self:prepare("QUERY_ISSI") self:setinbits(0x00, 31, 16, QUERY_ISSI) self:post(0x0C, 0x6C) self:checkstatus() local cur_issi = self:getoutbits(0x08, 15, 0) local t = {} - for i=0,80-1 do - t[i] = self:getbit(0x20, i) == 1 or nil + for i = 639, 0, -1 do + -- Bit N (0..639) when set means ISSI version N is enabled. + -- Bits are ordered from highest to lowest. + local byte = 0x10 + math.floor(i / 8) + local offset = byte - (byte % 4) + local bit = 31 - (i % 32) + if self:getoutbits(offset, bit, bit) == 1 then + local issi = 639 - i + t[issi] = true + end end return { cur_issi = cur_issi, @@ -313,6 +427,7 @@ function cmdq:query_issi() end function cmdq:set_issi(issi) + self:reset() self:setinbits(0x00, 31, 16, SET_ISSI) self:setinbits(0x08, 15, 0, issi) self:post(0x0C, 0x0C) @@ -320,8 +435,8 @@ function cmdq:set_issi(issi) end function cmdq:dump_issi(issi) - print(' cur_issi ', issi.cur_issi) - print(' sup_issi ') + print(' cur_issi = ', issi.cur_issi) + print(' sup_issi = ') for i=0,79 do if issi.sup_issi[i] then print(string.format( @@ -336,6 +451,7 @@ local codes = { regular = 3, } function cmdq:query_pages(which) + self:prepare("QUERY_PAGES") self:setinbits(0x00, 31, 16, QUERY_PAGES) self:setinbits(0x04, 15, 0, codes[which]) self:post(0x0C, 0x0C) @@ -344,6 +460,7 @@ function cmdq:query_pages(which) end function cmdq:alloc_pages(addr, num_pages) + self:prepare("MANAGE_PAGES") self:setinbits(0x00, 31, 16, MANAGE_PAGES) self:setinbits(0x04, 15, 0, 1) --alloc self:setinbits(0x0C, 31, 0, num_pages) @@ -366,6 +483,7 @@ local which_codes = { flow_table = 7, } function cmdq:query_hca_cap(what, which) + self:prepare("QUERY_HCA_CAP") self:setinbits(0x00, 31, 16, QUERY_HCA_CAP) self:setinbits(0x04, 15, 1, 
assert(which_codes[which]), @@ -446,6 +564,7 @@ function cmdq:query_hca_cap(what, which) end function cmdq:set_hca_cap(which, caps) + self:prepare("SET_HCA_CAP") self:setinbits(0x00, 31, 16, SET_HCA_CAP) self:setinbits(0x04, 15, 1, assert(which_codes[which])) if which_caps == 'general' then @@ -576,10 +695,15 @@ function ConnectX4:new(arg) local conf = config.parse_app_arg(arg) local pciaddress = pci.qualified(conf.pciaddress) + -- Perform a hard reset of the device to bring it into a blank state. + -- (PRM does not suggest this but it is practical for resetting the + -- firmware from bad states.) + pci.reset_device(pciaddress) pci.unbind_device_from_linux(pciaddress) pci.set_bus_master(pciaddress, true) local base, fd = pci.map_pci_memory(pciaddress, 0) + trace("Read the initialization segment") local init_seg = init_seg:init(base) --allocate and set the command queue which also initializes the nic @@ -587,9 +711,10 @@ function ConnectX4:new(arg) --8.2 HCA Driver Start-up + trace("Write the physical location of the command queues to the init segment.") init_seg:cmdq_phy_addr(cmdq.phy) - --wait until the nic is ready + trace("Wait for the 'initializing' field to clear") while not init_seg:ready() do C.usleep(1000) end @@ -597,16 +722,17 @@ function ConnectX4:new(arg) init_seg:dump() cmdq:enable_hca() - local issi = cmdq:query_issi() cmdq:dump_issi(issi) + --cmdq:set_issi(1) - cmdq:set_issi(0) - + -- PRM: Execute QUERY_PAGES to understand the HCA need to boot pages. local boot_pages = cmdq:query_pages'boot' print("query_pages'boot' ", boot_pages) assert(boot_pages > 0) + -- PRM: Execute MANAGE_PAGES to provide the HCA with all required + -- init-pages. This can be done by multiple MANAGE_PAGES commands. 
local bp_ptr, bp_phy = memory.dma_alloc(4096 * boot_pages) assert(band(bp_phy, 0xfff) == 0) --the phy address must be 4K-aligned cmdq:alloc_pages(bp_phy, boot_pages) @@ -616,6 +742,7 @@ function ConnectX4:new(arg) for k,v in pairs(t) do print('', k, v) end + --[[ cmdq:set_hca_cap() cmdq:query_pages() @@ -628,11 +755,8 @@ function ConnectX4:new(arg) ]] function self:stop() - if not base then return end - if cmdq then - cmdq:disable_hca() - end pci.set_bus_master(pciaddress, false) + pci.reset_device(pciaddress) pci.close_pci_resource(fd, base) base, fd = nil end @@ -640,47 +764,44 @@ function ConnectX4:new(arg) return self end +-- Print a hexdump in the same format as the Linux kernel. +-- +-- Optionally take a 'dumpoffset' giving the logical address where the +-- trace starts (useful when printing multiple related hexdumps i.e. +-- for consistency with the Linux mlx5_core driver format). +function hexdump (pointer, index, bytes, dumpoffset) + local u8 = ffi.cast("uint8_t*", pointer) + dumpoffset = dumpoffset or 0 + for i = 0, bytes-1 do + if i % 16 == 0 then + if i > 0 then io.stdout:write("\n") end + io.stdout:write(("%03x: "):format(dumpoffset+i)) + elseif i % 4 == 0 then + io.stdout:write(" ") + end + io.stdout:write(bit.tohex(u8[index+i], 2)) + end + io.stdout:write("\n") + io.flush() + return dumpoffset + bytes +end + +function trace (...) + print("TRACE", ...) 
+end + function selftest() io.stdout:setvbuf'no' - local ptr, phy = alloc_pages(1) - ptr[4] = bswap(1234) - assert(getint(ptr, 16) == 1234) - setint(ptr, 16, 4321) - assert(bswap(ptr[4]) == 4321) - assert(getint(ptr, 16) == 4321) - assert(getbits(0xdeadbeef, 31, 16) == 0xdead) - assert(getbits(0xdeadbeef, 15, 0) == 0xbeef) - assert(ptrbits(ffi.cast('void*', 0xdeadbeef), 15, 0) == 0xbeef) - assert(setbits(0, 0, 1) == 1) - assert(setbits(1, 1, 1) == 2) - assert(setbits(1, 0, 3) == 3) - local x = setbits(31, 16, 0xdead, 15, 0, 0xbeef) - print(bit.tohex(x), type(x)) - --assert(x == 0xdeadbeef) - ptr[4] = bswap(2) - assert(getbit(ptr, 4 * 4 * 8 + 0) == 0) - assert(getbit(ptr, 4 * 4 * 8 + 1) == 1) - - local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX40") or lib.getenv("SNABB_PCI0") - local pcidev2 = lib.getenv("SNABB_PCI_CONNECTX41") or lib.getenv("SNABB_PCI1") - if not pcidev1 - or pci.device_info(pcidev1).driver ~= 'apps.mellanox.connectx4' - or not pcidev2 - or pci.device_info(pcidev2).driver ~= 'apps.mellanox.connectx4' - then - print("SNABB_PCI_CONNECTX4[0|1]/SNABB_PCI[0|1] not set or not suitable.") + local pcidev = lib.getenv("SNABB_PCI_CONNECTX4_0") + -- XXX check PCI device type + if not pcidev then + print("SNABB_PCI_CONNECTX4_0 not set") os.exit(engine.test_skipped_code) end - local device_info_1 = pci.device_info(pcidev1) - local device_info_2 = pci.device_info(pcidev2) - - local app1 = ConnectX4:new{pciaddress = pcidev1} - local app2 = ConnectX4:new{pciaddress = pcidev2} - - engine.main({duration = 1, report={showlinks=true, showapps=false}}) - - app1:stop() - app2:stop() + local device_info = pci.device_info(pcidev) + local app = ConnectX4:new{pciaddress = pcidev} + app:stop() end + From 6a426bde864338a93f2e3684f087cc99904ef2cc Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Mon, 23 May 2016 09:46:37 +0000 Subject: [PATCH 010/209] connectx4: Translate virtual to physical on demand The physical address of DMA memory can be determined at runtime (cheaply and 
reliably) using memory.virtual_to_physical(). Now we do this whenever we need a physical address rather than caching the value returned by memory.dma_alloc(). Just means less state to keep track of in our data structures. --- src/apps/mellanox/connectx4.lua | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 7d8d898cba..d93e69e338 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -231,16 +231,13 @@ end -- Create a command queue with dedicated/reusable DMA memory. function cmdq:new(init_seg) - local ptr, phy = alloc_pages(1) - local ib_ptr, ib_phy = alloc_pages(1) - local ob_ptr, ob_phy = alloc_pages(1) + local ptr = alloc_pages(1) + local ib_ptr = alloc_pages(1) + local ob_ptr = alloc_pages(1) return setmetatable({ ptr = ptr, - phy = phy, ib_ptr = ib_ptr, - ib_phy = ib_phy, ob_ptr = ob_ptr, - ob_phy = ob_phy, init_seg = init_seg, size = init_seg:log_cmdq_size(), stride = init_seg:log_cmdq_stride(), @@ -336,11 +333,13 @@ function cmdq:post(last_in_ofs, last_out_ofs) self:setbits(0x04, 31, 0, in_sz) --input_length self:setbits(0x38, 31, 0, out_sz) --output_length - self:setbits(0x08, 31, 0, ptrbits(self.ib_phy, 63, 32)) - self:setbits(0x0C, 31, 0, ptrbits(self.ib_phy, 31, 0)) + local inbox_phy = memory.virtual_to_physical(self.ib_ptr) + self:setbits(0x08, 31, 0, ptrbits(inbox_phy, 63, 32)) + self:setbits(0x0C, 31, 0, ptrbits(inbox_phy, 31, 0)) - self:setbits(0x30, 31, 0, ptrbits(self.ob_phy, 63, 32)) - self:setbits(0x34, 31, 0, ptrbits(self.ob_phy, 31, 0)) + local outbox_phy = memory.virtual_to_physical(self.ob_ptr) + self:setbits(0x30, 31, 0, ptrbits(outbox_phy, 63, 32)) + self:setbits(0x34, 31, 0, ptrbits(outbox_phy, 31, 0)) self:setbits(0x3C, 0, 0, 1) --set ownership @@ -712,7 +711,7 @@ function ConnectX4:new(arg) --8.2 HCA Driver Start-up trace("Write the physical location of the command queues to the init segment.") - 
init_seg:cmdq_phy_addr(cmdq.phy) + init_seg:cmdq_phy_addr(memory.virtual_to_physical(cmdq.ptr)) trace("Wait for the 'initializing' field to clear") while not init_seg:ready() do From f349c41973af27fbbab431736b7fb4ddc512aa8e Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 24 May 2016 08:29:44 +0000 Subject: [PATCH 011/209] core.memory: Add 'align' argument to dma_alloc() Now it is possible to request specific alignment for DMA memory. This is practical. For example, Mellanox ConnectX-4 requires specific alignments (e.g. 4KB). --- src/README.md | 5 ++++- src/core/memory.lua | 15 +++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/README.md b/src/README.md index f4dcfc4181..1f79fbfae6 100644 --- a/src/README.md +++ b/src/README.md @@ -339,10 +339,13 @@ can be accessed directly by network cards. The important characteristic of DMA memory is being located in contiguous physical memory at a stable address. -— Function **memory.dma_alloc** *bytes* +— Function **memory.dma_alloc** *bytes*, *[alignment]* Returns a pointer to *bytes* of new DMA memory. +Optionally a specific *alignment* requirement can be provided (in +bytes). The default alignment is 128. + — Function **memory.virtual_to_physical** *pointer* Returns the physical address (`uint64_t`) the DMA memory at *pointer*. diff --git a/src/core/memory.lua b/src/core/memory.lua index ec25748211..20f07408dd 100644 --- a/src/core/memory.lua +++ b/src/core/memory.lua @@ -23,13 +23,20 @@ chunks = {} -- Allocate DMA-friendly memory. -- Return virtual memory pointer, physical address, and actual size. 
-function dma_alloc (bytes) +function dma_alloc (bytes, align) + align = align or 128 assert(bytes <= huge_page_size) - bytes = lib.align(bytes, 128) - if #chunks == 0 or bytes + chunks[#chunks].used > chunks[#chunks].size then + -- Get current chunk of memory to allocate from + if #chunks == 0 then allocate_next_chunk() end + local chunk = chunks[#chunks] + -- Skip allocation forward pointer to suit alignment + chunk.used = lib.align(chunk.used, align) + -- Need a new chunk to service this allocation? + if chunk.used + bytes > chunk.size then allocate_next_chunk() + chunk = chunks[#chunks] end - local chunk = chunks[#chunks] + -- Slice out the memory we need local where = chunk.used chunk.used = chunk.used + bytes return chunk.pointer + where, chunk.physical + where, bytes From 8c55f664ba8681ea6c95caffb47d3161432595db Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 24 May 2016 08:41:57 +0000 Subject: [PATCH 012/209] connectx4: Request alignment on memory.dma_alloc() Alignment was already checked with an assertion but this would not necessarily succeed. --- src/apps/mellanox/connectx4.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index d93e69e338..b2f83419b4 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -75,7 +75,7 @@ ConnectX4.__index = ConnectX4 --utils local function alloc_pages(pages) - local ptr, phy = memory.dma_alloc(4096 * pages) + local ptr, phy = memory.dma_alloc(4096 * pages, 4096) assert(band(phy, 0xfff) == 0) --the phy address must be 4K-aligned return cast('uint32_t*', ptr), phy end @@ -732,7 +732,7 @@ function ConnectX4:new(arg) -- PRM: Execute MANAGE_PAGES to provide the HCA with all required -- init-pages. This can be done by multiple MANAGE_PAGES commands. 
- local bp_ptr, bp_phy = memory.dma_alloc(4096 * boot_pages) + local bp_ptr, bp_phy = memory.dma_alloc(4096 * boot_pages, 4096) assert(band(bp_phy, 0xfff) == 0) --the phy address must be 4K-aligned cmdq:alloc_pages(bp_phy, boot_pages) From 2c1a92dedd7f125b9ffef72059286d29c3687556 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Wed, 25 May 2016 12:17:30 +0000 Subject: [PATCH 013/209] connectx4: Add multi-mailbox command support Command inputs and outputs are now split into multiple chained mailbox records that each hold up to 512 bytes of data. This is mandatory for large messages. --- src/apps/mellanox/connectx4.lua | 182 +++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 62 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index b2f83419b4..9cb53cfd25 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -89,7 +89,7 @@ end function setint(addr, ofs, val) local ofs = ofs/4 assert(ofs == floor(ofs)) - addr[ofs] = bswap(val) + addr[ofs] = bswap(tonumber(val)) end local function getbits(val, bit2, bit1) @@ -229,44 +229,114 @@ local function xor8 (ptr, len) return acc end +local cmdq_entry_t = ffi.typeof("uint32_t[0x40/4]") +local cmdq_mailbox_t = ffi.typeof("uint32_t[0x240/4]") + +-- XXX Check with maximum length of commands that we really use. +local max_mailboxes = 10 +local data_per_mailbox = 0x230 -- Bytes of input/output data in a mailbox + -- Create a command queue with dedicated/reusable DMA memory. function cmdq:new(init_seg) - local ptr = alloc_pages(1) - local ib_ptr = alloc_pages(1) - local ob_ptr = alloc_pages(1) - return setmetatable({ - ptr = ptr, - ib_ptr = ib_ptr, - ob_ptr = ob_ptr, - init_seg = init_seg, - size = init_seg:log_cmdq_size(), - stride = init_seg:log_cmdq_stride(), - }, self) + local entry = ffi.cast("uint32_t*", memory.dma_alloc(0x40)) + local inboxes, outboxes = {}, {} + for i = 0, max_mailboxes-1 do + -- XXX overpadding.. 
0x240 alignment is not accepted? + inboxes[i] = ffi.cast("uint32_t*", memory.dma_alloc(0x240, 4096)) + outboxes[i] = ffi.cast("uint32_t*", memory.dma_alloc(0x240, 4096)) + end + return setmetatable({entry = entry, + inboxes = inboxes, + outboxes = outboxes, + init_seg = init_seg, + size = init_seg:log_cmdq_size(), + stride = init_seg:log_cmdq_stride()}, + self) end -- Reset all data structures to zero values. -- This is to prevent leakage from one command to the next. -function cmdq:reset() - ffi.fill(self.ptr, 4096, 0x00) - ffi.fill(self.ib_ptr, 4096, 0x00) - ffi.fill(self.ob_ptr, 4096, 0x00) +local token = 0xAA +function cmdq:prepare(command, last_input_offset, last_output_offset) + print("Command: " .. command) + local input_size = last_input_offset + 4 + local output_size = last_output_offset + 4 + + -- Command entry: + + ffi.fill(self.entry, ffi.sizeof(self.entry), 0) + self:setbits(0x00, 31, 24, 0x7) -- type + self:setbits(0x04, 31, 0, input_size) + self:setbits(0x38, 31, 0, output_size) + self:setbits(0x3C, + 0, 0, 1, -- ownership = hardware + 31, 24, token) + + -- Mailboxes: + + -- How many mailboxes do we need? 
+ local ninboxes = math.ceil((input_size - 16) / data_per_mailbox) + local noutboxes = math.ceil((output_size - 16) / data_per_mailbox) + print("ninboxes", ninboxes) + print("noutboxes", noutboxes) + if ninboxes > max_mailboxes then error("Input overflow: " ..input_size) end + if noutboxes > max_mailboxes then error("Output overflow: "..output_size) end + + if ninboxes > 0 then + local phy = memory.virtual_to_physical(self.inboxes[0]) + setint(self.entry, 0x08, phy / 2^32) + setint(self.entry, 0x0C, phy % 2^32) + end + if noutboxes > 0 then + local phy = memory.virtual_to_physical(self.outboxes[0]) + print("phy", phy, bit.tohex(phy / 2^32), bit.tohex(phy % 2^32)) + setint(self.entry, 0x30, phy / 2^32) + setint(self.entry, 0x34, phy % 2^32) + print(bit.tohex(getint(self.entry, 0x30)), bit.tohex(getint(self.entry, 0x34))) + end + + -- Initialize mailboxes + for i = 0, max_mailboxes-1 do + -- Zap old state + ffi.fill(self.inboxes[i], ffi.sizeof(self.inboxes[i]), 0) + ffi.fill(self.outboxes[i], ffi.sizeof(self.outboxes[i]), 0) + -- Set mailbox block number + setint(self.inboxes[i], 0x238, i) + setint(self.outboxes[i], 0x238, i) + -- Tokens to match command entry + setint(self.inboxes[i], 0x23C, setbits(23, 16, token)) + setint(self.outboxes[i], 0x23C, setbits(23, 16, token)) + -- Set 'next' mailbox pointers (when used) + if i < ninboxes then + local phy = memory.virtual_to_physical(self.inboxes[i+1]) + setint(self.inboxes[i], 0x230, phy / 2^32) + setint(self.inboxes[i], 0x234, phy % 2^32) + end + if i < noutboxes then + local phy = memory.virtual_to_physical(self.outboxes[i+1]) + setint(self.outboxes[i], 0x230, phy / 2^32) + setint(self.outboxes[i], 0x234, phy % 2^32) + end + end + token = (token == 255) and 1 or token+1 end function cmdq:getbits(ofs, bit2, bit1) - return getbits(getint(self.ptr, ofs), bit2, bit1) + return getbits(getint(self.entry, ofs), bit2, bit1) end -function cmdq:setbits(ofs, bit2, bit1, val) - setint(self.ptr, ofs, setbits(bit2, bit1, val)) 
+function cmdq:setbits(ofs, ...) + setint(self.entry, ofs, setbits(...)) end function cmdq:setinbits(ofs, ...) --bit1, bit2, val, ... - assert(band(ofs, 3) == 0) --offset must be 4-byte aligned + assert(ofs % 4 == 0) if ofs <= 16 - 4 then --inline self:setbits(0x10 + ofs, ...) else --input mailbox - assert(ofs <= 16 - 4 + 4096) - setint(self.ib_ptr, ofs, setbits(...)) + local mailbox = math.floor((ofs + 4 - 16) / data_per_mailbox) + local offset = (ofs + 4 - 16) % data_per_mailbox + setint(self.mailboxes[mailbox], offset, setbits(...)) end end @@ -274,8 +344,11 @@ function cmdq:getoutbits(ofs, bit2, bit1) if ofs <= 16 - 4 then --inline return self:getbits(0x20 + ofs, bit2, bit1) else --output mailbox - assert(ofs <= 16 - 4 + 4096) - return getbits(getint(self.ob_ptr, ofs), bit2, bit1) + local mailbox = math.floor((ofs - 16) / data_per_mailbox) + local offset = (ofs - 16) % data_per_mailbox + local b = getbits(getint(self.outboxes[mailbox], offset), bit2, bit1) + print("cmdq:getoutbits", mailbox, bit.tohex(offset, 4), bit2, bit1, b) + return b end end @@ -325,31 +398,18 @@ local command_errors = { } function cmdq:post(last_in_ofs, last_out_ofs) - local in_sz = last_in_ofs + 4 - local out_sz = last_out_ofs + 4 - print("in_sz", in_sz, "out_sz", out_sz) - - self:setbits(0x00, 31, 24, 0x7) --type - self:setbits(0x04, 31, 0, in_sz) --input_length - self:setbits(0x38, 31, 0, out_sz) --output_length - - local inbox_phy = memory.virtual_to_physical(self.ib_ptr) - self:setbits(0x08, 31, 0, ptrbits(inbox_phy, 63, 32)) - self:setbits(0x0C, 31, 0, ptrbits(inbox_phy, 31, 0)) - - local outbox_phy = memory.virtual_to_physical(self.ob_ptr) - self:setbits(0x30, 31, 0, ptrbits(outbox_phy, 63, 32)) - self:setbits(0x34, 31, 0, ptrbits(outbox_phy, 31, 0)) - - self:setbits(0x3C, 0, 0, 1) --set ownership - if debug then local dumpoffset = 0 print("command INPUT:") - dumpoffset = hexdump(self.ptr, 0, 0x40, dumpoffset) - if in_sz > 16 then - print("command block:") - dumpoffset = 
hexdump(self.ib_ptr, 0, (in_sz-16), dumpoffset) + dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) + local ninboxes = math.ceil((last_in_ofs + 4 - 16) / data_per_mailbox) + for i = 0, ninboxes-1 do + local blocknumber = getint(self.inboxes[i], 0x238, 31, 0) + if blocknumber ~= 0 then -- mailbox being used? + local address = memory.virtual_to_physical(self.inboxes[i]) + print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") + dumpoffset = hexdump(self.inboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) + end end end @@ -363,12 +423,13 @@ function cmdq:post(last_in_ofs, last_out_ofs) if debug then local dumpoffset = 0 print("command OUTPUT:") - dumpoffset = hexdump(self.ptr, 0, 0x40, dumpoffset) - print("command block:") - if out_sz > 16 then - sz = out_sz-16 - dumpoffset = 0x10 - dumpoffset = hexdump(self.ob_ptr, 0, sz, dumpoffset) + dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) + local noutboxes = math.ceil((last_out_ofs + 4 - 16) / data_per_mailbox) + for i = 0, noutboxes-1 do + local blocknumber = getint(self.outboxes[i], 0x238, 31, 0) + local address = memory.virtual_to_physical(self.outboxes[i]) + print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") + dumpoffset = hexdump(self.outboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) end end @@ -390,19 +451,14 @@ function cmdq:checkstatus() status, command_errors[status], syndrome)) end -function cmdq:prepare (command) - print("Execute: " .. command) - self:reset() -end - function cmdq:enable_hca() - self:prepare("ENABLE_HCA") + self:prepare("ENABLE_HCA", 0x0C, 0x08) self:setinbits(0x00, 31, 16, ENABLE_HCA) self:post(0x0C, 0x08) end function cmdq:query_issi() - self:prepare("QUERY_ISSI") + self:prepare("QUERY_ISSI", 0x0C, 0x6C) self:setinbits(0x00, 31, 16, QUERY_ISSI) self:post(0x0C, 0x6C) self:checkstatus() @@ -411,7 +467,7 @@ function cmdq:query_issi() for i = 639, 0, -1 do -- Bit N (0..639) when set means ISSI version N is enabled. 
-- Bits are ordered from highest to lowest. - local byte = 0x10 + math.floor(i / 8) + local byte = 0x20 + math.floor(i / 8) local offset = byte - (byte % 4) local bit = 31 - (i % 32) if self:getoutbits(offset, bit, bit) == 1 then @@ -697,8 +753,8 @@ function ConnectX4:new(arg) -- Perform a hard reset of the device to bring it into a blank state. -- (PRM does not suggest this but it is practical for resetting the -- firmware from bad states.) - pci.reset_device(pciaddress) pci.unbind_device_from_linux(pciaddress) + pci.reset_device(pciaddress) pci.set_bus_master(pciaddress, true) local base, fd = pci.map_pci_memory(pciaddress, 0) @@ -711,7 +767,7 @@ function ConnectX4:new(arg) --8.2 HCA Driver Start-up trace("Write the physical location of the command queues to the init segment.") - init_seg:cmdq_phy_addr(memory.virtual_to_physical(cmdq.ptr)) + init_seg:cmdq_phy_addr(memory.virtual_to_physical(cmdq.entry)) trace("Wait for the 'initializing' field to clear") while not init_seg:ready() do @@ -723,6 +779,8 @@ function ConnectX4:new(arg) cmdq:enable_hca() local issi = cmdq:query_issi() cmdq:dump_issi(issi) + + os.exit(0) --cmdq:set_issi(1) -- PRM: Execute QUERY_PAGES to understand the HCA need to boot pages. From 6abbd96a7230d1f355c1a5cc5bdb085f48135a48 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Wed, 25 May 2016 12:46:52 +0000 Subject: [PATCH 014/209] connectx4: Successful QUERY_HCA_CAP query Maybe more work needed to correctly interpret the result. --- src/apps/mellanox/connectx4.lua | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 9cb53cfd25..0fe2bd4d65 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -336,7 +336,7 @@ function cmdq:setinbits(ofs, ...) --bit1, bit2, val, ... 
else --input mailbox local mailbox = math.floor((ofs + 4 - 16) / data_per_mailbox) local offset = (ofs + 4 - 16) % data_per_mailbox - setint(self.mailboxes[mailbox], offset, setbits(...)) + setint(self.inboxes[mailbox], offset, setbits(...)) end end @@ -347,7 +347,6 @@ function cmdq:getoutbits(ofs, bit2, bit1) local mailbox = math.floor((ofs - 16) / data_per_mailbox) local offset = (ofs - 16) % data_per_mailbox local b = getbits(getint(self.outboxes[mailbox], offset), bit2, bit1) - print("cmdq:getoutbits", mailbox, bit.tohex(offset, 4), bit2, bit1, b) return b end end @@ -506,7 +505,7 @@ local codes = { regular = 3, } function cmdq:query_pages(which) - self:prepare("QUERY_PAGES") + self:prepare("QUERY_PAGES", 0x0C, 0x0C) self:setinbits(0x00, 31, 16, QUERY_PAGES) self:setinbits(0x04, 15, 0, codes[which]) self:post(0x0C, 0x0C) @@ -515,7 +514,7 @@ function cmdq:query_pages(which) end function cmdq:alloc_pages(addr, num_pages) - self:prepare("MANAGE_PAGES") + self:prepare("MANAGE_PAGES", 0x10 + num_pages*8, 0x0C) self:setinbits(0x00, 31, 16, MANAGE_PAGES) self:setinbits(0x04, 15, 0, 1) --alloc self:setinbits(0x0C, 31, 0, num_pages) @@ -538,7 +537,7 @@ local which_codes = { flow_table = 7, } function cmdq:query_hca_cap(what, which) - self:prepare("QUERY_HCA_CAP") + self:prepare("QUERY_HCA_CAP", 0x0C, 0x100C - 3000) self:setinbits(0x00, 31, 16, QUERY_HCA_CAP) self:setinbits(0x04, 15, 1, assert(which_codes[which]), @@ -546,7 +545,7 @@ function cmdq:query_hca_cap(what, which) self:post(0x0C, 0x100C - 3000) self:checkstatus() local caps = {} - if which_caps == 'general' then + if which == 'general' then caps.log_max_cq_sz = self:getoutbits(0x18, 23, 16) caps.log_max_cq = self:getoutbits(0x18, 4, 0) caps.log_max_eq_sz = self:getoutbits(0x1C, 31, 24) @@ -619,7 +618,7 @@ function cmdq:query_hca_cap(what, which) end function cmdq:set_hca_cap(which, caps) - self:prepare("SET_HCA_CAP") + self:prepare("SET_HCA_CAP", 0x100C, 0x0C) self:setinbits(0x00, 31, 16, SET_HCA_CAP) 
self:setinbits(0x04, 15, 1, assert(which_codes[which])) if which_caps == 'general' then @@ -780,7 +779,7 @@ function ConnectX4:new(arg) local issi = cmdq:query_issi() cmdq:dump_issi(issi) - os.exit(0) + --os.exit(0) --cmdq:set_issi(1) -- PRM: Execute QUERY_PAGES to understand the HCA need to boot pages. @@ -795,9 +794,15 @@ function ConnectX4:new(arg) cmdq:alloc_pages(bp_phy, boot_pages) local t = cmdq:query_hca_cap('cur', 'general') - print'query_hca_cap:' + print'query_hca_cap (current, general):' + for k,v in pairs(t) do + print((" %-24s = %s"):format(k, v)) + end + + local t = cmdq:query_hca_cap('max', 'general') + print'query_hca_cap (maximum, general):' for k,v in pairs(t) do - print('', k, v) + print((" %-24s = %s"):format(k, v)) end --[[ From 92ec4104b9c313be1dfb5e9600eb8a71efff88e4 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Wed, 8 Jun 2016 08:30:46 +0000 Subject: [PATCH 015/209] connectx4: Command mailbox chaining fixes --- src/apps/mellanox/connectx4.lua | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 0fe2bd4d65..acb3ccb93b 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -233,8 +233,8 @@ local cmdq_entry_t = ffi.typeof("uint32_t[0x40/4]") local cmdq_mailbox_t = ffi.typeof("uint32_t[0x240/4]") -- XXX Check with maximum length of commands that we really use. -local max_mailboxes = 10 -local data_per_mailbox = 0x230 -- Bytes of input/output data in a mailbox +local max_mailboxes = 1000 +local data_per_mailbox = 0x200 -- Bytes of input/output data in a mailbox -- Create a command queue with dedicated/reusable DMA memory. 
function cmdq:new(init_seg) @@ -264,7 +264,7 @@ function cmdq:prepare(command, last_input_offset, last_output_offset) -- Command entry: - ffi.fill(self.entry, ffi.sizeof(self.entry), 0) + ffi.fill(self.entry, ffi.sizeof(cmdq_entry_t), 0) self:setbits(0x00, 31, 24, 0x7) -- type self:setbits(0x04, 31, 0, input_size) self:setbits(0x38, 31, 0, output_size) @@ -277,8 +277,6 @@ function cmdq:prepare(command, last_input_offset, last_output_offset) -- How many mailboxes do we need? local ninboxes = math.ceil((input_size - 16) / data_per_mailbox) local noutboxes = math.ceil((output_size - 16) / data_per_mailbox) - print("ninboxes", ninboxes) - print("noutboxes", noutboxes) if ninboxes > max_mailboxes then error("Input overflow: " ..input_size) end if noutboxes > max_mailboxes then error("Output overflow: "..output_size) end @@ -298,8 +296,8 @@ function cmdq:prepare(command, last_input_offset, last_output_offset) -- Initialize mailboxes for i = 0, max_mailboxes-1 do -- Zap old state - ffi.fill(self.inboxes[i], ffi.sizeof(self.inboxes[i]), 0) - ffi.fill(self.outboxes[i], ffi.sizeof(self.outboxes[i]), 0) + ffi.fill(self.inboxes[i], ffi.sizeof(cmdq_mailbox_t), 0) + ffi.fill(self.outboxes[i], ffi.sizeof(cmdq_mailbox_t), 0) -- Set mailbox block number setint(self.inboxes[i], 0x238, i) setint(self.outboxes[i], 0x238, i) @@ -334,8 +332,8 @@ function cmdq:setinbits(ofs, ...) --bit1, bit2, val, ... if ofs <= 16 - 4 then --inline self:setbits(0x10 + ofs, ...) 
else --input mailbox - local mailbox = math.floor((ofs + 4 - 16) / data_per_mailbox) - local offset = (ofs + 4 - 16) % data_per_mailbox + local mailbox = math.floor((ofs - 16) / data_per_mailbox) + local offset = (ofs - 16) % data_per_mailbox setint(self.inboxes[mailbox], offset, setbits(...)) end end @@ -404,11 +402,9 @@ function cmdq:post(last_in_ofs, last_out_ofs) local ninboxes = math.ceil((last_in_ofs + 4 - 16) / data_per_mailbox) for i = 0, ninboxes-1 do local blocknumber = getint(self.inboxes[i], 0x238, 31, 0) - if blocknumber ~= 0 then -- mailbox being used? - local address = memory.virtual_to_physical(self.inboxes[i]) - print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") - dumpoffset = hexdump(self.inboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) - end + local address = memory.virtual_to_physical(self.inboxes[i]) + print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") + dumpoffset = hexdump(self.inboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) end end From 13e65509e0cc7dcd204c28ee6e087f46f9edbba2 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Thu, 9 Jun 2016 05:39:00 +0000 Subject: [PATCH 016/209] connectx4: Set debug = false Complete debug messages have become a little overwhelming now that we are allocating thousands of pages of memory for the adapter. Just for the moment disabling the hexdumps is the more sensible default. More fine-grained debug logging is likely needed. 
--- src/apps/mellanox/connectx4.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index acb3ccb93b..0626daa0d2 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -67,7 +67,7 @@ local cast = ffi.cast local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot -local debug = true +local debug = false ConnectX4 = {} ConnectX4.__index = ConnectX4 From cd8e30eab519d1ac19b425128ab90798c836c634 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Thu, 9 Jun 2016 05:41:06 +0000 Subject: [PATCH 017/209] connectx4: Remove stray print statements --- src/apps/mellanox/connectx4.lua | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 0626daa0d2..fd4f28307c 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -148,7 +148,6 @@ end function init_seg:cmdq_phy_addr(addr) if addr then - print("addr", addr) --must write the MSB of the addr first self:setbits(0x10, 31, 0, ptrbits(addr, 63, 32)) --also resets nic_interface and log_cmdq_* @@ -287,10 +286,8 @@ function cmdq:prepare(command, last_input_offset, last_output_offset) end if noutboxes > 0 then local phy = memory.virtual_to_physical(self.outboxes[0]) - print("phy", phy, bit.tohex(phy / 2^32), bit.tohex(phy % 2^32)) setint(self.entry, 0x30, phy / 2^32) setint(self.entry, 0x34, phy % 2^32) - print(bit.tohex(getint(self.entry, 0x30)), bit.tohex(getint(self.entry, 0x34))) end -- Initialize mailboxes From a285999e439a105e01182182b324e77fd6adf610 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Thu, 9 Jun 2016 05:41:49 +0000 Subject: [PATCH 018/209] connectx4: Move checkstatus() into post() Refactored the error checking to always be done when posting a command to the command queue. Previously this was a manual step for each command and that seems more error prone. 
--- src/apps/mellanox/connectx4.lua | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index fd4f28307c..f13cfe03cb 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -430,6 +430,7 @@ function cmdq:post(last_in_ofs, last_out_ofs) local status = self:getbits(0x3C, 7, 1) checkz(status) + self:checkstatus() return signature, token end @@ -453,7 +454,6 @@ function cmdq:query_issi() self:prepare("QUERY_ISSI", 0x0C, 0x6C) self:setinbits(0x00, 31, 16, QUERY_ISSI) self:post(0x0C, 0x6C) - self:checkstatus() local cur_issi = self:getoutbits(0x08, 15, 0) local t = {} for i = 639, 0, -1 do @@ -478,7 +478,6 @@ function cmdq:set_issi(issi) self:setinbits(0x00, 31, 16, SET_ISSI) self:setinbits(0x08, 15, 0, issi) self:post(0x0C, 0x0C) - self:checkstatus() end function cmdq:dump_issi(issi) @@ -502,7 +501,6 @@ function cmdq:query_pages(which) self:setinbits(0x00, 31, 16, QUERY_PAGES) self:setinbits(0x04, 15, 0, codes[which]) self:post(0x0C, 0x0C) - self:checkstatus() return self:getoutbits(0x0C, 31, 0) end @@ -517,7 +515,6 @@ function cmdq:alloc_pages(addr, num_pages) self:setinbits(0x14 + i*8, 31, 12, ptrbits(addr + 4096*i, 31, 12)) end self:post(0x10 + num_pages*8, 0x0C) - self:checkstatus() end local what_codes = { @@ -536,7 +533,6 @@ function cmdq:query_hca_cap(what, which) 15, 1, assert(which_codes[which]), 0, 0, assert(what_codes[what])) self:post(0x0C, 0x100C - 3000) - self:checkstatus() local caps = {} if which == 'general' then caps.log_max_cq_sz = self:getoutbits(0x18, 23, 16) @@ -722,7 +718,12 @@ function cmdq:set_hca_cap(which, caps) --TODO end self:post(0x100C, 0x0C) - self:checkstatus() +end + +function cmdq:init_hca() + self:prepare("INIT_HCA", 0x0c, 0x0c) + self:setinbits(0x00, 31, 16, INIT_HCA) + self:post(0x0C, 0x0C) end function init_seg:dump() From 4575dc7385a3df81eeed9ce32d129176025b8bb5 Mon Sep 17 00:00:00 2001 From: Luke Gorrie 
Date: Thu, 9 Jun 2016 05:43:21 +0000 Subject: [PATCH 019/209] connectx4: Device initialization fixes Now successfully: - Providing boot memory to the adapter (6 pages) - Querying adapter capabilities (current and maximum) - Setting adapter capabilities (keep current) - Providing init memory to the adapter (4232 pages !) The output from the init sequence looks like this: TRACE Read the initialization segment TRACE Write the physical location of the command queues to the init segment. TRACE Wait for the 'initializing' field to clear fw_rev 14 12 1220 cmd_interface_rev 5 cmdq_phy_addr cdata: 0x1f000000 log_cmdq_size 5 log_cmdq_stride 6 ready true nic_interface_supported true internal_timer 2.0108995831647e+14 health_syndrome 0 Command: ENABLE_HCA Command: QUERY_ISSI cur_issi = 0 sup_issi = 01 Command: QUERY_PAGES query_pages'boot' 6 Command: MANAGE_PAGES Command: QUERY_HCA_CAP Command: QUERY_HCA_CAP Capabilities - current and (maximum): eth_net_offloads = 0 (0) end_pad = 1 (1) cq_eq_remap = 1 (1) device_frequency_mhz = 275 (275) log_max_vlan_list = 12 (12) log_min_stride_sz_rq = 0 (0) log_max_klm_list_size = 16 (16) log_max_rqt = 0 (0) log_max_l2_table = 16 (16) log_max_current_uc_list = 10 (10) log_min_stride_sz_sq = 0 (0) log_uar_page_sz = 0 (8) log_max_wq_sz = 0 (0) log_max_current_mc_list = 14 (14) log_max_msg = 30 (30) log_max_stride_sz_rq = 0 (0) max_flow_counter = 0 (0) log_max_eq_sz = 22 (22) log_max_rqt_size = 0 (0) basic_cyclic_rcv_wqe = 0 (0) cache_line_128byte = 0 (0) max_tc = 0 (0) cmdif_checksum = 0 (3) driver_version = 0 (0) log_max_tis = 0 (0) port_type = 1 (1) wq_signature = 1 (1) log_max_tir = 0 (0) max_indirection = 4 (4) log_max_rq = 0 (0) cq_resize = 1 (1) cq_oi = 1 (1) cq_moderation = 1 (1) log_max_pd = 24 (24) log_max_mkey = 24 (24) log_max_transport_domain = 0 (0) rc = 1 (1) num_ports = 1 (1) bf = 1 (1) vport_counters = 1 (1) log_max_eq = 8 (8) pad_tx_eth_packet = 0 (0) log_pg_sz = 12 (12) uar_sz = 5 (5) cq_period_start_from_cqe = 1 (1) uc = 1 
(1) log_max_mrw_sz = 64 (64) log_max_cq = 24 (24) vport_group_manager = 1 (1) log_max_tis_per_sq = 0 (0) start_pad = 0 (0) log_max_cq_sz = 22 (22) nic_flow_table = 0 (0) scqe_break_moderation = 1 (1) ud = 1 (1) log_max_sq = 0 (0) cqe_version = 0 (0) log_bf_reg_size = 9 (9) sctr_data_cqe = 1 (1) log_max_rmp = 0 (0) cqe_version = 0 (0) log_bf_reg_size = 9 (9) sctr_data_cqe = 1 (1) log_max_rmp = 0 (0) log_max_stride_sz_sq = 0 (0) imaicl = 0 (0) xrc = 1 (1) Command: SET_HCA_CAP Command: QUERY_PAGES query_pages'init' 4232 Command: MANAGE_PAGES Command: INIT_HCA --- src/apps/mellanox/connectx4.lua | 220 ++++++++++++++++---------------- 1 file changed, 112 insertions(+), 108 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index f13cfe03cb..89560c5699 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -504,15 +504,15 @@ function cmdq:query_pages(which) return self:getoutbits(0x0C, 31, 0) end -function cmdq:alloc_pages(addr, num_pages) +function cmdq:alloc_pages(num_pages) self:prepare("MANAGE_PAGES", 0x10 + num_pages*8, 0x0C) self:setinbits(0x00, 31, 16, MANAGE_PAGES) self:setinbits(0x04, 15, 0, 1) --alloc self:setinbits(0x0C, 31, 0, num_pages) - local addr = cast('char*', addr) for i=0, num_pages-1 do - self:setinbits(0x10 + i*8, 31, 0, ptrbits(addr + 4096*i, 63, 32)) - self:setinbits(0x14 + i*8, 31, 12, ptrbits(addr + 4096*i, 31, 12)) + local _, phy = memory.dma_alloc(4096, 4096) + self:setinbits(0x10 + i*8, 31, 0, ptrbits(phy, 63, 32)) + self:setinbits(0x14 + i*8, 31, 12, ptrbits(phy, 31, 12)) end self:post(0x10 + num_pages*8, 0x0C) end @@ -535,69 +535,69 @@ function cmdq:query_hca_cap(what, which) self:post(0x0C, 0x100C - 3000) local caps = {} if which == 'general' then - caps.log_max_cq_sz = self:getoutbits(0x18, 23, 16) - caps.log_max_cq = self:getoutbits(0x18, 4, 0) - caps.log_max_eq_sz = self:getoutbits(0x1C, 31, 24) - caps.log_max_mkey = self:getoutbits(0x1C, 21, 16) - caps.log_max_eq = 
self:getoutbits(0x1C, 3, 0) - caps.max_indirection = self:getoutbits(0x20, 31, 24) - caps.log_max_mrw_sz = self:getoutbits(0x20, 22, 16) - caps.log_max_klm_list_size = self:getoutbits(0x20, 5, 0) - caps.end_pad = self:getoutbits(0x2C, 31, 31) - caps.start_pad = self:getoutbits(0x2C, 28, 28) - caps.cache_line_128byte = self:getoutbits(0x2C, 27, 27) - caps.vport_counters = self:getoutbits(0x30, 30, 30) - caps.vport_group_manager = self:getoutbits(0x34, 31, 31) - caps.nic_flow_table = self:getoutbits(0x34, 25, 25) - caps.port_type = self:getoutbits(0x34, 9, 8) - caps.num_ports = self:getoutbits(0x34, 7, 0) - caps.log_max_msg = self:getoutbits(0x38, 28, 24) - caps.max_tc = self:getoutbits(0x38, 19, 16) - caps.cqe_version = self:getoutbits(0x3C, 3, 0) - caps.cmdif_checksum = self:getoutbits(0x40, 15, 14) - caps.wq_signature = self:getoutbits(0x40, 11, 11) - caps.sctr_data_cqe = self:getoutbits(0x40, 10, 10) - caps.eth_net_offloads = self:getoutbits(0x40, 3, 3) - caps.cq_oi = self:getoutbits(0x44, 31, 31) - caps.cq_resize = self:getoutbits(0x44, 30, 30) - caps.cq_moderation = self:getoutbits(0x44, 29, 29) - caps.cq_eq_remap = self:getoutbits(0x44, 25, 25) - caps.scqe_break_moderation = self:getoutbits(0x44, 21, 21) - caps.cq_period_start_from_cqe = self:getoutbits(0x44, 20, 20) - caps.imaicl = self:getoutbits(0x44, 14, 14) - caps.xrc = self:getoutbits(0x44, 3, 3) - caps.ud = self:getoutbits(0x44, 2, 2) - caps.uc = self:getoutbits(0x44, 1, 1) - caps.rc = self:getoutbits(0x44, 0, 0) - caps.uar_sz = self:getoutbits(0x48, 21, 16) - caps.log_pg_sz = self:getoutbits(0x48, 7, 0) - caps.bf = self:getoutbits(0x4C, 31, 31) - caps.driver_version = self:getoutbits(0x4C, 30, 30) - caps.pad_tx_eth_packet = self:getoutbits(0x4C, 29, 29) - caps.log_bf_reg_size = self:getoutbits(0x4C, 20, 16) - caps.log_max_transport_domain = self:getoutbits(0x64, 28, 24) - caps.log_max_pd = self:getoutbits(0x64, 20, 16) - caps.max_flow_counter = self:getoutbits(0x68, 15, 0) - caps.log_max_rq = 
self:getoutbits(0x6C, 28, 24) - caps.log_max_sq = self:getoutbits(0x6C, 20, 16) - caps.log_max_tir = self:getoutbits(0x6C, 12, 8) - caps.log_max_tis = self:getoutbits(0x6C, 4, 0) - caps.basic_cyclic_rcv_wqe = self:getoutbits(0x70, 31, 31) - caps.log_max_rmp = self:getoutbits(0x70, 28, 24) - caps.log_max_rqt = self:getoutbits(0x70, 20, 16) - caps.log_max_rqt_size = self:getoutbits(0x70, 12, 8) - caps.log_max_tis_per_sq = self:getoutbits(0x70, 4, 0) - caps.log_max_stride_sz_rq = self:getoutbits(0x74, 28, 24) - caps.log_min_stride_sz_rq = self:getoutbits(0x74, 20, 16) - caps.log_max_stride_sz_sq = self:getoutbits(0x74, 12, 8) - caps.log_min_stride_sz_sq = self:getoutbits(0x74, 4, 0) - caps.log_max_wq_sz = self:getoutbits(0x78, 4, 0) - caps.log_max_vlan_list = self:getoutbits(0x7C, 20, 16) - caps.log_max_current_mc_list = self:getoutbits(0x7C, 12, 8) - caps.log_max_current_uc_list = self:getoutbits(0x7C, 4, 0) - caps.log_max_l2_table = self:getoutbits(0x90, 28, 24) - caps.log_uar_page_sz = self:getoutbits(0x90, 15, 0) - caps.device_frequency_mhz = self:getoutbits(0x98, 31, 0) + caps.log_max_cq_sz = self:getoutbits(0x10 + 0x18, 23, 16) + caps.log_max_cq = self:getoutbits(0x10 + 0x18, 4, 0) + caps.log_max_eq_sz = self:getoutbits(0x10 + 0x1C, 31, 24) + caps.log_max_mkey = self:getoutbits(0x10 + 0x1C, 21, 16) + caps.log_max_eq = self:getoutbits(0x10 + 0x1C, 3, 0) + caps.max_indirection = self:getoutbits(0x10 + 0x20, 31, 24) + caps.log_max_mrw_sz = self:getoutbits(0x10 + 0x20, 22, 16) + caps.log_max_klm_list_size = self:getoutbits(0x10 + 0x20, 5, 0) + caps.end_pad = self:getoutbits(0x10 + 0x2C, 31, 31) + caps.start_pad = self:getoutbits(0x10 + 0x2C, 28, 28) + caps.cache_line_128byte = self:getoutbits(0x10 + 0x2C, 27, 27) + caps.vport_counters = self:getoutbits(0x10 + 0x30, 30, 30) + caps.vport_group_manager = self:getoutbits(0x10 + 0x34, 31, 31) + caps.nic_flow_table = self:getoutbits(0x10 + 0x34, 25, 25) + caps.port_type = self:getoutbits(0x10 + 0x34, 9, 8) + 
caps.num_ports = self:getoutbits(0x10 + 0x34, 7, 0) + caps.log_max_msg = self:getoutbits(0x10 + 0x38, 28, 24) + caps.max_tc = self:getoutbits(0x10 + 0x38, 19, 16) + caps.cqe_version = self:getoutbits(0x10 + 0x3C, 3, 0) + caps.cmdif_checksum = self:getoutbits(0x10 + 0x40, 15, 14) + caps.wq_signature = self:getoutbits(0x10 + 0x40, 11, 11) + caps.sctr_data_cqe = self:getoutbits(0x10 + 0x40, 10, 10) + caps.eth_net_offloads = self:getoutbits(0x10 + 0x40, 3, 3) + caps.cq_oi = self:getoutbits(0x10 + 0x44, 31, 31) + caps.cq_resize = self:getoutbits(0x10 + 0x44, 30, 30) + caps.cq_moderation = self:getoutbits(0x10 + 0x44, 29, 29) + caps.cq_eq_remap = self:getoutbits(0x10 + 0x44, 25, 25) + caps.scqe_break_moderation = self:getoutbits(0x10 + 0x44, 21, 21) + caps.cq_period_start_from_cqe = self:getoutbits(0x10 + 0x44, 20, 20) + caps.imaicl = self:getoutbits(0x10 + 0x44, 14, 14) + caps.xrc = self:getoutbits(0x10 + 0x44, 3, 3) + caps.ud = self:getoutbits(0x10 + 0x44, 2, 2) + caps.uc = self:getoutbits(0x10 + 0x44, 1, 1) + caps.rc = self:getoutbits(0x10 + 0x44, 0, 0) + caps.uar_sz = self:getoutbits(0x10 + 0x48, 21, 16) + caps.log_pg_sz = self:getoutbits(0x10 + 0x48, 7, 0) + caps.bf = self:getoutbits(0x10 + 0x4C, 31, 31) + caps.driver_version = self:getoutbits(0x10 + 0x4C, 30, 30) + caps.pad_tx_eth_packet = self:getoutbits(0x10 + 0x4C, 29, 29) + caps.log_bf_reg_size = self:getoutbits(0x10 + 0x4C, 20, 16) + caps.log_max_transport_domain = self:getoutbits(0x10 + 0x64, 28, 24) + caps.log_max_pd = self:getoutbits(0x10 + 0x64, 20, 16) + caps.max_flow_counter = self:getoutbits(0x10 + 0x68, 15, 0) + caps.log_max_rq = self:getoutbits(0x10 + 0x6C, 28, 24) + caps.log_max_sq = self:getoutbits(0x10 + 0x6C, 20, 16) + caps.log_max_tir = self:getoutbits(0x10 + 0x6C, 12, 8) + caps.log_max_tis = self:getoutbits(0x10 + 0x6C, 4, 0) + caps.basic_cyclic_rcv_wqe = self:getoutbits(0x10 + 0x70, 31, 31) + caps.log_max_rmp = self:getoutbits(0x10 + 0x70, 28, 24) + caps.log_max_rqt = self:getoutbits(0x10 + 
0x70, 20, 16) + caps.log_max_rqt_size = self:getoutbits(0x10 + 0x70, 12, 8) + caps.log_max_tis_per_sq = self:getoutbits(0x10 + 0x70, 4, 0) + caps.log_max_stride_sz_rq = self:getoutbits(0x10 + 0x74, 28, 24) + caps.log_min_stride_sz_rq = self:getoutbits(0x10 + 0x74, 20, 16) + caps.log_max_stride_sz_sq = self:getoutbits(0x10 + 0x74, 12, 8) + caps.log_min_stride_sz_sq = self:getoutbits(0x10 + 0x74, 4, 0) + caps.log_max_wq_sz = self:getoutbits(0x10 + 0x78, 4, 0) + caps.log_max_vlan_list = self:getoutbits(0x10 + 0x7C, 20, 16) + caps.log_max_current_mc_list = self:getoutbits(0x10 + 0x7C, 12, 8) + caps.log_max_current_uc_list = self:getoutbits(0x10 + 0x7C, 4, 0) + caps.log_max_l2_table = self:getoutbits(0x10 + 0x90, 28, 24) + caps.log_uar_page_sz = self:getoutbits(0x10 + 0x90, 15, 0) + caps.device_frequency_mhz = self:getoutbits(0x10 + 0x98, 31, 0) elseif which_caps == 'offload' then --TODO elseif which_caps == 'flow_table' then @@ -607,43 +607,43 @@ function cmdq:query_hca_cap(what, which) end function cmdq:set_hca_cap(which, caps) - self:prepare("SET_HCA_CAP", 0x100C, 0x0C) + self:prepare("SET_HCA_CAP", 0x100C - 3000, 0x0C) self:setinbits(0x00, 31, 16, SET_HCA_CAP) self:setinbits(0x04, 15, 1, assert(which_codes[which])) - if which_caps == 'general' then - self:setinbits(0x18, + if which == 'general' then + self:setinbits(0x10 + 0x18, 23, 16, caps.log_max_cq_sz, 4, 0, caps.log_max_cq) - self:setinbits(0x1C, + self:setinbits(0x10 + 0x1C, 31, 24, caps.log_max_eq_sz, 21, 16, caps.log_max_mkey, 3, 0, caps.log_max_eq) - self:setinbits(0x20, + self:setinbits(0x10 + 0x20, 31, 24, caps.max_indirection, 22, 16, caps.log_max_mrw_sz, 5, 0, caps.log_max_klm_list_size) - self:setinbits(0x2C, + self:setinbits(0x10 + 0x2C, 31, 31, caps.end_pad, 28, 28, caps.start_pad, 27, 27, caps.cache_line_128byte) - self:setinbits(0x30, + self:setinbits(0x10 + 0x30, 30, 30, caps.vport_counters) - self:setinbits(0x34, + self:setinbits(0x10 + 0x34, 31, 31, caps.vport_group_manager, 25, 25, 
caps.nic_flow_table, 9, 8, caps.port_type, 7, 0, caps.num_ports) - self:setinbits(0x38, + self:setinbits(0x10 + 0x38, 28, 24, caps.log_max_msg, 19, 16, caps.max_tc) - self:setinbits(0x3C, + self:setinbits(0x10 + 0x3C, 3, 0, caps.cqe_version) - self:setinbits(0x40, + self:setinbits(0x10 + 0x40, 15, 14, caps.cmdif_checksum, 11, 11, caps.wq_signature, 10, 10, caps.sctr_data_cqe, 3, 3, caps.eth_net_offloads) - self:setinbits(0x44, + self:setinbits(0x10 + 0x44, 31, 31, caps.cq_oi, 30, 30, caps.cq_resize, 29, 29, caps.cq_moderation, @@ -655,48 +655,48 @@ function cmdq:set_hca_cap(which, caps) 2, 2, caps.ud, 1, 1, caps.uc, 0, 0, caps.rc) - self:setinbits(0x48, + self:setinbits(0x10 + 0x48, 21, 16, caps.uar_sz, 7, 0, caps.log_pg_sz) - self:setinbits(0x4C, + self:setinbits(0x10 + 0x4C, 31, 31, caps.bf, 30, 30, caps.driver_version, 29, 29, caps.pad_tx_eth_packet, 20, 16, caps.log_bf_reg_size) - self:setinbits(0x64, + self:setinbits(0x10 + 0x64, 28, 24, caps.log_max_transport_domain, 20, 16, caps.log_max_pd) - self:setinbits(0x68, + self:setinbits(0x10 + 0x68, 15, 0, caps.max_flow_counter) - self:setinbits(0x6C, + self:setinbits(0x10 + 0x6C, 28, 24, caps.log_max_rq, 20, 16, caps.log_max_sq, 12, 8, caps.log_max_tir, 4, 0, caps.log_max_tis) - self:setinbits(0x70, + self:setinbits(0x10 + 0x70, 31, 31, caps.basic_cyclic_rcv_wqe, 28, 24, caps.log_max_rmp, 20, 16, caps.log_max_rqt, 12, 8, caps.log_max_rqt_size, 4, 0, caps.log_max_tis_per_sq) - self:setinbits(0x74, + self:setinbits(0x10 + 0x74, 28, 24, caps.log_max_stride_sz_rq, 20, 16, caps.log_min_stride_sz_rq, 12, 8, caps.log_max_stride_sz_sq, 4, 0, caps.log_min_stride_sz_sq) - self:setinbits(0x78, + self:setinbits(0x10 + 0x78, 4, 0, caps.log_max_wq_sz) - self:setinbits(0x7C, + self:setinbits(0x10 + 0x7C, 20, 16, caps.log_max_vlan_list, 12, 8, caps.log_max_current_mc_list, 4, 0, caps.log_max_current_uc_list) - self:setinbits(0x90, + self:setinbits(0x10 + 0x90, 28, 24, caps.log_max_l2_table, 15, 0, caps.log_uar_page_sz) - 
self:setinbits(0x98, + self:setinbits(0x10 + 0x98, 31, 0, caps.device_frequency_mhz) - elseif which_caps == 'offload' then - self:setinbits(0x00, + elseif which == 'offload' then + self:setinbits(0x10 + 0x00, 31, 31, caps.csum_cap, 30, 30, caps.vlan_cap, 29, 29, caps.lro_cap, @@ -709,12 +709,12 @@ function cmdq:set_hca_cap(which, caps) 20, 16, caps.max_lso_cap, 13, 12, caps.wqe_inline_mode, 11, 8, caps.rss_ind_tbl_cap) - self:setinbits(0x08, + self:setinbits(0x10 + 0x08, 15, 0, caps.lro_min_mss_size) for i = 1, 4 do - self:setinbits(0x30 + (i-1)*4, 31, 0, caps.lro_timer_supported_periods[i]) + self:setinbits(0x10 + 0x30 + (i-1)*4, 31, 0, caps.lro_timer_supported_periods[i]) end - elseif which_caps == 'flow_table' then + elseif which == 'flow_table' then --TODO end self:post(0x100C, 0x0C) @@ -783,22 +783,26 @@ function ConnectX4:new(arg) -- PRM: Execute MANAGE_PAGES to provide the HCA with all required -- init-pages. This can be done by multiple MANAGE_PAGES commands. - local bp_ptr, bp_phy = memory.dma_alloc(4096 * boot_pages, 4096) - assert(band(bp_phy, 0xfff) == 0) --the phy address must be 4K-aligned - cmdq:alloc_pages(bp_phy, boot_pages) - - local t = cmdq:query_hca_cap('cur', 'general') - print'query_hca_cap (current, general):' - for k,v in pairs(t) do - print((" %-24s = %s"):format(k, v)) - end + cmdq:alloc_pages(boot_pages) - local t = cmdq:query_hca_cap('max', 'general') - print'query_hca_cap (maximum, general):' - for k,v in pairs(t) do - print((" %-24s = %s"):format(k, v)) + local cur = cmdq:query_hca_cap('cur', 'general') + local max = cmdq:query_hca_cap('max', 'general') + print'Capabilities - current and (maximum):' + for k in pairs(cur) do + print((" %-24s = %-3s (%s)"):format(k, cur[k], max[k])) end + cmdq:set_hca_cap('general', cur) + + -- Initialization pages + local init_pages = cmdq:query_pages('init') + print("query_pages'init' ", init_pages) + assert(init_pages > 0) + + cmdq:alloc_pages(init_pages) + + cmdq:init_hca() + --[[ cmdq:set_hca_cap() 
cmdq:query_pages() From 8c0be9da4905d1cc8c88942a6fbc63223600f6b4 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 14 Jun 2016 05:19:25 +0000 Subject: [PATCH 020/209] connectx4: Comment on omitted SET_DRIVER_VERSION --- src/apps/mellanox/connectx4.lua | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 89560c5699..385afb5b2e 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -49,6 +49,18 @@ -- this driver without signatures. It seems potentially futile -- to calculate and include command signatures if they are not -- actually being verified by the device. +-- +-- DRIVER VERSION: This driver does /not/ identify itself via the +-- command SET_DRIVER_VERSION. That interation could lead to +-- hazards in the spirit of HTTP User-Agent where the adapter +-- firmware would behave differently depending on how the +-- driver identifies itself. +-- +-- This decision could be revisited in the future when the +-- motivation for this mechanism is better understood. (The +-- card and firmware being used for initial development is not +-- asking the driver to identify itself anyway.)
+ module(...,package.seeall) From f196ac72cc698c5f6cc1bc5b897718bc4d15659e Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 14 Jun 2016 05:20:43 +0000 Subject: [PATCH 021/209] connectx4: Added CREATE_EQ (event queue) command --- src/apps/mellanox/connectx4.lua | 59 +++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 385afb5b2e..a098c82525 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -225,8 +225,7 @@ local QUERY_PAGES = 0x107 local MANAGE_PAGES = 0x108 local SET_HCA_CAP = 0x109 local QUERY_ISSI = 0x10A ---local QUERY_ISSI = 0x010A ---local QUERY_ISSI = 0x0A01 +local CREATE_EQ = 0x301 local SET_ISSI = 0x10B local SET_DRIVER_VERSION = 0x10D @@ -529,6 +528,59 @@ function cmdq:alloc_pages(num_pages) self:post(0x10 + num_pages*8, 0x0C) end +-- Create Event Queue (EQ) +function cmdq:create_eq(numpages) + self:prepare("CREATE_EQ", 0x10C + numpages*8, 0x0C) + self:setinbits(0x00, 31, 16, CREATE_EQ) + -- Setup Event Queue Context: + -- + + -- XXX Had wanted to use log_page_size=0 for 4KB pages + -- (2^0*4096=4096) but get BAD_INPUT_LEN errors. Have consulted the + -- hexdump for the Linux mlx5 driver and seen them choose + -- log_page_size=2 for presumably 16KB pages (2^2*4096=16384) and + -- mimicking this value resolves the error. + -- + -- So, it works, but questions: + -- 1. How come we are choosing a page size? 4KB is used elsewhere. + -- 2. Are we setting the value correctly or is there some silly bug? + -- 3. How come log_page_size 2 is okay but 0 is not? + -- (What is the root cause of the BAD_INPUT_LEN error, really?) 
+ local status = 0 -- 0 = OK + local ec = 0 -- event collapse flag + local oi = 0 -- overrun ignore flag + local st = 0x0 -- (Card did not accept 0x0A) + local page_offset = 0 -- (must be 0) + local log_eq_size = 7 -- Log (base 2) of EQ size (in entries) + local uar_page = 0 -- UAR page 0 for main event queue + local intr = 0 -- MSI-X table entry (should not be used) + local log_page_size = 2 -- Log (base 2) of page size in 4KB units + local consumer_counter = 0 -- Software cursor (init to zero) + local producer_counter = 0 -- Hardware cursor (init to zero) + self:setinbits(0x10 + 0x00, + 31, 28, status, + 18, 18, ec, + 17, 17, oi, + 11, 8, st) + self:setinbits(0x10 + 0x08, 7,9, page_offset) + self:setinbits(0x10 + 0x0C, 28, 24, log_eq_size, 23, 0, uar_page) + self:setinbits(0x10 + 0x14, 7, 9, intr) + self:setinbits(0x10 + 0x18, 28, 24, log_page_size) + self:setinbits(0x10 + 0x28, 23, 0, consumer_counter) + self:setinbits(0x10 + 0x2C, 23, 0, producer_counter) + -- Set event bitmask + local events = bits{PageRequest=0x0A} + self:setinbits(0x10 + 0x5C, 31, 0, events) + -- Allocate pages in contiguous physical memory + local ptr, phy = memory.dma_alloc(4096 * numpages, 4096) + for i = 0, numpages-1 do + self:setinbits(0x110 + i*8, 31, 0, ptrbits(phy + i * 4096, 63, 32)) + self:setinbits(0x114 + i*8, 31, 0, ptrbits(phy + i * 4096, 31, 0)) + end + self:post(0x10C + numpages*8, 0x0C) + return self:getoutbits(0x08, 7, 0) +end + local what_codes = { max = 0, cur = 1, @@ -848,6 +900,9 @@ function hexdump (pointer, index, bytes, dumpoffset) if i % 16 == 0 then if i > 0 then io.stdout:write("\n") end io.stdout:write(("%03x: "):format(dumpoffset+i)) + local eq = cmdq:create_eq(1) + print("eq = " .. 
eq) + elseif i % 4 == 0 then io.stdout:write(" ") end From c1cee02a649a4913c1284e75ae12db7d26ed3a49 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 14 Jun 2016 05:21:52 +0000 Subject: [PATCH 022/209] connectx4: Added initial VPORT commands Added QUERY_VPORT_STATE, MODIFY_VPORT_STATE, QUERY_NIC_VPORT_CONTEXT. Note: I am not sure that these commands are actually needed since we are not using SR-IOV. The PRM mandates using some VPORT commands but I don't see them in the trace from the Linux mlx5 driver. So we may be able to remove this code. --- src/apps/mellanox/connectx4.lua | 47 ++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index a098c82525..b341265e26 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -784,6 +784,42 @@ function cmdq:set_hca_cap(which, caps) self:post(0x100C, 0x0C) end +-- XXX VPORT commands /may/ not be needed since we are not using SR-IOV. +-- In this case the functions below can be removed. + +function cmdq:query_vport_state() + self:prepare("QUERY_VPORT_STATE", 0x0c, 0x0c) + self:setinbits(0x00, 31, 16, QUERY_VPORT_STATE) + self:post(0x0C, 0x0C) + return { admin_state = self:getoutbits(0x0C, 7, 4), + oper_state = self:getoutbits(0x0C, 3, 0) } +end + +function cmdq:modify_vport_state(admin_state) + self:prepare("MODIFY_VPORT_STATE", 0x0c, 0x0c) + self:setinbits(0x00, 31, 16, MODIFY_VPORT_STATE) + self:setinbits(0x0C, 7, 4, admin_state) + self:post(0x0C, 0x0C) +end + +function cmdq:query_nic_vport_context() + -- XXX This command can be used to manipulate long lists of allowed + -- unicast addresses, multicast addresses, and VLANs. For now we + -- skip that (leave the list length as zero) and access only the + -- global settings. Is this interaction correct ? 
+ self:prepare("QUERY_NIC_VPORT_CONTEXT", 0x0c, 0x10+0xFC) + self:setinbits(0x00, 31, 16, 0x754) -- Command opcode + self:post(0x0C, 0x10+0xFC) + local mac_hi = self:getoutbits(0x10+0xF4, 31, 0) + local mac_lo = self:getoutbits(0x10+0xF8, 31, 0) + local mac_hex = bit.tohex(mac_hi, 4) .. bit.tohex(mac_lo, 8) + return { mtu = self:getoutbits(0x10+0x24, 15, 0), + promisc_uc = self:getoutbits(0x10+0xf0, 31, 31), + promisc_mc = self:getoutbits(0x10+0xf0, 30, 30), + promisc_all = self:getoutbits(0x10+0xf0, 29, 29), + permanent_address = mac_hex } +end + function cmdq:init_hca() self:prepare("INIT_HCA", 0x0c, 0x0c) self:setinbits(0x00, 31, 16, INIT_HCA) @@ -867,6 +903,14 @@ function ConnectX4:new(arg) cmdq:init_hca() + local eq = cmdq:create_eq(1) + print("eq = " .. eq) + + local vport_ctx = cmdq:query_nic_vport_context() + for k,v in pairs(vport_ctx) do + print(k,v) + end + --[[ cmdq:set_hca_cap() cmdq:query_pages() @@ -900,9 +944,6 @@ function hexdump (pointer, index, bytes, dumpoffset) if i % 16 == 0 then if i > 0 then io.stdout:write("\n") end io.stdout:write(("%03x: "):format(dumpoffset+i)) - local eq = cmdq:create_eq(1) - print("eq = " .. eq) - elseif i % 4 == 0 then io.stdout:write(" ") end From 51d4d45c5354890e9dea8f18f625d49037c63085 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 14 Jun 2016 09:27:52 +0000 Subject: [PATCH 023/209] connectx4: Give 'lock' argument to map_pci_memory Required argument for new code merged from master in v2016.06. Request exclusive lock on the device. 
--- src/apps/mellanox/connectx4.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index b341265e26..e9a2bbd9b0 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -849,7 +849,7 @@ function ConnectX4:new(arg) pci.unbind_device_from_linux(pciaddress) pci.reset_device(pciaddress) pci.set_bus_master(pciaddress, true) - local base, fd = pci.map_pci_memory(pciaddress, 0) + local base, fd = pci.map_pci_memory(pciaddress, 0, true) trace("Read the initialization segment") local init_seg = init_seg:init(base) From 7659eb61fcf01cc9db342dfdc040644c23b45186 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 14 Jun 2016 14:17:44 +0000 Subject: [PATCH 024/209] connectx4: Clean driver initialization This commit introduces a clean and working version of the device initialization. --- src/apps/mellanox/README.src.md | 4 - src/apps/mellanox/connectx4.lua | 1496 ++++++++++++++++++------------- 2 files changed, 860 insertions(+), 640 deletions(-) delete mode 100644 src/apps/mellanox/README.src.md diff --git a/src/apps/mellanox/README.src.md b/src/apps/mellanox/README.src.md deleted file mode 100644 index 65ec01f352..0000000000 --- a/src/apps/mellanox/README.src.md +++ /dev/null @@ -1,4 +0,0 @@ -# Mellanox ConnectX-4 Ethernet Controller App - -## MCX4 (apps.mellanox.connectx4) - diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index e9a2bbd9b0..c15046a923 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -1,66 +1,31 @@ ---go@ git up ---- Device driver for the Mellanox ConnectX-4 series Ethernet controller. +-- Device driver for the Mellanox ConnectX-4 Ethernet controller family. +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. 
--- This driver is written using these main reference sources: --- --- PRM: Mellanox Adapter Programmer's Reference Manual --- This document will be made available on Mellanox's website. --- Has not happened yet (as of 2016-05-24). --- --- mlx5_core: Linux kernel driver for ConnectX-4. This has been --- developed by Mellanox. --- --- Hexdumps: The Linux kernel driver has the capability to run in --- debug mode and to output hexdumps showing the exact --- interactions with the card. This driver has a similar --- capability. This makes it possible to directly compare --- driver behavior directly via hexdumps i.e. independently of --- the source code. - --- Implementation notes: --- --- RESET: This driver performs a PCIe reset of the device prior to --- initialization. This is instead of performing the software --- deinitialization procedure. The main reason for this is --- simplicity and keeping the code minimal. --- --- Relatedly, reloading the mlx5_core driver in Linux 4.4.8 --- does not seem to consistently succeed in reinitializing the --- device. This may be due to bugs in the driver and/or firmware. --- Skipping the soft-reset would seem to reduce our driver's --- exposure to such problems. +-- This is a device driver for Mellanox ConnectX-4 and ConnectX-4 LX +-- ethernet cards. This driver is completely stand-alone and does not +-- depend on any other software such as Mellanox OFED library or the +-- Linux mlx5 driver. -- --- In the future we could consider implementing the software --- reset if this is found to be important for some purpose. +-- Thanks are due to Mellanox and Deutsche Telekom for making it +-- possible to develop this driver based on publicly available +-- information. Mellanox supported this work by releasing an edition +-- of their Programming Reference Manual (PRM) that is not subject to +-- confidentiality restrictions. This is now a valuable resource to +-- independent open source developers everywhere (spread the word!) 
-- --- SIGNATURE: --- Command signatures fields: Are they useful? Are they used? --- --- Usefulness - command signature is an 8-bit value calculated --- with a simple xor. What does this protect and how effective --- is it? Curious because PCIe is already performing a more --- robust checksum. Perhaps the signature is designed to catch --- driver bugs? Or host memory corruption? Enquiring minds --- would like to know... +-- Special thanks to Normen Kowalewski and Rainer Schatzmayer. + +-- General notes about this implementation: -- --- Used - the Linux driver has code for signatures but seems to --- hard-code this as disabled at least in certain instances. --- Likewise the card is accepting at least some commands from --- this driver without signatures. It seems potentially futile --- to calculate and include command signatures if they are not --- actually being verified by the device. +-- The driver is based primarily on the PRM: +-- http://www.mellanox.com/related-docs/user_manuals/Ethernet_Adapters_Programming_Manual.pdf -- --- DRIVER VERSION: This driver does /not/ identify itself via the --- command SET_DRIVER_VERSION. That interation could lead to --- hazards in the spirit of HTTP User-Agent where the adapter --- firmware would behave differently depending on how the --- driver identifies itself. +-- The Linux mlx5_core driver is also used for reference. This +-- driver implements the same hexdump format as mlx5_core so it is +-- possible to directly compare/diff the binary encoded commands +-- that the drivers send. -- --- This decision could be revisited in the future when the --- motivation for this mechanism is better understood. (The --- card and firmware being used for initial development is not --- asking the driver to identify itself anyway.) - +-- Physical addresses are always used for DMA (rlkey). 
module(...,package.seeall) @@ -79,166 +44,708 @@ local cast = ffi.cast local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot -local debug = false +local debug_trace = true -- Print trace messages +local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) + + +--------------------------------------------------------------- +-- ConnectX4 Snabb app. +-- +-- Uses the driver routines to implement ConnectX-4 support in +-- the Snabb app network. +--------------------------------------------------------------- ConnectX4 = {} ConnectX4.__index = ConnectX4 ---utils +function ConnectX4:new (arg) + local self = setmetatable({}, self) + local conf = config.parse_app_arg(arg) + local pciaddress = pci.qualified(conf.pciaddress) + + local sendq_size = conf.sendq_size or 1024 + local recvq_size = conf.recvq_size or 1024 + + -- Perform a hard reset of the device to bring it into a blank state. + -- + -- Reset is performed at PCI level instead of via firmware command. + -- This is intended to be robust to problems like bad firmware states. 
+ pci.unbind_device_from_linux(pciaddress) + pci.reset_device(pciaddress) + pci.set_bus_master(pciaddress, true) + + -- Setup the command channel + -- + local base, fd = pci.map_pci_memory(pciaddress, 0, true) + local init_seg = InitializationSegment:new(base) + local hca = HCA:new(init_seg) + + trace("Write the physical location of the command queues to the init segment.") + init_seg:cmdq_phy_addr(memory.virtual_to_physical(hca.entry)) + if debug_trace then init_seg:dump() end + trace("Wait for the 'initializing' field to clear") + while not init_seg:ready() do + C.usleep(1000) + end + + -- Boot the card + -- + hca:enable_hca() + hca:set_issi(1) + hca:alloc_pages(hca:query_pages("boot")) + if debug_trace then self:dump_capabilities() end + + -- Initialize the card + -- + hca:alloc_pages(hca:query_pages("init")) + hca:init_hca() + hca:alloc_pages(hca:query_pages("regular")) + + if debug_trace then self:check_vport() end -local function alloc_pages(pages) - local ptr, phy = memory.dma_alloc(4096 * pages, 4096) - assert(band(phy, 0xfff) == 0) --the phy address must be 4K-aligned - return cast('uint32_t*', ptr), phy + -- Create basic objects that we need + -- + local uar = hca:alloc_uar() + local eq = hca:create_eq(uar) + local pd = hca:alloc_protection_domain() + local tdomain = hca:alloc_transport_domain() + local rlkey = hca:query_rlkey() + + -- Create send and receive queues & associated objects + -- + local tis = hca:create_tis(0, tdomain) + local send_cq = hca:create_cq(1024, uar, eq.eqn) + local recv_cq = hca:create_cq(1024, uar, eq.eqn) + + -- Allocate work queue memory (receive & send contiguous in memory) + local wq_doorbell = memory.dma_alloc(16) + local sendq_size = 1024 + local recvq_size = 1024 + local workqueues = memory.dma_alloc(64 * (sendq_size + recvq_size), 4096) + local rwq = workqueues -- receive work queue + local swq = workqueues + 64 * recvq_size -- send work queue + + -- Create the queue objects + local sq = hca:create_sq(send_cq.cqn, pd, 
sendq_size, wq_doorbell, swq, tis) + local rq = hca:create_rq(recv_cq.cqn, pd, recvq_size, wq_doorbell, rwq) + local tir = hca:create_tir_direct(rq.rqn, tdomain) + + -- Setup packet dispatching. + -- Just a "wildcard" flow group to send RX packets to the receive queue. + -- + local rx_flow_table_id = hca:create_root_flow_table(NIC_RX) + local flow_group_id = hca:create_flow_group_wildcard(rx_flow_table_id, NIC_RX, 0, 0) + hca:set_flow_table_entry_wildcard(rx_flow_table_id, NIC_RX, flow_group_id, 0, tir) + hca:set_flow_table_root(rx_flow_table_id, NIC_RX) + + function self:stop() + pci.set_bus_master(pciaddress, false) + pci.reset_device(pciaddress) + pci.close_pci_resource(fd, base) + base, fd = nil + end + + return self end -function getint(addr, ofs) - local ofs = ofs/4 - assert(ofs == floor(ofs)) - return bswap(addr[ofs]) +function ConnectX4:dump_capabilities () + if true then return end + -- Print current and maximum card capabilities. + -- XXX Check if we have any specific requirements that we need to + -- set and/or assert on. + local cur = self.hca:query_hca_general_cap('current') + local max = self.hca:query_hca_general_cap('max') + print'Capabilities - current and (maximum):' + for k in pairs(cur) do + print((" %-24s = %-3s (%s)"):format(k, cur[k], max[k])) + end end -function setint(addr, ofs, val) - local ofs = ofs/4 - assert(ofs == floor(ofs)) - addr[ofs] = bswap(tonumber(val)) +function ConnectX4:check_vport () + if true then return end + local vport_ctx = hca:query_nic_vport_context() + for k,v in pairs(vport_ctx) do + print(k,v) + end + local vport_state = hca:query_vport_state() + for k,v in pairs(vport_state) do + print(k,v) + end end -local function getbits(val, bit2, bit1) - local mask = shl(2^(bit2-bit1+1)-1, bit1) - return shr(band(val, mask), bit1) + +--------------------------------------------------------------- +-- Firmware commands. +-- +-- Code for sending individual messages to the firmware. 
+-- These messages are defined in the "Command Reference" section +-- of the Mellanox Programmer Reference Manual (PRM). +-- +-- (See further below for the implementation of the command interface.) +--------------------------------------------------------------- + +-- These commands are all built on a handful of primitives for sending +-- commands to the HCA. The parameters to these functions are chosen +-- to be easy to cross-reference with the definitions in the PRM. +-- +-- command(name, last_input_offset, last_output_offset) +-- Start preparing a command for the HCA. +-- The input and output sizes are given as the offsets of their +-- last dwords. +-- The command name is given only for debugging purposes. +-- +-- input(name, offset, highbit, lowbit, value) +-- Specify an input parameter to the current command. +-- The parameter value is stored in the given bit-range at the +-- given offset. +-- The parameter name is given only for debugging purposes. +-- +-- execute() +-- Execute the command specified starting with the most recent +-- call to command(). +-- If the command fails then an exception is raised. +-- +-- output(offset, highbit, lowbit) +-- Return a value from the output of the command. + +-- Note: Parameters are often omitted when their default value (zero) +-- is sensible. Exceptions are made for more important ones. + +-- hca object is the main interface towards the NIC firmware. +HCA = {} + +--------------------------------------------------------------- +-- Startup & General commands +--------------------------------------------------------------- + +-- Turn on the NIC. +function HCA:enable_hca () + self:command("ENABLE_HCA", 0x0C, 0x08) + :input("opcode", 0x00, 31, 16, 0x104) + :execute() end -local function ptrbits(ptr, bit2, bit1) - local addr = cast('uint64_t', ptr) - return tonumber(getbits(addr, bit2, bit1)) +-- Initialize the NIC firmware. 
+function HCA:init_hca ()
+   self:command("INIT_HCA", 0x0c, 0x0c)
+      :input("opcode", 0x00, 31, 16, 0x102)
+      :execute()
 end
-local function setbits1(bit2, bit1, val)
-   local mask = shl(2^(bit2-bit1+1)-1, bit1)
-   local bits = band(shl(val, bit1), mask)
-   return bits
+-- Set the software-firmware interface version to use.
+function HCA:set_issi (issi)
+   self:command("SET_ISSI", 0x0C, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x10B)
+      :input("issi", 0x08, 15, 0, issi)
+      :execute()
 end
-local function setbits(...) --bit2, bit1, val, ...
-   local endval = 0
-   for i = 1, select('#', ...), 3 do
-      local bit2, bit1, val = select(i, ...)
-      endval = bor(endval, setbits1(bit2, bit1, val or 0))
+-- Query the value of the "reserved lkey" for using physical addresses.
+function HCA:query_rlkey ()
+   self:command("QUERY_SPECIAL_CONTEXTS", 0x0C, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x203)
+      :execute()
+   local rlkey = self:output(0x0C, 31, 0)
+   return rlkey
+end
+
+-- Query how many pages of memory the NIC needs.
+-- 'which' selects the stage being asked about: "boot", "init" or "regular".
+function HCA:query_pages (which)
+   self:command("QUERY_PAGES", 0x0C, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x107)
+      :input("opmod", 0x04, 15, 0, ({boot=1,init=2,regular=3})[which])
+      :execute()
+   return self:output(0x0C, 31, 0)
+end
+
+-- Provide the NIC with freshly allocated memory.
+-- Allocates num_pages DMA pages (4096 bytes, 4096-aligned) and hands
+-- their physical addresses to the firmware in one MANAGE_PAGES command.
+function HCA:alloc_pages (num_pages)
+   self:command("MANAGE_PAGES", 0x10 + num_pages*8, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x108)
+      :input("opmod", 0x04, 15, 0, 1) -- allocate mode
+      :input("input_num_entries", 0x0C, 31, 0, num_pages)
+   -- Entries are indexed 0 .. num_pages-1: exactly as many pages as
+   -- input_num_entries advertises, all inside the declared input length.
+   for i=0, num_pages-1 do
+      local _, phy = memory.dma_alloc(4096, 4096)
+      self:input(nil, 0x10 + i*8, 31, 0, ptrbits(phy, 63, 32))
+      self:input(nil, 0x14 + i*8, 31, 12, ptrbits(phy, 31, 12))
    end
-   return endval
+   self:execute()
 end
+-- Query the NIC capabilities (maximum or current setting).
+function HCA:query_hca_general_cap (max_or_current) + local opmod = assert(({max=0, current=1})[max_or_current]) + self:command("QUERY_HCA_CAP", 0x0C, 0x100C - 3000) + :input("opcode", 0x00, 31, 16, 0x100) + :input("opmod", 0x04, 0, 0, opmod) + :execute() + return { + log_max_cq_sz = self:output(0x10 + 0x18, 23, 16), + log_max_cq = self:output(0x10 + 0x18, 4, 0), + log_max_eq_sz = self:output(0x10 + 0x1C, 31, 24), + log_max_mkey = self:output(0x10 + 0x1C, 21, 16), + log_max_eq = self:output(0x10 + 0x1C, 3, 0), + max_indirection = self:output(0x10 + 0x20, 31, 24), + log_max_mrw_sz = self:output(0x10 + 0x20, 22, 16), + log_max_klm_list_size = self:output(0x10 + 0x20, 5, 0), + end_pad = self:output(0x10 + 0x2C, 31, 31), + start_pad = self:output(0x10 + 0x2C, 28, 28), + cache_line_128byte = self:output(0x10 + 0x2C, 27, 27), + vport_counters = self:output(0x10 + 0x30, 30, 30), + vport_group_manager = self:output(0x10 + 0x34, 31, 31), + nic_flow_table = self:output(0x10 + 0x34, 25, 25), + port_type = self:output(0x10 + 0x34, 9, 8), + num_ports = self:output(0x10 + 0x34, 7, 0), + log_max_msg = self:output(0x10 + 0x38, 28, 24), + max_tc = self:output(0x10 + 0x38, 19, 16), + cqe_version = self:output(0x10 + 0x3C, 3, 0), + cmdif_checksum = self:output(0x10 + 0x40, 15, 14), + wq_signature = self:output(0x10 + 0x40, 11, 11), + sctr_data_cqe = self:output(0x10 + 0x40, 10, 10), + eth_net_offloads = self:output(0x10 + 0x40, 3, 3), + cq_oi = self:output(0x10 + 0x44, 31, 31), + cq_resize = self:output(0x10 + 0x44, 30, 30), + cq_moderation = self:output(0x10 + 0x44, 29, 29), + cq_eq_remap = self:output(0x10 + 0x44, 25, 25), + scqe_break_moderation = self:output(0x10 + 0x44, 21, 21), + cq_period_start_from_cqe = self:output(0x10 + 0x44, 20, 20), + imaicl = self:output(0x10 + 0x44, 14, 14), + xrc = self:output(0x10 + 0x44, 3, 3), + ud = self:output(0x10 + 0x44, 2, 2), + uc = self:output(0x10 + 0x44, 1, 1), + rc = self:output(0x10 + 0x44, 0, 0), + uar_sz = self:output(0x10 + 0x48, 21, 
16), + log_pg_sz = self:output(0x10 + 0x48, 7, 0), + bf = self:output(0x10 + 0x4C, 31, 31), + driver_version = self:output(0x10 + 0x4C, 30, 30), + pad_tx_eth_packet = self:output(0x10 + 0x4C, 29, 29), + log_bf_reg_size = self:output(0x10 + 0x4C, 20, 16), + log_max_transport_domain = self:output(0x10 + 0x64, 28, 24), + log_max_pd = self:output(0x10 + 0x64, 20, 16), + max_flow_counter = self:output(0x10 + 0x68, 15, 0), + log_max_rq = self:output(0x10 + 0x6C, 28, 24), + log_max_sq = self:output(0x10 + 0x6C, 20, 16), + log_max_tir = self:output(0x10 + 0x6C, 12, 8), + log_max_tis = self:output(0x10 + 0x6C, 4, 0), + basic_cyclic_rcv_wqe = self:output(0x10 + 0x70, 31, 31), + log_max_rmp = self:output(0x10 + 0x70, 28, 24), + log_max_rqt = self:output(0x10 + 0x70, 20, 16), + log_max_rqt_size = self:output(0x10 + 0x70, 12, 8), + log_max_tis_per_sq = self:output(0x10 + 0x70, 4, 0), + log_max_stride_sz_rq = self:output(0x10 + 0x74, 28, 24), + log_min_stride_sz_rq = self:output(0x10 + 0x74, 20, 16), + log_max_stride_sz_sq = self:output(0x10 + 0x74, 12, 8), + log_min_stride_sz_sq = self:output(0x10 + 0x74, 4, 0), + log_max_wq_sz = self:output(0x10 + 0x78, 4, 0), + log_max_vlan_list = self:output(0x10 + 0x7C, 20, 16), + log_max_current_mc_list = self:output(0x10 + 0x7C, 12, 8), + log_max_current_uc_list = self:output(0x10 + 0x7C, 4, 0), + log_max_l2_table = self:output(0x10 + 0x90, 28, 24), + log_uar_page_sz = self:output(0x10 + 0x90, 15, 0), + device_frequency_mhz = self:output(0x10 + 0x98, 31, 0) + } +end ---init segment (section 4.3) +-- Teardown the NIC firmware. 
+-- mode = 0 (graceful) or 1 (panic)
+function HCA:teardown_hca (mode)
+   self:command("TEARDOWN_HCA", 0x0c, 0x0c)
+      :input("opcode", 0x00, 31, 16, 0x103)
+      :input("opmod", 0x04, 15, 0, mode)
+      :execute()
+end
-local init_seg = {}
-init_seg.__index = init_seg
+-- Disable the NIC.
+-- DISABLE_HCA has its own opcode (0x105; 0x103 is TEARDOWN_HCA) and
+-- this function takes no mode argument, so no opmod is sent.
+function HCA:disable_hca ()
+   self:command("DISABLE_HCA", 0x0c, 0x0c)
+      :input("opcode", 0x00, 31, 16, 0x105)
+      :execute()
+end
-function init_seg:getbits(ofs, bit2, bit1)
-   return getbits(getint(self.ptr, ofs), bit2, bit1)
+---------------------------------------------------------------
+-- Event queues
+---------------------------------------------------------------
+
+-- Create an event queue that can be accessed via the given UAR page number.
+-- Returns an 'eq' object wrapping the queue number and the entry memory.
+function HCA:create_eq (uar)
+   local numpages = 1
+   local log_eq_size = 7 -- 128 entries
+   local ptr, phy = memory.dma_alloc(4096, 4096) -- memory for entries
+   self:command("CREATE_EQ", 0x10C + numpages*8, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x301)
+      :input("log_eq_size", 0x10 + 0x0C, 28, 24, log_eq_size)
+      :input("uar_page", 0x10 + 0x0C, 23, 0, uar)
+      :input("log_page_size", 0x10 + 0x18, 28, 24, 2) -- XXX best value? 0 or max?
+      :input("event bitmask", 0x10 + 0x5C, 31, 0, bits({PageRequest=0xB})) -- XXX more events?
+      :input("pas[0] high", 0x110, 31, 0, ptrbits(phy, 63, 32))
+      :input("pas[0] low", 0x114, 31, 0, ptrbits(phy, 31, 0))
+      :execute()
+   local eqn = self:output(0x08, 7, 0)
+   return eq:new(eqn, ptr, 2^log_eq_size)
 end
-function init_seg:setbits(ofs, ...)
-   setint(self.ptr, ofs, setbits(...))
+-- Event Queue Entry (EQE)
+local eqe_t = ffi.typeof[[
+  struct {
+    uint16_t event_type;
+    uint16_t event_sub_type;
+    uint32_t event_data;
+    uint16_t pad;
+    uint8_t signature;
+    uint8_t owner;
+  }
+]]
+
+eq = {}
+eq.__index = eq
+
+-- Create event queue object.
+function eq:new (eqn, pointer, nentries)
+   local ring = ffi.cast(ffi.typeof("$*", eqe_t), pointer)
+   -- Mark every entry with owner = 1; poll() only consumes entries
+   -- whose owner field reads 0.
+   for i = 0, nentries-1 do
+      ring[i].owner = 1
+   end
+   return setmetatable({eqn = eqn,
+                        ring = ring,
+                        index = 0,
+                        n = nentries},
+                       self)
 end
-function init_seg:init(ptr)
-   return setmetatable({ptr = cast('uint32_t*', ptr)}, self)
+-- Poll the queue for events.
+-- Each pending entry (owner == 0 and event_type ~= 0xFF) is passed to
+-- self:event() before the ring index moves on to the next slot.
+function eq:poll()
+   print("Polling EQ")
+   -- The index counts up without bound, so wrap it on every lookup.
+   local eqe = self.ring[self.index % self.n]
+   while eqe.owner == 0 and eqe.event_type ~= 0xFF do
+      self:event(eqe)
+      self.index = self.index + 1
+      eqe = self.ring[self.index % self.n]
+   end
+   print("done polling EQ")
 end
-function init_seg:fw_rev() --maj, min, subminor
-   return
-      self:getbits(0, 15, 0),
-      self:getbits(0, 31, 16),
-      self:getbits(4, 15, 0)
+-- Handle an event.
+-- eqe is the event queue entry handed over by poll().
+function eq:event (eqe)
+   print(("Got event %s.%s"):format(eqe.event_type, eqe.event_sub_type))
+   error("Event handling not yet implemented")
 end
-function init_seg:cmd_interface_rev()
-   return self:getbits(4, 31, 16)
+---------------------------------------------------------------
+-- Vport
+---------------------------------------------------------------
+
+function HCA:set_vport_admin_state (up)
+   self:command("MODIFY_VPORT_STATE", 0x0c, 0x0c)
+      :input("opcode", 0x00, 31, 16, 0x751)
+      :input("admin_state", 0x0C, 7, 4, up and 1 or 0)
+      :execute()
 end
-function init_seg:cmdq_phy_addr(addr)
-   if addr then
-      --must write the MSB of the addr first
-      self:setbits(0x10, 31, 0, ptrbits(addr, 63, 32))
-      --also resets nic_interface and log_cmdq_*
-      self:setbits(0x14, 31, 12, ptrbits(addr, 31, 12))
-   else
-      return cast('void*',
-         cast('uint64_t', self:getbits(0x10, 31, 0) * 2^32 +
-         cast('uint64_t', self:getbits(0x14, 31, 12)) * 2^12))
+function HCA:query_vport_state ()
+   self:command("QUERY_VPORT_STATE", 0x0c, 0x0c)
+      :input("opcode", 0x00, 31, 16, 0x750)
+      :execute()
+   return { admin_state = self:output(0x0C, 7, 4),
+            oper_state = self:output(0x0C, 3, 0) }
+end
+
+function HCA:query_vport_counter ()
+   self:command("QUERY_VPORT_COUNTER", 0x1c, 0x20c)
+      :input("opcode", 0x00, 31, 16, 0x770)
+      :execute()
+   -- XXX return in a table
+   print("vport counters")
+   for i = 0x10, 0x200, 4 do
+      local n = self:output(i, 31, 0)
+      if n > 0 then print(bit.tohex(i), n) end
    end
 end
-function init_seg:nic_interface(mode)
-   self:setbits(0x14, 9, 8, mode)
+function HCA:query_nic_vport_context ()
+   self:command("QUERY_NIC_VPORT_CONTEXT", 0x0c, 0x10+0xFC)
+      :input("opcode", 0x00, 31, 16, 0x754)
+      :execute()
+   local mac_hi = self:output(0x10+0xF4, 31, 0)
+   local mac_lo = self:output(0x10+0xF8, 31, 0)
+   local mac_hex = bit.tohex(mac_hi, 4) .. bit.tohex(mac_lo, 8)
+   return { min_wqe_inline_mode = self:output(0x10+0x00, 26, 24),
+            mtu = self:output(0x10+0x24, 15, 0),
+            promisc_uc = self:output(0x10+0xf0, 31, 31) == 1,
+            promisc_mc = self:output(0x10+0xf0, 30, 30) == 1,
+            promisc_all = self:output(0x10+0xf0, 29, 29) == 1,
+            permanent_address = mac_hex }
 end
-function init_seg:log_cmdq_size()
-   return self:getbits(0x14, 7, 4)
+---------------------------------------------------------------
+-- TIR and TIS
+---------------------------------------------------------------
+
+-- Allocate a Transport Domain.
+function HCA:alloc_transport_domain () + self:command("ALLOC_TRANSPORT_DOMAIN", 0x0c, 0x0c) + :input("opcode", 0x00, 31, 16, 0x816) + :execute(0x0C, 0x0C) + return self:output(0x08, 23, 0) end -function init_seg:log_cmdq_stride() - return self:getbits(0x14, 3, 0) +-- Create a TIR (Transport Interface Receive) with direct dispatch (no hashing) +function HCA:create_tir_direct (rqn, transport_domain) + self:command("CREATE_TIR", 0x10C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x900) + :input("inline_rqn", 0x20 + 0x1C, 23, 0, rqn) + :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) + :execute() + return self:output(0x08, 23, 0) +end + +-- Create TIS (Transport Interface Send) +function HCA:create_tis (prio, transport_domain) + self:command("CREATE_TIS", 0x20 + 0x9C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x912) + :input("prio", 0x20 + 0x00, 19, 16, prio) + :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) + :execute() + return self:output(0x08, 23, 0) end -function init_seg:ring_doorbell(i) - self:setbits(0x18, i, i, 1) +-- Allocate a UAR (User Access Region) i.e. a page of MMIO registers. +function HCA:alloc_uar () + self:command("ALLOC_UAR", 0x0C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x802) + :execute() + return self:output(0x08, 23, 0) end -function init_seg:ready(i, val) - return self:getbits(0x1fc, 31, 31) == 0 +-- Allocate a Protection Domain. +function HCA:alloc_protection_domain () + self:command("ALLOC_PD", 0x0C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x800) + :execute() + return self:output(0x08, 23, 0) end -function init_seg:nic_interface_supported() - return self:getbits(0x1fc, 26, 24) == 0 +-- Create a completion queue and return a completion queue object. 
+function HCA:create_cq (entries, uar_page, eqn, db_phy)
+   local doorbell, doorbell_phy = memory.dma_alloc(16)
+   -- Memory for completion queue entries
+   local cqe, cqe_phy = memory.dma_alloc(entries * 64, 4096)
+   self:command("CREATE_CQ", 0x114, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x400)
+      :input("log_cq_size", 0x10 + 0x0C, 28, 24, 10)
+      :input("uar_page", 0x10 + 0x0C, 23, 0, uar_page)
+      :input("c_eqn", 0x10 + 0x14, 7, 0, eqn)
+      :input("log_page_size", 0x10 + 0x18, 28, 24, 4)
+      :input("db_addr high", 0x10 + 0x38, 31, 0, ptrbits(doorbell_phy, 63, 32))
+      :input("db_addr_low", 0x10 + 0x3C, 31, 0, ptrbits(doorbell_phy, 31, 0))
+      :input("pas[0] high", 0x110, 31, 0, ptrbits(cqe_phy, 63, 32))
+      :input("pas[0] low", 0x114, 31, 0, ptrbits(cqe_phy, 31, 0))
+      :execute()
+   local cqn = self:output(0x08, 23, 0)
+   return { cqn = cqn, doorbell = doorbell, cqe = cqe }
 end
-function init_seg:internal_timer()
-   return
-      self:getbits(0x1000, 31, 0) * 2^32 +
-      self:getbits(0x1004, 31, 0)
+-- Create a receive queue and return a receive queue object.
+-- The returned RQ object holds the receive queue number (rqn), the
+-- pointer to the WQEs (rwq) and the doorbell that were passed in.
+function HCA:create_rq (cqn, pd, size, doorbell, rwq)
+   local log_wq_size = log2size(size)
+   local db_phy = memory.virtual_to_physical(doorbell)
+   local rwq_phy = memory.virtual_to_physical(rwq)
+   self:command("CREATE_RQ", 0x20 + 0x30 + 0xC4, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x908)
+      :input("rlkey", 0x20 + 0x00, 31, 31, 1)
+      :input("vlan_strip_disable", 0x20 + 0x00, 28, 28, 1)
+      :input("cqn", 0x20 + 0x08, 23, 0, cqn)
+      :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic
+      :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd)
+      :input("dbr_addr high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32))
+      :input("dbr_addr low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0))
+      :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, 4)
+      :input("page_size", 0x20 + 0x30 + 0x20, 12, 8, 4) -- XXX one big page?
+      :input("log_wq_size", 0x20 + 0x30 + 0x20, 4, 0, log_wq_size)
+      -- The high half of the address fills a whole 32-bit dword, so the
+      -- field is bits 31:0 of that dword (same as every other "high" field).
+      :input("pas[0] high", 0x20 + 0x30 + 0xC0, 31, 0, ptrbits(rwq_phy, 63, 32))
+      :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(rwq_phy, 31, 0))
+      :execute()
+   local rqn = self:output(0x08, 23, 0)
+   return RQ:new(rqn, rwq, doorbell)
 end
-function init_seg:clear_int()
-   self:setbits(0x100c, 0, 0, 1)
+RQ = {}
+
+function RQ:new (rqn, rwq, doorbell)
+   return setmetatable({rqn = rqn, rwq = rwq, doorbell = doorbell},
+                       {__index = RQ})
 end
-function init_seg:health_syndrome()
-   return self:getbits(0x1010, 31, 24)
+-- Modify a Receive Queue by making a state transition.
+function HCA:modify_rq (rqn, curr_state, next_state)
+   self:command("MODIFY_RQ", 0x20 + 0x30 + 0xC4, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x909)
+      :input("curr_state", 0x08, 31, 28, curr_state)
+      :input("rqn", 0x08, 27, 0, rqn)
+      :input("next_state", 0x20 + 0x00, 23, 20, next_state)
+      :execute()
 end
---command queue (section 7.14.1)
-
-local cmdq = {}
-cmdq.__index = cmdq
-
---init cmds
-local QUERY_HCA_CAP = 0x100
-local QUERY_ADAPTER = 0x101
-local INIT_HCA = 0x102
-local TEARDOWN_HCA = 0x103
-local ENABLE_HCA = 0x104
-local DISABLE_HCA = 0x105
-local QUERY_PAGES = 0x107
-local MANAGE_PAGES = 0x108
-local SET_HCA_CAP = 0x109
-local QUERY_ISSI = 0x10A
-local CREATE_EQ = 0x301
-local SET_ISSI = 0x10B
-local SET_DRIVER_VERSION = 0x10D
-
--- bytewise xor function used for signature calcuation.
-local function xor8 (ptr, len)
-   local u8 = ffi.cast("uint8_t*", ptr)
-   local acc = 0
-   for i = 0, len-1 do
-      acc = bit.bxor(acc, u8[i])
-   end
-   return acc
+-- Modify a Send Queue by making a state transition.
+function HCA:modify_sq (sqn, curr_state, next_state)
+   self:command("MODIFY_SQ", 0x20 + 0x30 + 0xC4, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x905)
+      :input("curr_state", 0x08, 31, 28, curr_state)
+      :input("sqn", 0x08, 23, 0, sqn)
+      :input("next_state", 0x20 + 0x00, 23, 20, next_state)
+      :execute()
+end
+
+-- Create a Send Queue.
+-- Return the send queue number and a pointer to the WQEs. +function HCA:create_sq (cqn, pd, size, doorbell, swq, tis) + local log_wq_size = log2size(size) + local db_phy = memory.virtual_to_physical(doorbell) + local swq_phy = memory.virtual_to_physical(swq) + self:command("CREATE_SQ", 0x20 + 0x30 + 0xC4, 0x0C) + :input("opcode", 0x00, 31, 16, 0x904) + :input("rlkey", 0x20 + 0x00, 31, 31, 1) + :input("fre", 0x20 + 0x00, 29, 29, 1) + :input("flush_in_error_en", 0x20 + 0x00, 28, 28, 1) + :input("min_wqe_inline_mode", 0x20 + 0x00, 26, 24, 1) + :input("cqn", 0x20 + 0x08, 23, 0, cqn) + :input("tis_lst_sz", 0x20 + 0x20, 31, 16, 1) + :input("tis", 0x20 + 0x2C, 23, 0, tis) + :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic + :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd) + :input("pas[0] high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) + :input("pas[0] low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0)) + :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, 6) + :input("log_wq_page_sz", 0x20 + 0x30 + 0x20, 12, 8, 6) -- XXX check + :input("log_wq_size", 0x20 + 0x30 + 0x20, 4, 0, log_wq_size) + :input("pas[0] high", 0x20 + 0x30 + 0xC0, 31, 0, ptrbits(swq_phy, 63, 32)) + :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(swq_phy, 31, 0)) + :execute() + local sqn = self:output(0x08, 23, 0) + return SQ:new(sqn, swq, doorbell) +end + +SQ = {} + +function SQ:new (sqn, swq, doorbell) + return setmetatable({sqn = sqn, swq = swq, doorbell = doorbell}, + {__index = SQ}) +end + +NIC_RX = 0 -- Flow table type code for incoming packets +NIC_TX = 1 -- Flow table type code for outgoing packets + +-- Create the root flow table. 
+function HCA:create_root_flow_table (table_type) + self:command("CREATE_FLOW_TABLE", 0x3C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x930) + :input("table_type", 0x10, 31, 24, table_type) + :input("log_size", 0x18 + 0x00, 7, 0, 4) -- XXX make parameter + :execute() + local table_id = self:output(0x08, 23, 0) + return table_id +end + +-- Set table as root flow table. +function HCA:set_flow_table_root (table_id, table_type) + self:command("SET_FLOW_TABLE_ROOT", 0x3C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x92F) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :execute() +end + +-- Create a "wildcard" flow group that does not inspect any fields. +function HCA:create_flow_group_wildcard (table_id, table_type, start_ix, end_ix) + self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x933) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("start_ix", 0x1C, 31, 0, start_ix) + :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) + :input("match_criteria", 0x3C, 7, 0, 0) -- match outer headers + :execute() + local group_id = self:output(0x08, 23, 0) + return group_id +end + +-- Set a "wildcard" flow table entry that does not match on any fields. 
+function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, flow_index, tir) + self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) + :input("opcode", 0x00, 31, 16, 0x936) + :input("opmod", 0x04, 15, 0, 0) -- new entry + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("flow_index", 0x20, 31, 0, flow_index) + :input("group_id", 0x40 + 0x04, 31, 0, group_id) + :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST + :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size + :input("dest_type", 0x40 + 0x300, 31, 24, 2) + :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :execute() +end + +--------------------------------------------------------------- +-- PHY control access +--------------------------------------------------------------- + +-- Note: portnumber is always 1 because the ConnectX-4 HCA is managing +-- a single physical port. + +PAOS = 0x5006 -- Port Administrative & Operational Status +PPLR = 0x5018 -- Port Physical Loopback Register) + +-- Set the administrative status of the port (boolean up/down). 
+function HCA:set_admin_status (admin_up)
+   self:command("ACCESS_REGISTER", 0x1C, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x805)
+      :input("opmod", 0x04, 15, 0, 0) -- write
+      :input("register_id", 0x08, 15, 0, PAOS)
+      :input("local_port", 0x10, 23, 16, 1)
+      :input("admin_status", 0x10, 11, 8, admin_up and 1 or 2)
+      :input("ase", 0x14, 31, 31, 1) -- enable admin state update
+      :execute()
+end
+
+-- Read the PAOS register and return the port's administrative and
+-- operational status fields.
+function HCA:get_port_status ()
+   self:command("ACCESS_REGISTER", 0x10, 0x1C)
+      :input("opcode", 0x00, 31, 16, 0x805)
+      :input("opmod", 0x04, 15, 0, 1) -- read
+      :input("register_id", 0x08, 15, 0, PAOS)
+      :input("local_port", 0x10, 23, 16, 1)
+      :execute()
+   return {admin_status = self:output(0x10, 11, 8),
+           oper_status = self:output(0x10, 3, 0)}
+end
+
+-- Read the PPLR register and return its loopback capability field.
+function HCA:get_port_loopback_capability ()
+   self:command("ACCESS_REGISTER", 0x10, 0x14)
+      :input("opcode", 0x00, 31, 16, 0x805)
+      :input("opmod", 0x04, 15, 0, 1) -- read
+      :input("register_id", 0x08, 15, 0, PPLR)
+      :input("local_port", 0x10, 23, 16, 1)
+      :execute()
+   -- Results are read with output(), the accessor every HCA command uses.
+   local capability = self:output(0x14, 23, 16)
+   return capability
+end
+
+-- Write the PPLR register: a true loopback_mode writes 2, otherwise 0.
+function HCA:set_port_loopback (loopback_mode)
+   self:command("ACCESS_REGISTER", 0x14, 0x0C)
+      :input("opcode", 0x00, 31, 16, 0x805)
+      :input("opmod", 0x04, 15, 0, 0) -- write
+      :input("register_id", 0x08, 15, 0, PPLR)
+      :input("local_port", 0x10, 23, 16, 1)
+      :input("loopback_mode", 0x14, 7, 0, loopback_mode and 2 or 0)
+      :execute()
 end
+---------------------------------------------------------------
+-- Command Interface implementation.
+--
+-- Sends commands to the HCA firmware and receives replies.
+-- Defined in "Command Interface" section of the PRM.
+--------------------------------------------------------------- + local cmdq_entry_t = ffi.typeof("uint32_t[0x40/4]") local cmdq_mailbox_t = ffi.typeof("uint32_t[0x240/4]") @@ -247,7 +754,7 @@ local max_mailboxes = 1000 local data_per_mailbox = 0x200 -- Bytes of input/output data in a mailbox -- Create a command queue with dedicated/reusable DMA memory. -function cmdq:new(init_seg) +function HCA:new (init_seg) local entry = ffi.cast("uint32_t*", memory.dma_alloc(0x40)) local inboxes, outboxes = {}, {} for i = 0, max_mailboxes-1 do @@ -261,34 +768,34 @@ function cmdq:new(init_seg) init_seg = init_seg, size = init_seg:log_cmdq_size(), stride = init_seg:log_cmdq_stride()}, - self) + {__index = HCA}) end -- Reset all data structures to zero values. -- This is to prevent leakage from one command to the next. local token = 0xAA -function cmdq:prepare(command, last_input_offset, last_output_offset) - print("Command: " .. command) - local input_size = last_input_offset + 4 - local output_size = last_output_offset + 4 +function HCA:command (command, last_input_offset, last_output_offset) + if debug_trace then + print("HCA command: " .. command) + end + self.input_size = last_input_offset + 4 + self.output_size = last_output_offset + 4 -- Command entry: ffi.fill(self.entry, ffi.sizeof(cmdq_entry_t), 0) - self:setbits(0x00, 31, 24, 0x7) -- type - self:setbits(0x04, 31, 0, input_size) - self:setbits(0x38, 31, 0, output_size) - self:setbits(0x3C, - 0, 0, 1, -- ownership = hardware - 31, 24, token) - + self:setbits(0x00, 31, 24, 0x7) -- type + self:setbits(0x04, 31, 0, self.input_size) + self:setbits(0x38, 31, 0, self.output_size) + self:setbits(0x3C, 0, 0, 1) -- ownership = hardware + self:setbits(0x3C, 31, 24, token) -- Mailboxes: -- How many mailboxes do we need? 
- local ninboxes = math.ceil((input_size - 16) / data_per_mailbox) - local noutboxes = math.ceil((output_size - 16) / data_per_mailbox) - if ninboxes > max_mailboxes then error("Input overflow: " ..input_size) end - if noutboxes > max_mailboxes then error("Output overflow: "..output_size) end + local ninboxes = math.ceil((self.input_size - 16) / data_per_mailbox) + local noutboxes = math.ceil((self.output_size - 16) / data_per_mailbox) + if ninboxes > max_mailboxes then error("Input overflow: " ..self.input_size) end + if noutboxes > max_mailboxes then error("Output overflow: "..self.output_size) end if ninboxes > 0 then local phy = memory.virtual_to_physical(self.inboxes[0]) @@ -310,8 +817,8 @@ function cmdq:prepare(command, last_input_offset, last_output_offset) setint(self.inboxes[i], 0x238, i) setint(self.outboxes[i], 0x238, i) -- Tokens to match command entry - setint(self.inboxes[i], 0x23C, setbits(23, 16, token)) - setint(self.outboxes[i], 0x23C, setbits(23, 16, token)) + setint(self.inboxes[i], 0x23C, setbits(23, 16, token, 0)) + setint(self.outboxes[i], 0x23C, setbits(23, 16, token, 0)) -- Set 'next' mailbox pointers (when used) if i < ninboxes then local phy = memory.virtual_to_physical(self.inboxes[i+1]) @@ -325,17 +832,49 @@ function cmdq:prepare(command, last_input_offset, last_output_offset) end end token = (token == 255) and 1 or token+1 + return self -- for method call chaining +end + +function HCA:getbits (offset, hi, lo) + return getbits(getint(self.entry, offset), hi, lo) +end + +function HCA:setbits (offset, hi, lo, value) + local base = getint(self.entry, offset) + setint(self.entry, offset, setbits(hi, lo, value, base)) end -function cmdq:getbits(ofs, bit2, bit1) - return getbits(getint(self.entry, ofs), bit2, bit1) +function HCA:input (name, offset, hi, lo, value) + assert(offset % 4 == 0) + if debug_trace and name then + print(("input @ %4xh (%2d:%2d) %-20s = %10xh (%d)"):format(offset, hi, lo, name, value, value)) + end + if offset <= 16 - 
4 then -- inline + self:setbits(0x10 + offset, hi, lo, value) + else + local mailbox_number = math.floor((offset - 16) / data_per_mailbox) + local mailbox_offset = (offset - 16) % data_per_mailbox + local base = getint(self.inboxes[mailbox_number], mailbox_offset) + local newvalue = setbits(hi, lo, value, base) + setint(self.inboxes[mailbox_number], mailbox_offset, newvalue) + end + return self -- for method call chaining end -function cmdq:setbits(ofs, ...) - setint(self.entry, ofs, setbits(...)) +function HCA:output (offset, hi, lo) + if offset <= 16 - 4 then --inline + return self:getbits(0x20 + offset, hi, lo) + else + local mailbox_number = math.floor((offset - 16) / data_per_mailbox) + local mailbox_offset = (offset - 16) % data_per_mailbox + return getbits(getint(self.outboxes[mailbox_number], mailbox_offset), hi, lo) + end end -function cmdq:setinbits(ofs, ...) --bit1, bit2, val, ... + + + +function HCA:setinbits (ofs, ...) --bit1, bit2, val, ... assert(ofs % 4 == 0) if ofs <= 16 - 4 then --inline self:setbits(0x10 + ofs, ...) @@ -346,7 +885,7 @@ function cmdq:setinbits(ofs, ...) --bit1, bit2, val, ... end end -function cmdq:getoutbits(ofs, bit2, bit1) +function HCA:getoutbits (ofs, bit2, bit1) if ofs <= 16 - 4 then --inline return self:getbits(0x20 + ofs, bit2, bit1) else --output mailbox @@ -374,7 +913,7 @@ local delivery_errors = { -- This is consistent with both the PRM and the Linux mlx5_core driver. 
} -local function checkz(z) +local function checkz (z) if z == 0 then return end error('command error: '..(delivery_errors[z] or z)) end @@ -402,8 +941,10 @@ local command_errors = { [0x40] = 'BAD_SIZE: More outstanding CQEs in CQ than new CQ size', } -function cmdq:post(last_in_ofs, last_out_ofs) - if debug then +function HCA:execute () + local last_in_ofs = self.input_size + local last_out_ofs = self.output_size + if debug_hexdump then local dumpoffset = 0 print("command INPUT:") dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) @@ -416,14 +957,18 @@ function cmdq:post(last_in_ofs, last_out_ofs) end end + assert(self:getbits(0x3C, 0, 0) == 1) self.init_seg:ring_doorbell(0) --post command --poll for command completion while self:getbits(0x3C, 0, 0) == 1 do - C.usleep(100000) + if self.init_seg:getbits(0x1010, 31, 24) ~= 0 then + error("HCA health syndrome: " .. bit.tohex(self.init_seg:getbits(0x1010, 31, 24))) + end + C.usleep(10000) end - if debug then + if debug_hexdump then local dumpoffset = 0 print("command OUTPUT:") dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) @@ -447,386 +992,107 @@ function cmdq:post(last_in_ofs, last_out_ofs) end -- see 12.2 Return Status Summary -function cmdq:checkstatus() +function HCA:checkstatus () local status = self:getoutbits(0x00, 31, 24) local syndrome = self:getoutbits(0x04, 31, 0) if status == 0 then return end - error(string.format('status: 0x%x (%s), syndrome: %d', + error(string.format('status: 0x%x (%s), syndrome: 0x%x', status, command_errors[status], syndrome)) end -function cmdq:enable_hca() - self:prepare("ENABLE_HCA", 0x0C, 0x08) - self:setinbits(0x00, 31, 16, ENABLE_HCA) - self:post(0x0C, 0x08) -end - -function cmdq:query_issi() - self:prepare("QUERY_ISSI", 0x0C, 0x6C) - self:setinbits(0x00, 31, 16, QUERY_ISSI) - self:post(0x0C, 0x6C) - local cur_issi = self:getoutbits(0x08, 15, 0) - local t = {} - for i = 639, 0, -1 do - -- Bit N (0..639) when set means ISSI version N is enabled. 
- -- Bits are ordered from highest to lowest. - local byte = 0x20 + math.floor(i / 8) - local offset = byte - (byte % 4) - local bit = 31 - (i % 32) - if self:getoutbits(offset, bit, bit) == 1 then - local issi = 639 - i - t[issi] = true - end - end - return { - cur_issi = cur_issi, - sup_issi = t, - } + + +--------------------------------------------------------------- +-- Initialization segment access. +-- +-- The initialization segment is a region of memory-mapped PCI +-- registers. This is an interface directly to the hardware and is +-- used for bootstrapping communication with the firmware (amongst +-- other things). +-- +-- Described in the "Initialization Segment" section of the PRM. +--------------------------------------------------------------- + +InitializationSegment = {} + +-- Create an initialization segment object. +-- ptr is a pointer to the memory-mapped registers. +function InitializationSegment:new (ptr) + return setmetatable({ptr = cast('uint32_t*', ptr)}, {__index = InitializationSegment}) end -function cmdq:set_issi(issi) - self:reset() - self:setinbits(0x00, 31, 16, SET_ISSI) - self:setinbits(0x08, 15, 0, issi) - self:post(0x0C, 0x0C) +function InitializationSegment:getbits (offset, hi, lo) + return getbits(getint(self.ptr, offset), hi, lo) end -function cmdq:dump_issi(issi) - print(' cur_issi = ', issi.cur_issi) - print(' sup_issi = ') - for i=0,79 do - if issi.sup_issi[i] then - print(string.format( - ' %02d ', i)) - end - end +function InitializationSegment:setbits (offset, hi, lo, value) + local base = getint(self.ptr, offset) + setint(self.ptr, offset, setbits(hi, lo, value, base)) end -local codes = { - boot = 1, - init = 2, - regular = 3, -} -function cmdq:query_pages(which) - self:prepare("QUERY_PAGES", 0x0C, 0x0C) - self:setinbits(0x00, 31, 16, QUERY_PAGES) - self:setinbits(0x04, 15, 0, codes[which]) - self:post(0x0C, 0x0C) - return self:getoutbits(0x0C, 31, 0) -end - -function cmdq:alloc_pages(num_pages) - 
self:prepare("MANAGE_PAGES", 0x10 + num_pages*8, 0x0C) - self:setinbits(0x00, 31, 16, MANAGE_PAGES) - self:setinbits(0x04, 15, 0, 1) --alloc - self:setinbits(0x0C, 31, 0, num_pages) - for i=0, num_pages-1 do - local _, phy = memory.dma_alloc(4096, 4096) - self:setinbits(0x10 + i*8, 31, 0, ptrbits(phy, 63, 32)) - self:setinbits(0x14 + i*8, 31, 12, ptrbits(phy, 31, 12)) - end - self:post(0x10 + num_pages*8, 0x0C) +function InitializationSegment:fw_rev () --maj, min, subminor + return + self:getbits(0, 15, 0), + self:getbits(0, 31, 16), + self:getbits(4, 15, 0) end --- Create Event Queue (EQ) -function cmdq:create_eq(numpages) - self:prepare("CREATE_EQ", 0x10C + numpages*8, 0x0C) - self:setinbits(0x00, 31, 16, CREATE_EQ) - -- Setup Event Queue Context: - -- +function InitializationSegment:cmd_interface_rev () + return self:getbits(4, 31, 16) +end - -- XXX Had wanted to use log_page_size=0 for 4KB pages - -- (2^0*4096=4096) but get BAD_INPUT_LEN errors. Have consulted the - -- hexdump for the Linux mlx5 driver and seen them choose - -- log_page_size=2 for presumably 16KB pages (2^2*4096=16384) and - -- mimicking this value resolves the error. - -- - -- So, it works, but questions: - -- 1. How come we are choosing a page size? 4KB is used elsewhere. - -- 2. Are we setting the value correctly or is there some silly bug? - -- 3. How come log_page_size 2 is okay but 0 is not? - -- (What is the root cause of the BAD_INPUT_LEN error, really?) 
- local status = 0 -- 0 = OK - local ec = 0 -- event collapse flag - local oi = 0 -- overrun ignore flag - local st = 0x0 -- (Card did not accept 0x0A) - local page_offset = 0 -- (must be 0) - local log_eq_size = 7 -- Log (base 2) of EQ size (in entries) - local uar_page = 0 -- UAR page 0 for main event queue - local intr = 0 -- MSI-X table entry (should not be used) - local log_page_size = 2 -- Log (base 2) of page size in 4KB units - local consumer_counter = 0 -- Software cursor (init to zero) - local producer_counter = 0 -- Hardware cursor (init to zero) - self:setinbits(0x10 + 0x00, - 31, 28, status, - 18, 18, ec, - 17, 17, oi, - 11, 8, st) - self:setinbits(0x10 + 0x08, 7,9, page_offset) - self:setinbits(0x10 + 0x0C, 28, 24, log_eq_size, 23, 0, uar_page) - self:setinbits(0x10 + 0x14, 7, 9, intr) - self:setinbits(0x10 + 0x18, 28, 24, log_page_size) - self:setinbits(0x10 + 0x28, 23, 0, consumer_counter) - self:setinbits(0x10 + 0x2C, 23, 0, producer_counter) - -- Set event bitmask - local events = bits{PageRequest=0x0A} - self:setinbits(0x10 + 0x5C, 31, 0, events) - -- Allocate pages in contiguous physical memory - local ptr, phy = memory.dma_alloc(4096 * numpages, 4096) - for i = 0, numpages-1 do - self:setinbits(0x110 + i*8, 31, 0, ptrbits(phy + i * 4096, 63, 32)) - self:setinbits(0x114 + i*8, 31, 0, ptrbits(phy + i * 4096, 31, 0)) +function InitializationSegment:cmdq_phy_addr (addr) + if addr then + --must write the MSB of the addr first + self:setbits(0x10, 31, 0, ptrbits(addr, 63, 32)) + --also resets nic_interface and log_cmdq_* + self:setbits(0x14, 31, 12, ptrbits(addr, 31, 12)) + else + return cast('void*', + cast('uint64_t', self:getbits(0x10, 31, 0) * 2^32 + + cast('uint64_t', self:getbits(0x14, 31, 12)) * 2^12)) end - self:post(0x10C + numpages*8, 0x0C) - return self:getoutbits(0x08, 7, 0) end -local what_codes = { - max = 0, - cur = 1, -} -local which_codes = { - general = 0, - offload = 1, - flow_table = 7, -} -function cmdq:query_hca_cap(what, which) 
- self:prepare("QUERY_HCA_CAP", 0x0C, 0x100C - 3000) - self:setinbits(0x00, 31, 16, QUERY_HCA_CAP) - self:setinbits(0x04, - 15, 1, assert(which_codes[which]), - 0, 0, assert(what_codes[what])) - self:post(0x0C, 0x100C - 3000) - local caps = {} - if which == 'general' then - caps.log_max_cq_sz = self:getoutbits(0x10 + 0x18, 23, 16) - caps.log_max_cq = self:getoutbits(0x10 + 0x18, 4, 0) - caps.log_max_eq_sz = self:getoutbits(0x10 + 0x1C, 31, 24) - caps.log_max_mkey = self:getoutbits(0x10 + 0x1C, 21, 16) - caps.log_max_eq = self:getoutbits(0x10 + 0x1C, 3, 0) - caps.max_indirection = self:getoutbits(0x10 + 0x20, 31, 24) - caps.log_max_mrw_sz = self:getoutbits(0x10 + 0x20, 22, 16) - caps.log_max_klm_list_size = self:getoutbits(0x10 + 0x20, 5, 0) - caps.end_pad = self:getoutbits(0x10 + 0x2C, 31, 31) - caps.start_pad = self:getoutbits(0x10 + 0x2C, 28, 28) - caps.cache_line_128byte = self:getoutbits(0x10 + 0x2C, 27, 27) - caps.vport_counters = self:getoutbits(0x10 + 0x30, 30, 30) - caps.vport_group_manager = self:getoutbits(0x10 + 0x34, 31, 31) - caps.nic_flow_table = self:getoutbits(0x10 + 0x34, 25, 25) - caps.port_type = self:getoutbits(0x10 + 0x34, 9, 8) - caps.num_ports = self:getoutbits(0x10 + 0x34, 7, 0) - caps.log_max_msg = self:getoutbits(0x10 + 0x38, 28, 24) - caps.max_tc = self:getoutbits(0x10 + 0x38, 19, 16) - caps.cqe_version = self:getoutbits(0x10 + 0x3C, 3, 0) - caps.cmdif_checksum = self:getoutbits(0x10 + 0x40, 15, 14) - caps.wq_signature = self:getoutbits(0x10 + 0x40, 11, 11) - caps.sctr_data_cqe = self:getoutbits(0x10 + 0x40, 10, 10) - caps.eth_net_offloads = self:getoutbits(0x10 + 0x40, 3, 3) - caps.cq_oi = self:getoutbits(0x10 + 0x44, 31, 31) - caps.cq_resize = self:getoutbits(0x10 + 0x44, 30, 30) - caps.cq_moderation = self:getoutbits(0x10 + 0x44, 29, 29) - caps.cq_eq_remap = self:getoutbits(0x10 + 0x44, 25, 25) - caps.scqe_break_moderation = self:getoutbits(0x10 + 0x44, 21, 21) - caps.cq_period_start_from_cqe = self:getoutbits(0x10 + 0x44, 20, 20) - 
caps.imaicl = self:getoutbits(0x10 + 0x44, 14, 14) - caps.xrc = self:getoutbits(0x10 + 0x44, 3, 3) - caps.ud = self:getoutbits(0x10 + 0x44, 2, 2) - caps.uc = self:getoutbits(0x10 + 0x44, 1, 1) - caps.rc = self:getoutbits(0x10 + 0x44, 0, 0) - caps.uar_sz = self:getoutbits(0x10 + 0x48, 21, 16) - caps.log_pg_sz = self:getoutbits(0x10 + 0x48, 7, 0) - caps.bf = self:getoutbits(0x10 + 0x4C, 31, 31) - caps.driver_version = self:getoutbits(0x10 + 0x4C, 30, 30) - caps.pad_tx_eth_packet = self:getoutbits(0x10 + 0x4C, 29, 29) - caps.log_bf_reg_size = self:getoutbits(0x10 + 0x4C, 20, 16) - caps.log_max_transport_domain = self:getoutbits(0x10 + 0x64, 28, 24) - caps.log_max_pd = self:getoutbits(0x10 + 0x64, 20, 16) - caps.max_flow_counter = self:getoutbits(0x10 + 0x68, 15, 0) - caps.log_max_rq = self:getoutbits(0x10 + 0x6C, 28, 24) - caps.log_max_sq = self:getoutbits(0x10 + 0x6C, 20, 16) - caps.log_max_tir = self:getoutbits(0x10 + 0x6C, 12, 8) - caps.log_max_tis = self:getoutbits(0x10 + 0x6C, 4, 0) - caps.basic_cyclic_rcv_wqe = self:getoutbits(0x10 + 0x70, 31, 31) - caps.log_max_rmp = self:getoutbits(0x10 + 0x70, 28, 24) - caps.log_max_rqt = self:getoutbits(0x10 + 0x70, 20, 16) - caps.log_max_rqt_size = self:getoutbits(0x10 + 0x70, 12, 8) - caps.log_max_tis_per_sq = self:getoutbits(0x10 + 0x70, 4, 0) - caps.log_max_stride_sz_rq = self:getoutbits(0x10 + 0x74, 28, 24) - caps.log_min_stride_sz_rq = self:getoutbits(0x10 + 0x74, 20, 16) - caps.log_max_stride_sz_sq = self:getoutbits(0x10 + 0x74, 12, 8) - caps.log_min_stride_sz_sq = self:getoutbits(0x10 + 0x74, 4, 0) - caps.log_max_wq_sz = self:getoutbits(0x10 + 0x78, 4, 0) - caps.log_max_vlan_list = self:getoutbits(0x10 + 0x7C, 20, 16) - caps.log_max_current_mc_list = self:getoutbits(0x10 + 0x7C, 12, 8) - caps.log_max_current_uc_list = self:getoutbits(0x10 + 0x7C, 4, 0) - caps.log_max_l2_table = self:getoutbits(0x10 + 0x90, 28, 24) - caps.log_uar_page_sz = self:getoutbits(0x10 + 0x90, 15, 0) - caps.device_frequency_mhz = 
self:getoutbits(0x10 + 0x98, 31, 0) - elseif which_caps == 'offload' then - --TODO - elseif which_caps == 'flow_table' then - --TODO - end - return caps -end - -function cmdq:set_hca_cap(which, caps) - self:prepare("SET_HCA_CAP", 0x100C - 3000, 0x0C) - self:setinbits(0x00, 31, 16, SET_HCA_CAP) - self:setinbits(0x04, 15, 1, assert(which_codes[which])) - if which == 'general' then - self:setinbits(0x10 + 0x18, - 23, 16, caps.log_max_cq_sz, - 4, 0, caps.log_max_cq) - self:setinbits(0x10 + 0x1C, - 31, 24, caps.log_max_eq_sz, - 21, 16, caps.log_max_mkey, - 3, 0, caps.log_max_eq) - self:setinbits(0x10 + 0x20, - 31, 24, caps.max_indirection, - 22, 16, caps.log_max_mrw_sz, - 5, 0, caps.log_max_klm_list_size) - self:setinbits(0x10 + 0x2C, - 31, 31, caps.end_pad, - 28, 28, caps.start_pad, - 27, 27, caps.cache_line_128byte) - self:setinbits(0x10 + 0x30, - 30, 30, caps.vport_counters) - self:setinbits(0x10 + 0x34, - 31, 31, caps.vport_group_manager, - 25, 25, caps.nic_flow_table, - 9, 8, caps.port_type, - 7, 0, caps.num_ports) - self:setinbits(0x10 + 0x38, - 28, 24, caps.log_max_msg, - 19, 16, caps.max_tc) - self:setinbits(0x10 + 0x3C, - 3, 0, caps.cqe_version) - self:setinbits(0x10 + 0x40, - 15, 14, caps.cmdif_checksum, - 11, 11, caps.wq_signature, - 10, 10, caps.sctr_data_cqe, - 3, 3, caps.eth_net_offloads) - self:setinbits(0x10 + 0x44, - 31, 31, caps.cq_oi, - 30, 30, caps.cq_resize, - 29, 29, caps.cq_moderation, - 25, 25, caps.cq_eq_remap, - 21, 21, caps.scqe_break_moderation, - 20, 20, caps.cq_period_start_from_cqe, - 14, 14, caps.imaicl, - 3, 3, caps.xrc, - 2, 2, caps.ud, - 1, 1, caps.uc, - 0, 0, caps.rc) - self:setinbits(0x10 + 0x48, - 21, 16, caps.uar_sz, - 7, 0, caps.log_pg_sz) - self:setinbits(0x10 + 0x4C, - 31, 31, caps.bf, - 30, 30, caps.driver_version, - 29, 29, caps.pad_tx_eth_packet, - 20, 16, caps.log_bf_reg_size) - self:setinbits(0x10 + 0x64, - 28, 24, caps.log_max_transport_domain, - 20, 16, caps.log_max_pd) - self:setinbits(0x10 + 0x68, - 15, 0, 
caps.max_flow_counter) - self:setinbits(0x10 + 0x6C, - 28, 24, caps.log_max_rq, - 20, 16, caps.log_max_sq, - 12, 8, caps.log_max_tir, - 4, 0, caps.log_max_tis) - self:setinbits(0x10 + 0x70, - 31, 31, caps.basic_cyclic_rcv_wqe, - 28, 24, caps.log_max_rmp, - 20, 16, caps.log_max_rqt, - 12, 8, caps.log_max_rqt_size, - 4, 0, caps.log_max_tis_per_sq) - self:setinbits(0x10 + 0x74, - 28, 24, caps.log_max_stride_sz_rq, - 20, 16, caps.log_min_stride_sz_rq, - 12, 8, caps.log_max_stride_sz_sq, - 4, 0, caps.log_min_stride_sz_sq) - self:setinbits(0x10 + 0x78, - 4, 0, caps.log_max_wq_sz) - self:setinbits(0x10 + 0x7C, - 20, 16, caps.log_max_vlan_list, - 12, 8, caps.log_max_current_mc_list, - 4, 0, caps.log_max_current_uc_list) - self:setinbits(0x10 + 0x90, - 28, 24, caps.log_max_l2_table, - 15, 0, caps.log_uar_page_sz) - self:setinbits(0x10 + 0x98, - 31, 0, caps.device_frequency_mhz) - elseif which == 'offload' then - self:setinbits(0x10 + 0x00, - 31, 31, caps.csum_cap, - 30, 30, caps.vlan_cap, - 29, 29, caps.lro_cap, - 28, 28, caps.lro_psh_flag, - 27, 27, caps.lro_time_stamp, - 26, 25, caps.lro_max_msg_sz_mode, - 23, 23, caps.self_lb_en_modifiable, - 22, 22, caps.self_lb_mc, - 21, 21, caps.self_lb_uc, - 20, 16, caps.max_lso_cap, - 13, 12, caps.wqe_inline_mode, - 11, 8, caps.rss_ind_tbl_cap) - self:setinbits(0x10 + 0x08, - 15, 0, caps.lro_min_mss_size) - for i = 1, 4 do - self:setinbits(0x10 + 0x30 + (i-1)*4, 31, 0, caps.lro_timer_supported_periods[i]) - end - elseif which == 'flow_table' then - --TODO - end - self:post(0x100C, 0x0C) +function InitializationSegment:nic_interface (mode) + self:setbits(0x14, 9, 8, mode) end --- XXX VPORT commands /may/ not be needed since we are not using SR-IOV. --- In this case the functions below can be removed. 
+function InitializationSegment:log_cmdq_size () + return self:getbits(0x14, 7, 4) +end -function cmdq:query_vport_state() - self:prepare("QUERY_VPORT_STATE", 0x0c, 0x0c) - self:setinbits(0x00, 31, 16, QUERY_VPORT_STATE) - self:post(0x0C, 0x0C) - return { admin_state = self:getoutbits(0x0C, 7, 4), - oper_state = self:getoutbits(0x0C, 3, 0) } +function InitializationSegment:log_cmdq_stride () + return self:getbits(0x14, 3, 0) end -function cmdq:modify_vport_state(admin_state) - self:prepare("MODIFY_VPORT_STATE", 0x0c, 0x0c) - self:setinbits(0x00, 31, 16, MODIFY_VPORT_STATE) - self:setinbits(0x0C, 7, 4, admin_state) - self:post(0x0C, 0x0C) +function InitializationSegment:ring_doorbell (i) + self:setbits(0x18, i, i, 1) end -function cmdq:query_nic_vport_context() - -- XXX This command can be used to manipulate long lists of allowed - -- unicast addresses, multicast addresses, and VLANs. For now we - -- skip that (leave the list length as zero) and access only the - -- global settings. Is this interaction correct ? - self:prepare("QUERY_NIC_VPORT_CONTEXT", 0x0c, 0x10+0xFC) - self:setinbits(0x00, 31, 16, 0x754) -- Command opcode - self:post(0x0C, 0x10+0xFC) - local mac_hi = self:getoutbits(0x10+0xF4, 31, 0) - local mac_lo = self:getoutbits(0x10+0xF8, 31, 0) - local mac_hex = bit.tohex(mac_hi, 4) .. 
bit.tohex(mac_lo, 8) - return { mtu = self:getoutbits(0x10+0x24, 15, 0), - promisc_uc = self:getoutbits(0x10+0xf0, 31, 31), - promisc_mc = self:getoutbits(0x10+0xf0, 30, 30), - promisc_all = self:getoutbits(0x10+0xf0, 29, 29), - permanent_address = mac_hex } +function InitializationSegment:ready (i, val) + return self:getbits(0x1fc, 31, 31) == 0 +end + +function InitializationSegment:nic_interface_supported () + return self:getbits(0x1fc, 26, 24) == 0 end -function cmdq:init_hca() - self:prepare("INIT_HCA", 0x0c, 0x0c) - self:setinbits(0x00, 31, 16, INIT_HCA) - self:post(0x0C, 0x0C) +function InitializationSegment:internal_timer () + return + self:getbits(0x1000, 31, 0) * 2^32 + + self:getbits(0x1004, 31, 0) end -function init_seg:dump() +function InitializationSegment:clear_int () + self:setbits(0x100c, 0, 0, 1) +end + +function InitializationSegment:health_syndrome () + return self:getbits(0x1010, 31, 24) +end + +function InitializationSegment:dump () print('fw_rev ', self:fw_rev()) print('cmd_interface_rev ', self:cmd_interface_rev()) print('cmdq_phy_addr ', self:cmdq_phy_addr()) @@ -838,101 +1104,12 @@ function init_seg:dump() print('health_syndrome ', self:health_syndrome()) end -function ConnectX4:new(arg) - local self = setmetatable({}, self) - local conf = config.parse_app_arg(arg) - local pciaddress = pci.qualified(conf.pciaddress) - - -- Perform a hard reset of the device to bring it into a blank state. - -- (PRM does not suggest this but it is practical for resetting the - -- firmware from bad states.) 
- pci.unbind_device_from_linux(pciaddress) - pci.reset_device(pciaddress) - pci.set_bus_master(pciaddress, true) - local base, fd = pci.map_pci_memory(pciaddress, 0, true) - - trace("Read the initialization segment") - local init_seg = init_seg:init(base) - - --allocate and set the command queue which also initializes the nic - local cmdq = cmdq:new(init_seg) - - --8.2 HCA Driver Start-up - - trace("Write the physical location of the command queues to the init segment.") - init_seg:cmdq_phy_addr(memory.virtual_to_physical(cmdq.entry)) - - trace("Wait for the 'initializing' field to clear") - while not init_seg:ready() do - C.usleep(1000) - end - - init_seg:dump() - - cmdq:enable_hca() - local issi = cmdq:query_issi() - cmdq:dump_issi(issi) - - --os.exit(0) - --cmdq:set_issi(1) - - -- PRM: Execute QUERY_PAGES to understand the HCA need to boot pages. - local boot_pages = cmdq:query_pages'boot' - print("query_pages'boot' ", boot_pages) - assert(boot_pages > 0) - - -- PRM: Execute MANAGE_PAGES to provide the HCA with all required - -- init-pages. This can be done by multiple MANAGE_PAGES commands. - cmdq:alloc_pages(boot_pages) - - local cur = cmdq:query_hca_cap('cur', 'general') - local max = cmdq:query_hca_cap('max', 'general') - print'Capabilities - current and (maximum):' - for k in pairs(cur) do - print((" %-24s = %-3s (%s)"):format(k, cur[k], max[k])) - end - - cmdq:set_hca_cap('general', cur) - - -- Initialization pages - local init_pages = cmdq:query_pages('init') - print("query_pages'init' ", init_pages) - assert(init_pages > 0) - - cmdq:alloc_pages(init_pages) - - cmdq:init_hca() - - local eq = cmdq:create_eq(1) - print("eq = " .. 
eq) - - local vport_ctx = cmdq:query_nic_vport_context() - for k,v in pairs(vport_ctx) do - print(k,v) - end - - --[[ - cmdq:set_hca_cap() - cmdq:query_pages() - cmdq:manage_pages() - cmdq:init_hca() - cmdq:set_driver_version() - cmdq:create_eq() - cmdq:query_vport_state() - cmdq:modify_vport_context() - ]] - - function self:stop() - pci.set_bus_master(pciaddress, false) - pci.reset_device(pciaddress) - pci.close_pci_resource(fd, base) - base, fd = nil - end - return self -end +--------------------------------------------------------------- +-- Utilities. +--------------------------------------------------------------- --- Print a hexdump in the same format as the Linux kernel. +-- Print a hexdump in the same format as the Linux kernel mlx5 driver. -- -- Optionally take a 'dumpoffset' giving the logical address where the -- trace starts (useful when printing multiple related hexdumps i.e. @@ -958,7 +1135,54 @@ function trace (...) print("TRACE", ...) end -function selftest() +-- Utilities for peeking and poking bitfields of 32-bit big-endian integers. +-- Pointers are uint32_t* and offsets are in bytes. + +-- Return the value at offset from address. +function getint (pointer, offset) + assert(offset % 4 == 0, "offset not dword-aligned") + local r = bswap(pointer[offset/4]) + --print("getint", pointer, offset, r, bit.tohex(r)) + return r +end + +-- Set the the value at offset from address. +function setint (pointer, offset, value) + assert(offset % 4 == 0, "offset not dword-aligned") + pointer[offset/4] = bswap(tonumber(value)) +end + +-- Return the hi:lo bits of value. +function getbits (value, hi, lo) + local mask = shl(2^(hi-lo+1)-1, lo) + local r = shr(band(value, mask), lo) + --print("getbits", bit.tohex(value), hi, lo, bit.tohex(r)) + return r +end + +-- Return the hi:lo bits of a pointer. +function ptrbits (pointer, hi, lo) + return tonumber(getbits(cast('uint64_t', pointer), hi, lo)) +end + +-- Set value in bits hi:lo of (optional) base. 
+function setbits (hi, lo, value, base) + base = base or 0 + local mask = shl(2^(hi-lo+1)-1, lo) + local newbits = band(shl(value, lo), mask) + local oldbits = band(base, bnot(mask)) + return bor(newbits, oldbits) +end + +function log2size (size) + -- Return log2 of size rounded up to nearest whole number. + -- + -- Note: Lua provides only natural logarithm function (base e) built-in. + -- See http://www.mathwords.com/c/change_of_base_formula.htm + return math.ceil(math.log(size) / math.log(2)) +end + +function selftest () io.stdout:setvbuf'no' local pcidev = lib.getenv("SNABB_PCI_CONNECTX4_0") From 21d0dc36d1f3dd1c022d19c3777339f52c3609b2 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Fri, 28 Oct 2016 08:53:29 +0000 Subject: [PATCH 025/209] connectx4: Refactored for multiprocess operation The Mellanox driver is now split into two apps: 'ConnectX4' to setup the NIC and 'IO' to attach to a queue pair. --- src/apps/mellanox/connectx4.lua | 546 +++++++++++++++++++++++++++----- 1 file changed, 473 insertions(+), 73 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index c15046a923..b2279d7180 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -38,15 +38,21 @@ local index_set = require("lib.index_set") local macaddress = require("lib.macaddress") local mib = require("lib.ipc.shmem.mib") local timer = require("core.timer") +local shm = require("core.shm") +local counter = require("core.counter") local bits, bitset = lib.bits, lib.bitset local floor = math.floor local cast = ffi.cast local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot -local debug_trace = true -- Print trace messages +local debug_trace = false -- Print trace messages local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) +-- Maximum size of a receive queue table. +-- XXX This is hard-coded in the Linux mlx5 driver too. 
Could +-- alternatively detect from query_hca_cap. +local rqt_max_size = 128 --------------------------------------------------------------- -- ConnectX4 Snabb app. @@ -58,9 +64,8 @@ local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) ConnectX4 = {} ConnectX4.__index = ConnectX4 -function ConnectX4:new (arg) +function ConnectX4:new (conf) local self = setmetatable({}, self) - local conf = config.parse_app_arg(arg) local pciaddress = pci.qualified(conf.pciaddress) local sendq_size = conf.sendq_size or 1024 @@ -76,14 +81,12 @@ function ConnectX4:new (arg) -- Setup the command channel -- - local base, fd = pci.map_pci_memory(pciaddress, 0, true) - local init_seg = InitializationSegment:new(base) + local mmio, fd = pci.map_pci_memory(pciaddress, 0, true) + local init_seg = InitializationSegment:new(mmio) local hca = HCA:new(init_seg) - trace("Write the physical location of the command queues to the init segment.") init_seg:cmdq_phy_addr(memory.virtual_to_physical(hca.entry)) if debug_trace then init_seg:dump() end - trace("Wait for the 'initializing' field to clear") while not init_seg:ready() do C.usleep(1000) end @@ -93,7 +96,7 @@ function ConnectX4:new (arg) hca:enable_hca() hca:set_issi(1) hca:alloc_pages(hca:query_pages("boot")) - if debug_trace then self:dump_capabilities() end + if debug_trace then self:dump_capabilities(hca) end -- Initialize the card -- @@ -114,21 +117,64 @@ function ConnectX4:new (arg) -- Create send and receive queues & associated objects -- local tis = hca:create_tis(0, tdomain) - local send_cq = hca:create_cq(1024, uar, eq.eqn) - local recv_cq = hca:create_cq(1024, uar, eq.eqn) - - -- Allocate work queue memory (receive & send contiguous in memory) - local wq_doorbell = memory.dma_alloc(16) - local sendq_size = 1024 - local recvq_size = 1024 - local workqueues = memory.dma_alloc(64 * (sendq_size + recvq_size), 4096) - local rwq = workqueues -- receive work queue - local swq = workqueues + 64 * recvq_size -- send work queue 
- - -- Create the queue objects - local sq = hca:create_sq(send_cq.cqn, pd, sendq_size, wq_doorbell, swq, tis) - local rq = hca:create_rq(recv_cq.cqn, pd, recvq_size, wq_doorbell, rwq) - local tir = hca:create_tir_direct(rq.rqn, tdomain) + -- List of all receive queues for hashing traffic across + local rqlist = {} + + for _, queuename in ipairs(conf.queues) do + + local send_cq = hca:create_cq(1, uar, eq.eqn, true) + local recv_cq = hca:create_cq(recvq_size, uar, eq.eqn, false) + + -- Allocate work queue memory (receive & send contiguous in memory) + local wq_doorbell = memory.dma_alloc(16) + local sendq_size = 1024 + local recvq_size = 1024 + local workqueues = memory.dma_alloc(64 * (sendq_size + recvq_size), 4096) + local rwq = workqueues -- receive work queue + local swq = workqueues + 64 * recvq_size -- send work queue + + -- Create the queue objects + local sqn = hca:create_sq(send_cq, pd, sendq_size, wq_doorbell, swq, uar, tis) + hca:modify_sq(sqn, 0, 1) -- RESET -> READY + local rqn = hca:create_rq(recv_cq, pd, recvq_size, wq_doorbell, rwq) + hca:modify_rq(rqn, 0, 1) -- RESET -> READY + + table.insert(rqlist, rqn) + + -- Create shared memory objects containing all of the + -- information needed to access the send and receive queues. + -- + -- Snabb processes will use this information to take ownership + -- of the queue to send and receive packets. 
+ local basepath = "/pci/"..pciaddress.."/"..queuename + local sendpath = basepath.."/send" + local recvpath = basepath.."/recv" + local u64 = function (x) return ffi.cast("uint64_t", x) end + shm.create_frame(sendpath, + {lock = {counter}, + sqn = {counter, sqn}, + wq = {counter, u64(swq)}, + wqsize = {counter, sendq_size}, + cqn = {counter, send_cq.cqn}, + cqe = {counter, u64(send_cq.cqe)}, + doorbell = {counter, u64(wq_doorbell)}, + uar_page = {counter, uar}, + rlkey = {counter, rlkey}}) + shm.create_frame(recvpath, + {lock = {counter}, + rqn = {counter, rqn}, + wq = {counter, u64(rwq)}, + wqsize = {counter, recvq_size}, + cqn = {counter, recv_cq.cqn}, + cqe = {counter, u64(recv_cq.cqe)}, + doorbell = {counter, u64(wq_doorbell)}, + uar_page = {counter, uar}, + rlkey = {counter, rlkey}}) + end + + --local tir = hca:create_tir_direct(rqlist[1], tdomain) + local rqt = hca:create_rqt(rqlist) + local tir = hca:create_tir_indirect(rqt, tdomain) -- Setup packet dispatching. -- Just a "wildcard" flow group to send RX packets to the receive queue. @@ -138,23 +184,26 @@ function ConnectX4:new (arg) hca:set_flow_table_entry_wildcard(rx_flow_table_id, NIC_RX, flow_group_id, 0, tir) hca:set_flow_table_root(rx_flow_table_id, NIC_RX) - function self:stop() + function self:stop () pci.set_bus_master(pciaddress, false) pci.reset_device(pciaddress) - pci.close_pci_resource(fd, base) - base, fd = nil + pci.close_pci_resource(fd, mmio) + mmio, fd = nil end + -- Save "instance variable" values. + self.hca = hca + return self end -function ConnectX4:dump_capabilities () - if true then return end +function ConnectX4:dump_capabilities (hca) + --if true then return end -- Print current and maximum card capabilities. -- XXX Check if we have any specific requirements that we need to -- set and/or assert on. 
- local cur = self.hca:query_hca_general_cap('current') - local max = self.hca:query_hca_general_cap('max') + local cur = hca:query_hca_general_cap('current') + local max = hca:query_hca_general_cap('max') print'Capabilities - current and (maximum):' for k in pairs(cur) do print((" %-24s = %-3s (%s)"):format(k, cur[k], max[k])) @@ -173,6 +222,16 @@ function ConnectX4:check_vport () end end +function ConnectX4:print_vport_counter () + local c = self.hca:query_vport_counter() + local t = {} + -- Sort into key order + for k in pairs(c) do table.insert(t, k) end + table.sort(t) + for _, k in pairs(t) do + print(("%12s %s"):format(lib.comma_value(c[k]), k)) + end +end --------------------------------------------------------------- -- Firmware commands. @@ -260,14 +319,14 @@ end -- Provide the NIC with freshly allocated memory. function HCA:alloc_pages (num_pages) - self:command("MANAGE_PAGES", 0x10 + num_pages*8, 0x0C) + self:command("MANAGE_PAGES", 0x14 + num_pages*8, 0x0C) :input("opcode", 0x00, 31, 16, 0x108) :input("opmod", 0x04, 15, 0, 1) -- allocate mode :input("input_num_entries", 0x0C, 31, 0, num_pages, "input_num_entries") - for i=0, num_pages do + for i=0, num_pages-1 do local _, phy = memory.dma_alloc(4096, 4096) self:input(nil, 0x10 + i*8, 31, 0, ptrbits(phy, 63, 32)) - self:input(nil, 0x14 + i*8, 31, 12, ptrbits(phy, 31, 12)) + self:input(nil, 0x14 + i*8, 31, 12, ptrbits(phy, 31, 12)) end self:execute() end @@ -394,7 +453,7 @@ local eqe_t = ffi.typeof[[ uint8_t signature; uint8_t owner; } -]] + ]] eq = {} eq.__index = eq @@ -453,12 +512,29 @@ function HCA:query_vport_counter () self:command("QUERY_VPORT_COUNTER", 0x1c, 0x20c) :input("opcode", 0x00, 31, 16, 0x770) :execute() - -- XXX return in a table - print("vport counters") - for i = 0x10, 0x200, 4 do - local n = self:output(i, 31, 0) - if n > 0 then print(bit.tohex(i), n) end + local function get64 (offset) + local hi = self:output(offset, 31, 0) + local lo = self:output(offset + 4, 31, 0) + return lo + 
(hi * 2^32) end + return { + rx_error_packets = get64(0x10), + rx_error_octets = get64(0x18), + tx_error_packets = get64(0x20), + tx_error_octets = get64(0x28), + rx_bcast_packets = get64(0x70), + rx_bcast_octets = get64(0x78), + tx_bcast_packets = get64(0x80), + tx_bcast_octets = get64(0x88), + rx_ucast_packets = get64(0x90), + rx_ucast_octets = get64(0x98), + tx_ucast_packets = get64(0xA0), + tx_ucast_octets = get64(0xA8), + rx_mcast_packets = get64(0xB0), + rx_mcast_octets = get64(0xB8), + tx_mcast_packets = get64(0xC0), + tx_mcast_octets = get64(0xC8) + } end function HCA:query_nic_vport_context () @@ -497,7 +573,38 @@ function HCA:create_tir_direct (rqn, transport_domain) :execute() return self:output(0x08, 23, 0) end - + +-- Create a TIR with indirect dispatching (hashing) +function HCA:create_tir_indirect (rqt, transport_domain) + self:command("CREATE_TIR", 0x10C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x900) + :input("disp_type", 0x20 + 0x04, 31, 28, 1) -- indirect + :input("rx_hash_symmetric",0x20 + 0x20, 31, 31, 1) -- hash symmetrically + :input("indirect_table", 0x20 + 0x20, 23, 0, rqt) + :input("rx_hash_fn", 0x20 + 0x24, 31, 28, 2) -- toeplitz + :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) + -- XXX Is random hash key a good solution? + for i = 0x28, 0x4C, 4 do + self:input("toeplitz_key["..((i-0x28)/4).."]", 0x20 + i, 31, 0, math.random(2^32)) + end + self:execute() + return self:output(0x08, 23, 0) +end + +function HCA:create_rqt (rqlist) + -- Problem: Hardware requires number of hash buckets to be a power of 2. + -- Workaround: Setup max # hash buckets and fill with queues in a loop. 
+ self:command("CREATE_RQT", 0x20 + 0xF0 + 4*rqt_max_size, 0x0C) + :input("opcode", 0x00, 31, 16, 0x916) + :input("rqt_max_size", 0x20 + 0x14, 15, 0, rqt_max_size) + :input("rqt_actual_size", 0x20 + 0x18, 15, 0, rqt_max_size) + for i = 0, rqt_max_size-1 do + self:input("rq_num["..i.."]", 0x20 + 0xF0 + i*4, 23, 0, rqlist[1 + (i % #rqlist)]) + end + self:execute() + return self:output(0x08, 23, 0) +end + -- Create TIS (Transport Interface Send) function HCA:create_tis (prio, transport_domain) self:command("CREATE_TIS", 0x20 + 0x9C, 0x0C) @@ -525,13 +632,16 @@ function HCA:alloc_protection_domain () end -- Create a completion queue and return a completion queue object. -function HCA:create_cq (entries, uar_page, eqn, db_phy) +function HCA:create_cq (entries, uar_page, eqn, collapsed) local doorbell, doorbell_phy = memory.dma_alloc(16) -- Memory for completion queue entries local cqe, cqe_phy = memory.dma_alloc(entries * 64, 4096) + ffi.fill(cqe, entries * 64, 0xFF) self:command("CREATE_CQ", 0x114, 0x0C) :input("opcode", 0x00, 31, 16, 0x400) - :input("log_cq_size", 0x10 + 0x0C, 28, 24, 10) + :input("cc", 0x10 + 0x00, 20, 20, collapsed and 1 or 0) + :input("oi", 0x10 + 0x00, 17, 17, collapsed and 1 or 0) + :input("log_cq_size", 0x10 + 0x0C, 28, 24, log2size(entries)) :input("uar_page", 0x10 + 0x0C, 23, 0, uar_page) :input("c_eqn", 0x10 + 0x14, 7, 0, eqn) :input("log_page_size", 0x10 + 0x18, 28, 24, 4) @@ -546,7 +656,7 @@ end -- Create a receive queue and return a receive queue object. -- Return the receive queue number and a pointer to the WQEs. 
-function HCA:create_rq (cqn, pd, size, doorbell, rwq) +function HCA:create_rq (cq, pd, size, doorbell, rwq) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local rwq_phy = memory.virtual_to_physical(rwq) @@ -554,7 +664,7 @@ function HCA:create_rq (cqn, pd, size, doorbell, rwq) :input("opcode", 0x00, 31, 16, 0x908) :input("rlkey", 0x20 + 0x00, 31, 31, 1) :input("vlan_strip_disable", 0x20 + 0x00, 28, 28, 1) - :input("cqn", 0x20 + 0x08, 23, 0, cqn) + :input("cqn", 0x20 + 0x08, 23, 0, cq.cqn) :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd) :input("dbr_addr high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) @@ -565,15 +675,7 @@ function HCA:create_rq (cqn, pd, size, doorbell, rwq) :input("pas[0] high", 0x20 + 0x30 + 0xC0, 63, 32, ptrbits(rwq_phy, 63, 32)) :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(rwq_phy, 31, 0)) :execute() - local rqn = self:output(0x08, 23, 0) - return RQ:new(rqn, rwq, doorbell) -end - -RQ = {} - -function RQ:new (rqn, rwq, doorbell) - return setmetatable({rqn = rqn, rwq = rwq, doorbell = doorbell}, - {__index = RQ}) + return self:output(0x08, 23, 0) end -- Modify a Receive Queue by making a state transition. @@ -598,7 +700,7 @@ end -- Create a Send Queue. -- Return the send queue number and a pointer to the WQEs. 
-function HCA:create_sq (cqn, pd, size, doorbell, swq, tis) +function HCA:create_sq (cq, pd, size, doorbell, swq, uar, tis) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local swq_phy = memory.virtual_to_physical(swq) @@ -608,11 +710,12 @@ function HCA:create_sq (cqn, pd, size, doorbell, swq, tis) :input("fre", 0x20 + 0x00, 29, 29, 1) :input("flush_in_error_en", 0x20 + 0x00, 28, 28, 1) :input("min_wqe_inline_mode", 0x20 + 0x00, 26, 24, 1) - :input("cqn", 0x20 + 0x08, 23, 0, cqn) + :input("cqn", 0x20 + 0x08, 23, 0, cq.cqn) :input("tis_lst_sz", 0x20 + 0x20, 31, 16, 1) :input("tis", 0x20 + 0x2C, 23, 0, tis) :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd) + :input("uar_page", 0x20 + 0x30 + 0x0C, 23, 0, uar) :input("pas[0] high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) :input("pas[0] low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0)) :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, 6) @@ -620,16 +723,269 @@ function HCA:create_sq (cqn, pd, size, doorbell, swq, tis) :input("log_wq_size", 0x20 + 0x30 + 0x20, 4, 0, log_wq_size) :input("pas[0] high", 0x20 + 0x30 + 0xC0, 31, 0, ptrbits(swq_phy, 63, 32)) :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(swq_phy, 31, 0)) + :execute() - local sqn = self:output(0x08, 23, 0) - return SQ:new(sqn, swq, doorbell) + return self:output(0x08, 23, 0) +end + +--------------------------------------------------------------- +-- IO app: attach to transmit and receive queues. 
+--------------------------------------------------------------- + +IO = {} +IO.__index = IO + +function IO:new (conf) + local self = setmetatable({}, self) + local pciaddress = pci.qualified(conf.pciaddress) + local mmio, fd = pci.map_pci_memory(pciaddress, 0, false) + + self.sqlist = {} + self.rqlist = {} + + for _, queuename in ipairs(conf.queues) do + + local basepath = "/pci/"..pciaddress.."/"..queuename + local sendpath = basepath.."/send" + local recvpath = basepath.."/recv" + + local send = shm.open_frame(sendpath) + local recv = shm.open_frame(recvpath) + + local sq = SQ:new(tonumber(counter.read(send.sqn)), + counter.read(send.wq), + tonumber(counter.read(send.wqsize)), + counter.read(send.doorbell), + mmio, + tonumber(counter.read(send.uar_page)), + tonumber(counter.read(send.rlkey)), + counter.read(send.cqe)) + local rq = RQ:new(counter.read(recv.rqn), + counter.read(recv.wq), + tonumber(counter.read(recv.wqsize)), + counter.read(recv.doorbell), + tonumber(counter.read(recv.rlkey)), + counter.read(recv.cqe)) + rq:refill() + table.insert(self.sqlist, sq) + table.insert(self.rqlist, rq) + end + return self +end + +function IO:push () + local l = self.input.input + if l == nil then return end + while l and not link.empty(l) do + local sq = self.sqlist[1] + sq:transmit(l) + sq:reclaim() + end +end + +function IO:pull () + -- Free transmitted packets + for q = 1, #self.sqlist do + self.sqlist[q]:reclaim() + end + -- Input received packets + local l = self.output.output + if l == nil then return end + for q = 1, #self.rqlist do + --self.rqlist[q]:enqueue(packet.allocate()) + self.rqlist[q]:ring_doorbell() + self.rqlist[q]:receive(l) + end end +--------------------------------------------------------------- +-- Receive queue + +-- Work queue entries have irregular shapes and sizes. +-- We operate on them simply as 64-byte chunks. 
+local wqe_t = ffi.typeof[[ + union { + uint8_t u8[64]; + uint32_t u32[0]; + uint64_t u64[0]; + } * +]] + +-- CQEs are similar to WQEs. +local cqe_t = wqe_t + +local doorbell_t = ffi.typeof[[ + struct { + uint32_t receive; + uint32_t send; + }* +]] + +RQ = {} + +local rwqe_t = ffi.typeof[[ + struct { + uint32_t length, lkey, address_high, address_low; + } * +]] + +function RQ:new (rqn, rwq, wqsize, doorbell, rlkey, cq) + local self = {} + -- Convert arguments to internal types + doorbell = ffi.cast(doorbell_t, doorbell) + rwq = ffi.cast(rwqe_t, rwq) + cqe = ffi.cast(cqe_t, cq) + -- Additional state + local packets = ffi.new("struct packet *[?]", wqsize) + local next_buffer = 0 -- next position for a buffer in wqe + local next_completion = 0 -- next completion queue position to process + local mine = 0 -- cqe ownership bit meaning software-owned + + -- Refill with buffers + function self:refill () + while packets[next_buffer % wqsize] == nil do + local p = packet.allocate() + packets[next_buffer % wqsize] = p + local rwqe = rwq[next_buffer % wqsize] + local phy = memory.virtual_to_physical(p.data) + rwqe.length = bswap(packet.max_payload) + rwqe.lkey = bswap(rlkey) + rwqe.address_high = bswap(tonumber(shr(phy, 32))) + rwqe.address_low = bswap(tonumber(band(phy, 0xFFFFFFFF))) + next_buffer = (next_buffer + 1) % 65536 + end + end + + function self:receive (l) + while not link.full(l) do + -- Find the next completion entry. + local c = cqe[next_completion] + local owner = bit.band(1, c.u8[0x3F]) + if owner ~= mine then + -- Completion entry is not available yet. + break + end + -- Advance to next completion. + next_completion = (next_completion + 1) % wqsize -- XXX cqsize + -- Toggle the ownership value if the CQ wraps around. + if next_completion == 0 then + mine = (mine + 1) % 2 + end + -- Decode the completion entry. 
+ local opcode = shr(c.u8[0x3F], 4) + local len = bswap(c.u32[0x2C/4]) + local wqe = shr(bswap(c.u32[0x3C/4]), 16) + local idx = wqe % wqsize + if opcode == 0 or opcode == 2 then + -- Successful transmission. + assert(packets[idx] ~= nil) + link.transmit(l, packets[idx]) + packets[idx] = nil + elseif opcode == 13 or opcode == 14 then + local syndromes = { + [0x1] = "Local_Length_Error", + [0x4] = "Local_Protection_Error", + [0x5] = "Work_Request_Flushed_Error", + [0x6] = "Memory_Window_Bind_Error", + [0x10] = "Bad_Response_Error", + [0x11] = "Local_Access_Error", + [0x12] = "Remote_Invalid_Request_Error", + [0x13] = "Remote_Access_Error", + [0x14] = "Remote_Operation_Error" + } + local syndrome = c.u8[0x37] + print(("Got error. opcode=%d syndrome=0x%x message=%s"):format( + opcode, syndrome, syndromes[syndromes])) -- XXX + -- Error on transmission. + assert(packets[idx] ~= nil) + packet.free(packets[idx]) + packets[idx] = nil + else + error(("Unexpected CQE opcode: %d (0x%x)"):format(opcode, opcode)) + end + end + end + + function self:ring_doorbell () + doorbell[0].receive = bswap(next_buffer) + end + + return self +end + +--------------------------------------------------------------- +-- Send queue + SQ = {} -function SQ:new (sqn, swq, doorbell) - return setmetatable({sqn = sqn, swq = swq, doorbell = doorbell}, - {__index = SQ}) +function SQ:new (sqn, swq, wqsize, doorbell, mmio, uar, rlkey, cq) + local self = {} + -- Cast pointers to expected types + mmio = ffi.cast("uint8_t*", mmio) + swq = ffi.cast(wqe_t, swq) + doorbell = ffi.cast(doorbell_t, doorbell) + -- Additional state + local packets = ffi.new("struct packet *[?]", wqsize) + local next_packet = 0 + local next_wqeid = 0 + -- Locate "blue flame" register areas for the UAR page + local bf_next = ffi.cast("uint64_t*", mmio + (uar * 4096) + 0x800) + local bf_alt = ffi.cast("uint64_t*", mmio + (uar * 4096) + 0x900) + local cqe = ffi.cast(cqe_t, cq) + + -- Transmit packets from the link onto the send queue. 
+ function self:transmit (l) + local start_wqeid = next_wqeid + while not link.empty(l) and packets[next_packet] == nil do + local p = link.receive(l) + local wqe = swq[next_packet] + packets[next_packet] = p + -- Control segment + wqe.u32[0] = bswap(shl(next_wqeid, 8) + 0x0A) + wqe.u32[1] = bswap(shl(sqn, 8) + 4) + wqe.u32[2] = bswap(shl(2, 2)) -- completion always + -- Ethernet segment + local ninline = 16 + wqe.u32[7] = bswap(shl(ninline, 16)) + ffi.copy(wqe.u8 + 0x1E, p.data, ninline) + -- Send Data Segment (inline data) + wqe.u32[12] = bswap(p.length - ninline) + wqe.u32[13] = bswap(rlkey) + local phy = memory.virtual_to_physical(p.data + ninline) + wqe.u32[14] = bswap(tonumber(phy) / 2^32) + wqe.u32[15] = bswap(tonumber(phy) % 2^32) + -- Advance counters + next_wqeid = (next_wqeid + 1) % 65536 + next_packet = next_wqeid % wqsize + end + -- Ring the doorbell if we enqueued new packets. + if next_wqeid ~= start_wqeid then + local current_packet = (next_packet + wqsize-1) % wqsize + doorbell.send = bswap(next_wqeid) + bf_next[0] = swq[current_packet].u64[0] + -- Switch next/alternate blue flame register for next time + bf_next, bf_alt = bf_alt, bf_next + + end + end + + local next_reclaim = 0 + -- Free packets when their transmission is complete. 
+ function self:reclaim () + local c = cqe[0] + local opcode = cqe.u8[0x38] + local wqeid = shr(bswap(cqe.u32[0x3C/4]), 16) + if opcode == 0x0A then + while next_reclaim ~= wqeid % wqsize do + assert(packets[next_reclaim] ~= nil) + packet.free(packets[next_reclaim]) + packets[next_reclaim] = nil + next_reclaim = (next_reclaim + 1) % wqsize + end + end + end + + return self end NIC_RX = 0 -- Flow table type code for incoming packets @@ -849,6 +1205,10 @@ function HCA:input (name, offset, hi, lo, value) if debug_trace and name then print(("input @ %4xh (%2d:%2d) %-20s = %10xh (%d)"):format(offset, hi, lo, name, value, value)) end + if offset > self.input_size-4 then + error(("input offset out of bounds: %sh > %sh"):format( + bit.tohex(offset, 4), bit.tohex(self.input_size-4, 4))) + end if offset <= 16 - 4 then -- inline self:setbits(0x10 + offset, hi, lo, value) else @@ -959,7 +1319,7 @@ function HCA:execute () assert(self:getbits(0x3C, 0, 0) == 1) self.init_seg:ring_doorbell(0) --post command - + --poll for command completion while self:getbits(0x3C, 0, 0) == 1 do if self.init_seg:getbits(0x1010, 31, 24) ~= 0 then @@ -1026,8 +1386,7 @@ function InitializationSegment:getbits (offset, hi, lo) end function InitializationSegment:setbits (offset, hi, lo, value) - local base = getint(self.ptr, offset) - setint(self.ptr, offset, setbits(hi, lo, value, base)) + setint(self.ptr, offset, setbits(hi, lo, value, 0)) end function InitializationSegment:fw_rev () --maj, min, subminor @@ -1131,10 +1490,6 @@ function hexdump (pointer, index, bytes, dumpoffset) return dumpoffset + bytes end -function trace (...) - print("TRACE", ...) -end - -- Utilities for peeking and poking bitfields of 32-bit big-endian integers. -- Pointers are uint32_t* and offsets are in bytes. 
@@ -1142,7 +1497,6 @@ end function getint (pointer, offset) assert(offset % 4 == 0, "offset not dword-aligned") local r = bswap(pointer[offset/4]) - --print("getint", pointer, offset, r, bit.tohex(r)) return r end @@ -1185,15 +1539,61 @@ end function selftest () io.stdout:setvbuf'no' - local pcidev = lib.getenv("SNABB_PCI_CONNECTX4_0") + local pcidev0 = lib.getenv("SNABB_PCI_CONNECTX4_0") + local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX4_1") -- XXX check PCI device type - if not pcidev then + if not pcidev0 then print("SNABB_PCI_CONNECTX4_0 not set") os.exit(engine.test_skipped_code) end + if not pcidev1 then + print("SNABB_PCI_CONNECTX4_1 not set") + os.exit(engine.test_skipped_code) + end + + local nic0 = ConnectX4:new{pciaddress = pcidev0, queues = {'a'}} + local nic1 = ConnectX4:new{pciaddress = pcidev1, queues = {'b'}} + local io0 = IO:new({pciaddress = pcidev0, queues = {'a'}}) + local io1 = IO:new({pciaddress = pcidev1, queues = {'b'}}) + io0.input = { input = link.new('input0') } + io0.output = { output = link.new('output0') } + io1.input = { input = link.new('input1') } + io1.output = { output = link.new('output1') } + + print("selftest: waiting for both links up") + while (nic0.hca:query_vport_state().oper_state ~= 1) or + (nic1.hca:query_vport_state().oper_state ~= 1) do + C.usleep(1e6) + end + + local bursts = 100000 + local each = 100 + print(("Links up. 
Sending %s packets."):format(lib.comma_value(each*bursts))) + + for i = 1, bursts do + for _, app in ipairs({io0, io1}) do + for i = 1, each do + local p = packet.allocate() + ffi.fill(p.data, 16, 0xff) + p.length = 100 + link.transmit(app.input.input, p) + end + app:pull() + app:push() + end + end + + print() + print("NIC0") + nic0:print_vport_counter() + + print() + print("NIC1") + nic1:print_vport_counter() + + nic0:stop() + nic1:stop() - local device_info = pci.device_info(pcidev) - local app = ConnectX4:new{pciaddress = pcidev} - app:stop() + print("selftest: complete") end From f08da3c33118a2467cc840769dc7cfdde94ece5b Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Mon, 7 Nov 2016 08:58:57 +0000 Subject: [PATCH 026/209] connectx4: IO app now has only 1xSQ and 1xRQ Previously a single IO app instance could be supported with multiple send and receive queues. The intention was to share traffic amongst the queues in order to overcome NIC hardware limitations on per-queue packet rate. The problem with that approach is that splitting traffic across queues potentially changes the relative ordering of packets and this potentially creates problems for the user. Better to avoid "DWIM" and have the user arrange multiple IO apps in an application-appropriate way if indeed they do need to overcome the ~16Mpps per-queue limit on the ConnectX-4 (100G). 
--- src/apps/mellanox/connectx4.lua | 71 +++++++++++++-------------------- 1 file changed, 28 insertions(+), 43 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index b2279d7180..3f4584a2cc 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -740,62 +740,47 @@ function IO:new (conf) local pciaddress = pci.qualified(conf.pciaddress) local mmio, fd = pci.map_pci_memory(pciaddress, 0, false) - self.sqlist = {} - self.rqlist = {} + local queue = conf.queue - for _, queuename in ipairs(conf.queues) do - - local basepath = "/pci/"..pciaddress.."/"..queuename - local sendpath = basepath.."/send" - local recvpath = basepath.."/recv" + local basepath = "/pci/"..pciaddress.."/"..queue + local sendpath = basepath.."/send" + local recvpath = basepath.."/recv" - local send = shm.open_frame(sendpath) - local recv = shm.open_frame(recvpath) - - local sq = SQ:new(tonumber(counter.read(send.sqn)), - counter.read(send.wq), - tonumber(counter.read(send.wqsize)), - counter.read(send.doorbell), - mmio, - tonumber(counter.read(send.uar_page)), - tonumber(counter.read(send.rlkey)), - counter.read(send.cqe)) - local rq = RQ:new(counter.read(recv.rqn), - counter.read(recv.wq), - tonumber(counter.read(recv.wqsize)), - counter.read(recv.doorbell), - tonumber(counter.read(recv.rlkey)), - counter.read(recv.cqe)) - rq:refill() - table.insert(self.sqlist, sq) - table.insert(self.rqlist, rq) - end + local send = shm.open_frame(sendpath) + local recv = shm.open_frame(recvpath) + + self.sq = SQ:new(tonumber(counter.read(send.sqn)), + counter.read(send.wq), + tonumber(counter.read(send.wqsize)), + counter.read(send.doorbell), + mmio, + tonumber(counter.read(send.uar_page)), + tonumber(counter.read(send.rlkey)), + counter.read(send.cqe)) + self.rq = RQ:new(counter.read(recv.rqn), + counter.read(recv.wq), + tonumber(counter.read(recv.wqsize)), + counter.read(recv.doorbell), + tonumber(counter.read(recv.rlkey)), + 
counter.read(recv.cqe)) return self end function IO:push () local l = self.input.input if l == nil then return end - while l and not link.empty(l) do - local sq = self.sqlist[1] - sq:transmit(l) - sq:reclaim() - end + self.sq:transmit(l) + self.sq:reclaim() end function IO:pull () -- Free transmitted packets - for q = 1, #self.sqlist do - self.sqlist[q]:reclaim() - end + self.sq:reclaim() -- Input received packets local l = self.output.output if l == nil then return end - for q = 1, #self.rqlist do - --self.rqlist[q]:enqueue(packet.allocate()) - self.rqlist[q]:ring_doorbell() - self.rqlist[q]:receive(l) - end + self.rq:ring_doorbell() + self.rq:receive(l) end --------------------------------------------------------------- @@ -1553,8 +1538,8 @@ function selftest () local nic0 = ConnectX4:new{pciaddress = pcidev0, queues = {'a'}} local nic1 = ConnectX4:new{pciaddress = pcidev1, queues = {'b'}} - local io0 = IO:new({pciaddress = pcidev0, queues = {'a'}}) - local io1 = IO:new({pciaddress = pcidev1, queues = {'b'}}) + local io0 = IO:new({pciaddress = pcidev0, queue = 'a'}) + local io1 = IO:new({pciaddress = pcidev1, queue = 'b'}) io0.input = { input = link.new('input0') } io0.output = { output = link.new('output0') } io1.input = { input = link.new('input1') } io1.output = { output = link.new('output1') } From ffe4a22be3f5877fd03cf4fb1db8faf1b95ed934 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 8 Nov 2016 09:30:23 +0000 Subject: [PATCH 027/209] connectx4: Add 'mtu' parameter (default 9500) Now also explicitly sets promiscuous mode on the ethernet port. 
--- src/apps/mellanox/connectx4.lua | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 3f4584a2cc..a2493a6995 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -71,6 +71,8 @@ function ConnectX4:new (conf) local sendq_size = conf.sendq_size or 1024 local recvq_size = conf.recvq_size or 1024 + local mtu = conf.mtu or 9500 + -- Perform a hard reset of the device to bring it into a blank state. -- -- Reset is performed at PCI level instead of via firmware command. @@ -106,6 +108,8 @@ function ConnectX4:new (conf) if debug_trace then self:check_vport() end + hca:modify_nic_vport_context(mtu, true, true, true) + -- Create basic objects that we need -- local uar = hca:alloc_uar() @@ -552,6 +556,17 @@ function HCA:query_nic_vport_context () permanent_address = mac_hex } end +function HCA:modify_nic_vport_context (mtu, promisc_uc, promisc_mc, promisc_all) + self:command("MODIFY_NIC_VPORT_CONTEXT", 0x1FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x755) + :input("field_select", 0x0C, 31, 0, 0x50) -- MTU + promisc + :input("mtu", 0x100 + 0x24, 15, 0, mtu) + :input("promisc_uc", 0x100 + 0xF0, 31, 31, promisc_uc and 1 or 0) + :input("promisc_mc", 0x100 + 0xF0, 30, 30, promisc_mc and 1 or 0) + :input("promisc_all", 0x100 + 0xF0, 29, 29, promisc_all and 1 or 0) + :execute() +end + --------------------------------------------------------------- -- TIR and TIS --------------------------------------------------------------- From 06c43f034cb7bc1dc13061362bfc04af6996f448 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 8 Nov 2016 09:32:10 +0000 Subject: [PATCH 028/209] connectx4: Cleaner (simplistic) selftest method The selftest function, while still very simplistic, now presents its output more neatly and checks that counter values match expectations. The test now uses unicast traffic (need to add additional tests to check multicast and broadcast and ...). 
Example output: selftest: waiting for both links up Links up. Sending 10,000,000 packets. hardware counter 02:00.0 82:00.0 ---------------- -------------------- -------------------- rx_bcast_octets 0 0 rx_bcast_packets 0 0 rx_error_octets 0 0 rx_error_packets 0 0 rx_mcast_octets 0 0 rx_mcast_packets 0 0 rx_ucast_octets 1,000,000,000 1,000,000,000 rx_ucast_packets 10,000,000 10,000,000 tx_bcast_octets 0 0 tx_bcast_packets 0 0 tx_error_octets 0 0 tx_error_packets 0 0 tx_mcast_octets 0 0 tx_mcast_packets 0 0 tx_ucast_octets 1,000,000,000 1,000,000,000 tx_ucast_packets 10,000,000 10,000,000 selftest: ok --- src/apps/mellanox/connectx4.lua | 35 ++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index a2493a6995..e8f8bf6156 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -965,7 +965,6 @@ function SQ:new (sqn, swq, wqsize, doorbell, mmio, uar, rlkey, cq) bf_next[0] = swq[current_packet].u64[0] -- Switch next/alternate blue flame register for next time bf_next, bf_alt = bf_alt, bf_next - end end @@ -1566,16 +1565,18 @@ function selftest () C.usleep(1e6) end - local bursts = 100000 - local each = 100 + local bursts = 10000 + local each = 1000 + local octets = 100 print(("Links up. 
Sending %s packets."):format(lib.comma_value(each*bursts))) for i = 1, bursts do for _, app in ipairs({io0, io1}) do for i = 1, each do local p = packet.allocate() - ffi.fill(p.data, 16, 0xff) - p.length = 100 + ffi.fill(p.data, octets, 0) -- zero packet + p.data[12] = 0x08 -- ethertype = 0x0800 + p.length = octets link.transmit(app.input.input, p) end app:pull() @@ -1584,16 +1585,28 @@ function selftest () end print() - print("NIC0") - nic0:print_vport_counter() + print(("%-16s %20s %20s"):format("hardware counter", pcidev0, pcidev1)) + print("---------------- -------------------- --------------------") - print() - print("NIC1") - nic1:print_vport_counter() + local stat0 = nic0.hca:query_vport_counter() + local stat1 = nic1.hca:query_vport_counter() + + -- Sort into key order + local t = {} + for k in pairs(stat0) do table.insert(t, k) end + table.sort(t) + for _, k in pairs(t) do + print(("%-16s %20s %20s"):format(k, lib.comma_value(stat0[k]), lib.comma_value(stat1[k]))) + end nic0:stop() nic1:stop() - print("selftest: complete") + if (stat0.tx_ucast_packets == bursts*each and stat0.tx_ucast_octets == bursts*each*octets and + stat1.tx_ucast_packets == bursts*each and stat1.tx_ucast_octets == bursts*each*octets) then + print("selftest: ok") + else + error("selftest failed: unexpected counter values") + end end From e367b6b69c15c784843b53d4fa53880fc1ced7dc Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 22 Nov 2016 08:43:11 +0000 Subject: [PATCH 029/209] connectx4: Fix recv completion queue wrap-around Enabled "overflow ignore" on the receive completion queue so that we do not need to acknowledge completion entries. This should be safe because the completion queue and the receive queue are the same size i.e. each receive queue entry will have a separate completion entry with no collisions. 
--- src/apps/mellanox/connectx4.lua | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index e8f8bf6156..6a4ceaaa34 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -655,7 +655,7 @@ function HCA:create_cq (entries, uar_page, eqn, collapsed) self:command("CREATE_CQ", 0x114, 0x0C) :input("opcode", 0x00, 31, 16, 0x400) :input("cc", 0x10 + 0x00, 20, 20, collapsed and 1 or 0) - :input("oi", 0x10 + 0x00, 17, 17, collapsed and 1 or 0) + :input("oi", 0x10 + 0x00, 17, 17, 1) :input("log_cq_size", 0x10 + 0x0C, 28, 24, log2size(entries)) :input("uar_page", 0x10 + 0x0C, 23, 0, uar_page) :input("c_eqn", 0x10 + 0x14, 7, 0, eqn) @@ -794,6 +794,7 @@ function IO:pull () -- Input received packets local l = self.output.output if l == nil then return end + self.rq:refill() self.rq:ring_doorbell() self.rq:receive(l) end @@ -857,7 +858,7 @@ function RQ:new (rqn, rwq, wqsize, doorbell, rlkey, cq) end function self:receive (l) - while not link.full(l) do + while true do -- Find the next completion entry. local c = cqe[next_completion] local owner = bit.band(1, c.u8[0x3F]) @@ -866,7 +867,8 @@ function RQ:new (rqn, rwq, wqsize, doorbell, rlkey, cq) break end -- Advance to next completion. - next_completion = (next_completion + 1) % wqsize -- XXX cqsize + -- Note: assumes sqsize == cqsize + next_completion = (next_completion + 1) % wqsize -- Toggle the ownership value if the CQ wraps around. 
if next_completion == 0 then mine = (mine + 1) % 2 From 48f608084af3c70fe117538b3aa780677ef30e63 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Tue, 10 Jul 2018 11:40:05 +0200 Subject: [PATCH 030/209] Cherry-picked apps.mellanox from lukego/debug100g --- src/apps/mellanox/connectx4.lua | 680 +++++++++++++++++++--------- src/apps/mellanox/connectx_test.lua | 180 ++++++++ 2 files changed, 636 insertions(+), 224 deletions(-) create mode 100644 src/apps/mellanox/connectx_test.lua diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 6a4ceaaa34..b9ef71f1a4 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -43,8 +43,11 @@ local counter = require("core.counter") local bits, bitset = lib.bits, lib.bitset local floor = math.floor local cast = ffi.cast +local ethernet = require("lib.protocol.ethernet") + local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot +local cast, typeof = ffi.cast, ffi.typeof local debug_trace = false -- Print trace messages local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) @@ -54,6 +57,116 @@ local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) -- alternatively detect from query_hca_cap. local rqt_max_size = 128 +--------------------------------------------------------------- +-- CXQ (ConnectX Queue pair) control object: +-- +-- A "CXQ" is an object that we define to represent a transmit/receive pair. +-- +-- CXQs are created and deleted by a "Control" app and, in between, +-- they are used by "IO" apps to send and receive packets. +-- +-- The lifecycle of a CXQ is managed using a state machine. This is +-- necessary because we allow Control and IO apps to start in any +-- order, for Control and IO apps to start/stop/restart independently, +-- for multiple IO apps to attempt to attach to the same CXQ, and even +-- for apps to stop in one Snabb process and be started in another +-- one. 
+-- +-- (This design may turn out to be overkill if we discover in the +-- future that we do not need this much flexibility. Time will tell.) +--------------------------------------------------------------- + +-- CXQs can be in one of four states: +-- FREE: CXQ is ready and available for use by an IO app. +-- IDLE: CXQ is owned by an app, but not actively processing right now. +-- BUSY: CXQ is owned by an app and is currently processing (e.g. push/pull). +-- DEAD: CXQ has been deallocated; IO app must try to open a new one. +-- +-- Once a CXQ is closed it stays in the DEAD state forever. However, a +-- replacement CXQ with the same name can be created and existing IO +-- apps can reattach to that instead. This will rerun the state machine. +-- +-- Here are the valid state transitions & when they occur: +-- +-- App Change Why +-- ---- ----------- -------------------------------------------------------- +-- CTRL none->BUSY: Control app starts initialization. +-- CTRL BUSY->FREE: Control app completes initialization. +-- IO FREE->IDLE: IO app starts and becomes owner of the CXQ. +-- IO IDLE->FREE: IO app stops and releases the CXQ for future use. +-- IO IDLE->BUSY: IO app starts running a pull/push method. +-- IO BUSY->IDLE: IO app stops running a pull/push method. +-- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) +-- +-- These state transitions are *PROHIBITED* for important reasons: +-- +-- App Change Why *PROHIBITED* +-- ------ ----------- -------------------------------------------------------- +-- CTRL BUSY->DEAD Cannot close a CXQ while it is busy (must wait.) +-- IO DEAD->BUSY Cannot use a CXQ that is closed (must check.) +-- * DEAD->* Cannot transition from DEAD (must create new CXQ.) +-- +-- Further notes: +-- +-- Packet buffers for pending DMA (transmit or receive) are freed by +-- the Control app (which can disable DMA first) rather than by the IO +-- app (which shuts down with DMA still active.) 
+ +-- A CXQ is represented by one struct allocated in shared memory. +-- +-- The struct defines the fields in very specific terms so that it can +-- be used directly by the driver code (rather than copying back and +-- forth between the shared memory object and a separate native +-- format.) +local cxq_t = ffi.typeof([[ + struct { + uint32_t state; // current state / availability + + // configuration information: + uint32_t sqn; // send queue number + uint32_t sqsize; // send queue size + uint32_t uar; // user access region + uint32_t rlkey; // rlkey for value + uint32_t rqn; // receive queue number + uint32_t rqsize; // receive queue size + + // DMA structures: + // doorbell contains send/receive ring cursor positions + struct { uint32_t receive, send; } *doorbell; + + // receive work queue + struct { uint32_t length, lkey, dma_hi, dma_lo; } *rwq; + + // send work queue and send/receive completion queues + union { uint8_t u8[64]; uint32_t u32[0]; uint64_t u64[0];} *swq, *scq, *rcq; + + // Transmit state + struct packet *tx[64*1024]; // packets queued for transmit + uint16_t next_tx_wqeid; // work queue ID for next transmit descriptor + uint64_t *bf_next, *bf_alt; // "blue flame" to ring doorbell (alternating) + + // Receive state + struct packet *rx[64*1024]; // packets queued for receive + uint16_t next_rx_wqeid; // work queue ID for next receive descriptor + uint16_t next_rx_cqeid; // completion queue ID of next completed packet + int rx_mine; // CQE ownership value that means software-owned + } +]]) + +-- CXQ states: +local BUSY = 0 -- Implicit initial state due to 0 value. +local IDLE = 1 +local FREE = 2 +local DEAD = 3 + +-- Transition from oldstate to newstate. +-- Returns true on successful transition, false if oldstate does not match. +function transition (cxq, oldstate, newstate) + -- XXX use atomic x86 "LOCK CMPXCHG" instruction. Have to teach DynASM. 
+ cxq.state = newstate + return true +end + --------------------------------------------------------------- -- ConnectX4 Snabb app. -- @@ -71,6 +184,14 @@ function ConnectX4:new (conf) local sendq_size = conf.sendq_size or 1024 local recvq_size = conf.recvq_size or 1024 + -- XXX Config says whether to setup queues with MAC+VLAN + -- dispatching ("VMDq") or to simply hash uniformly over them ("RSS"). + -- + -- To be replaced with a more generic algorithm that looks at the + -- configurations of the individual ports and creates an + -- appropriate flow table. + local macvlan = conf.macvlan + local mtu = conf.mtu or 9500 -- Perform a hard reset of the device to bring it into a blank state. @@ -108,6 +229,7 @@ function ConnectX4:new (conf) if debug_trace then self:check_vport() end + hca:set_port_mtu(mtu) hca:modify_nic_vport_context(mtu, true, true, true) -- Create basic objects that we need @@ -118,75 +240,63 @@ function ConnectX4:new (conf) local tdomain = hca:alloc_transport_domain() local rlkey = hca:query_rlkey() - -- Create send and receive queues & associated objects - -- - local tis = hca:create_tis(0, tdomain) -- List of all receive queues for hashing traffic across local rqlist = {} - - for _, queuename in ipairs(conf.queues) do - - local send_cq = hca:create_cq(1, uar, eq.eqn, true) - local recv_cq = hca:create_cq(recvq_size, uar, eq.eqn, false) - - -- Allocate work queue memory (receive & send contiguous in memory) - local wq_doorbell = memory.dma_alloc(16) - local sendq_size = 1024 - local recvq_size = 1024 + local rqs = {} + + local usevlan = false + + for _, queue in ipairs(conf.queues) do + -- Create a shared memory object for controlling the queue pair + local cxq = shm.create("group/pci/"..pciaddress.."/"..queue.id, cxq_t) + + cxq.rlkey = rlkey + cxq.sqsize = sendq_size + cxq.rqsize = recvq_size + cxq.uar = uar + local scqn, scqe = hca:create_cq(1, uar, eq.eqn, true) + local rcqn, rcqe = hca:create_cq(recvq_size, uar, eq.eqn, false) + cxq.scq = 
cast(typeof(cxq.scq), scqe) + cxq.rcq = cast(typeof(cxq.rcq), rcqe) + cxq.doorbell = cast(typeof(cxq.doorbell), memory.dma_alloc(16)) local workqueues = memory.dma_alloc(64 * (sendq_size + recvq_size), 4096) - local rwq = workqueues -- receive work queue - local swq = workqueues + 64 * recvq_size -- send work queue - + cxq.rwq = cast(ffi.typeof(cxq.rwq), workqueues) + cxq.swq = cast(ffi.typeof(cxq.swq), workqueues + 64 * recvq_size) -- Create the queue objects - local sqn = hca:create_sq(send_cq, pd, sendq_size, wq_doorbell, swq, uar, tis) - hca:modify_sq(sqn, 0, 1) -- RESET -> READY - local rqn = hca:create_rq(recv_cq, pd, recvq_size, wq_doorbell, rwq) - hca:modify_rq(rqn, 0, 1) -- RESET -> READY - - table.insert(rqlist, rqn) - - -- Create shared memory objects containing all of the - -- information needed to access the send and receive queues. - -- - -- Snabb processes will use this information to take ownership - -- of the queue to send and receive packets. - local basepath = "/pci/"..pciaddress.."/"..queuename - local sendpath = basepath.."/send" - local recvpath = basepath.."/recv" - local u64 = function (x) return ffi.cast("uint64_t", x) end - shm.create_frame(sendpath, - {lock = {counter}, - sqn = {counter, sqn}, - wq = {counter, u64(swq)}, - wqsize = {counter, sendq_size}, - cqn = {counter, send_cq.cqn}, - cqe = {counter, u64(send_cq.cqe)}, - doorbell = {counter, u64(wq_doorbell)}, - uar_page = {counter, uar}, - rlkey = {counter, rlkey}}) - shm.create_frame(recvpath, - {lock = {counter}, - rqn = {counter, rqn}, - wq = {counter, u64(rwq)}, - wqsize = {counter, recvq_size}, - cqn = {counter, recv_cq.cqn}, - cqe = {counter, u64(recv_cq.cqe)}, - doorbell = {counter, u64(wq_doorbell)}, - uar_page = {counter, uar}, - rlkey = {counter, rlkey}}) - end + local tis = hca:create_tis(0, tdomain) + -- XXX order check + cxq.sqn = hca:create_sq(scqn, pd, sendq_size, cxq.doorbell, cxq.swq, uar, tis) + cxq.rqn = hca:create_rq(rcqn, pd, recvq_size, cxq.doorbell, cxq.rwq) + 
hca:modify_sq(cxq.sqn, 0, 1) -- RESET -> READY + hca:modify_rq(cxq.rqn, 0, 1) -- RESET -> READY - --local tir = hca:create_tir_direct(rqlist[1], tdomain) - local rqt = hca:create_rqt(rqlist) - local tir = hca:create_tir_indirect(rqt, tdomain) + -- CXQ is now fully initialized & ready for attach. + assert(transition(cxq, BUSY, FREE)) - -- Setup packet dispatching. - -- Just a "wildcard" flow group to send RX packets to the receive queue. - -- - local rx_flow_table_id = hca:create_root_flow_table(NIC_RX) - local flow_group_id = hca:create_flow_group_wildcard(rx_flow_table_id, NIC_RX, 0, 0) - hca:set_flow_table_entry_wildcard(rx_flow_table_id, NIC_RX, flow_group_id, 0, tir) - hca:set_flow_table_root(rx_flow_table_id, NIC_RX) + usevlan = usevlan or (queue.vlan ~= nil) + + -- XXX collect for flow table construction + rqs[queue.id] = cxq.rqn + rqlist[#rqlist+1] = cxq.rqn + end + + local rxtable = hca:create_root_flow_table(NIC_RX) + local rule = 0 + if macvlan then + local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, #conf.queues-1, usevlan) + for _, queue in ipairs(conf.queues) do + local tir = hca:create_tir_direct(rqs[queue.id], tdomain) + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, rule, tir, + ethernet:ptoi(queue.mac), queue.vlan) + rule = rule + 1 + end + else + local rqt = hca:create_rqt(rqlist) + local flow_group_id = hca:create_flow_group_wildcard(rxtable, NIC_RX, 0, 0) + local tir = hca:create_tir_indirect(rqt, tdomain) + hca:set_flow_table_entry_wildcard(rxtable, NIC_RX, flow_group_id, 0, tir) + end + hca:set_flow_table_root(rxtable, NIC_RX) function self:stop () pci.set_bus_master(pciaddress, false) @@ -448,7 +558,7 @@ function HCA:create_eq (uar) end -- Event Queue Entry (EQE) -local eqe_t = ffi.typeof[[ +local eqe_t = ffi.typeof([[ struct { uint16_t event_type; uint16_t event_sub_type; @@ -456,8 +566,7 @@ local eqe_t = ffi.typeof[[ uint16_t pad; uint8_t signature; uint8_t owner; - } - ]] + } ]] ) eq = {} eq.__index 
= eq @@ -512,6 +621,11 @@ function HCA:query_vport_state () oper_state = self:output(0x0C, 3, 0) } end +-- Convenience function +function HCA:linkup () + return self:query_vport_state().oper_state == 1 +end + function HCA:query_vport_counter () self:command("QUERY_VPORT_COUNTER", 0x1c, 0x20c) :input("opcode", 0x00, 31, 16, 0x770) @@ -666,12 +780,12 @@ function HCA:create_cq (entries, uar_page, eqn, collapsed) :input("pas[0] low", 0x114, 31, 0, ptrbits(cqe_phy, 31, 0)) :execute() local cqn = self:output(0x08, 23, 0) - return { cqn = cqn, doorbell = doorbell, cqe = cqe } + return cqn, cqe end -- Create a receive queue and return a receive queue object. -- Return the receive queue number and a pointer to the WQEs. -function HCA:create_rq (cq, pd, size, doorbell, rwq) +function HCA:create_rq (cqn, pd, size, doorbell, rwq) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local rwq_phy = memory.virtual_to_physical(rwq) @@ -679,7 +793,7 @@ function HCA:create_rq (cq, pd, size, doorbell, rwq) :input("opcode", 0x00, 31, 16, 0x908) :input("rlkey", 0x20 + 0x00, 31, 31, 1) :input("vlan_strip_disable", 0x20 + 0x00, 28, 28, 1) - :input("cqn", 0x20 + 0x08, 23, 0, cq.cqn) + :input("cqn", 0x20 + 0x08, 23, 0, cqn) :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd) :input("dbr_addr high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) @@ -715,7 +829,7 @@ end -- Create a Send Queue. -- Return the send queue number and a pointer to the WQEs. 
-function HCA:create_sq (cq, pd, size, doorbell, swq, uar, tis) +function HCA:create_sq (cqn, pd, size, doorbell, swq, uar, tis) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local swq_phy = memory.virtual_to_physical(swq) @@ -725,7 +839,7 @@ function HCA:create_sq (cq, pd, size, doorbell, swq, uar, tis) :input("fre", 0x20 + 0x00, 29, 29, 1) :input("flush_in_error_en", 0x20 + 0x00, 28, 28, 1) :input("min_wqe_inline_mode", 0x20 + 0x00, 26, 24, 1) - :input("cqn", 0x20 + 0x08, 23, 0, cq.cqn) + :input("cqn", 0x20 + 0x08, 23, 0, cqn) :input("tis_lst_sz", 0x20 + 0x20, 31, 16, 1) :input("tis", 0x20 + 0x2C, 23, 0, tis) :input("wq_type", 0x20 + 0x30 + 0x00, 31, 28, 1) -- cyclic @@ -752,51 +866,79 @@ IO.__index = IO function IO:new (conf) local self = setmetatable({}, self) + local pciaddress = pci.qualified(conf.pciaddress) + local queue = conf.queue local mmio, fd = pci.map_pci_memory(pciaddress, 0, false) - local queue = conf.queue + local online = false -- True when queue is up and running + local cxq -- shm object containing queue control information + local sq -- SQ send queue object + local rq -- RQ receive queue object + local open_throttle = -- Timer to throttle shm open attempts (10ms) + lib.throttle(0.25) + + -- Close the queue mapping. 
+ local function close () + shm.unmap(cxq) + cxq = nil + end - local basepath = "/pci/"..pciaddress.."/"..queue - local sendpath = basepath.."/send" - local recvpath = basepath.."/recv" - - local send = shm.open_frame(sendpath) - local recv = shm.open_frame(recvpath) - - self.sq = SQ:new(tonumber(counter.read(send.sqn)), - counter.read(send.wq), - tonumber(counter.read(send.wqsize)), - counter.read(send.doorbell), - mmio, - tonumber(counter.read(send.uar_page)), - tonumber(counter.read(send.rlkey)), - counter.read(send.cqe)) - self.rq = RQ:new(counter.read(recv.rqn), - counter.read(recv.wq), - tonumber(counter.read(recv.wqsize)), - counter.read(recv.doorbell), - tonumber(counter.read(recv.rlkey)), - counter.read(recv.cqe)) - return self -end + -- Open the queue mapping. + local function open () + local shmpath = "group/pci/"..pciaddress.."/"..queue + if shm.exists(shmpath) then + cxq = shm.open(shmpath, cxq_t) + if transition(cxq, FREE, IDLE) then + sq = SQ:new(cxq, mmio) + rq = RQ:new(cxq) + else + close() -- Queue was not FREE. + end + end + end -function IO:push () - local l = self.input.input - if l == nil then return end - self.sq:transmit(l) - self.sq:reclaim() -end + -- Return true on successful activation of the queue. + local function activate () + -- If not open then make a request on a regular schedule. + if cxq == nil and open_throttle() then + open() + end + if cxq then + -- Careful: Control app may have closed the CXQ. + if transition(cxq, IDLE, BUSY) then + return true + else + assert(cxq.state == DEAD, "illegal state detected") + close() + end + end + end + + -- Enter the idle state. 
+ local function deactivate () + assert(transition(cxq, BUSY, IDLE)) + end + + -- Send packets to the NIC + function self:push () + if activate() then + sq:transmit(self.input.input or self.input.rx) + sq:reclaim() + deactivate() + end + end -function IO:pull () - -- Free transmitted packets - self.sq:reclaim() - -- Input received packets - local l = self.output.output - if l == nil then return end - self.rq:refill() - self.rq:ring_doorbell() - self.rq:receive(l) + -- Receive packets from the NIC. + function self:pull () + if activate() then + rq:receive(self.output.output or self.output.tx) + rq:refill() + deactivate() + end + end + + return self end --------------------------------------------------------------- @@ -810,7 +952,7 @@ local wqe_t = ffi.typeof[[ uint32_t u32[0]; uint64_t u64[0]; } * -]] + ]] -- CQEs are similar to WQEs. local cqe_t = wqe_t @@ -830,89 +972,100 @@ local rwqe_t = ffi.typeof[[ } * ]] -function RQ:new (rqn, rwq, wqsize, doorbell, rlkey, cq) - local self = {} - -- Convert arguments to internal types - doorbell = ffi.cast(doorbell_t, doorbell) - rwq = ffi.cast(rwqe_t, rwq) - cqe = ffi.cast(cqe_t, cq) - -- Additional state - local packets = ffi.new("struct packet *[?]", wqsize) - local next_buffer = 0 -- next position for a buffer in wqe - local next_completion = 0 -- next completion queue position to process - local mine = 0 -- cqe ownership bit meaning software-owned +function RQ:new (cxq) + local rq = {} + + local mask = cxq.rqsize - 1 + -- Return the transmit queue slot for the given WQE ID. + local function slot (wqeid) + return band(wqeid, mask) + end -- Refill with buffers - function self:refill () - while packets[next_buffer % wqsize] == nil do + function rq:refill () + local notify = false -- have to notify NIC with doorbell ring? 
+ while cxq.rx[slot(cxq.next_rx_wqeid)] == nil do local p = packet.allocate() - packets[next_buffer % wqsize] = p - local rwqe = rwq[next_buffer % wqsize] + cxq.rx[slot(cxq.next_rx_wqeid)] = p + local rwqe = cxq.rwq[slot(cxq.next_rx_wqeid)] local phy = memory.virtual_to_physical(p.data) rwqe.length = bswap(packet.max_payload) - rwqe.lkey = bswap(rlkey) - rwqe.address_high = bswap(tonumber(shr(phy, 32))) - rwqe.address_low = bswap(tonumber(band(phy, 0xFFFFFFFF))) - next_buffer = (next_buffer + 1) % 65536 + rwqe.lkey = bswap(cxq.rlkey) + rwqe.dma_hi = bswap(tonumber(shr(phy, 32))) + rwqe.dma_lo = bswap(tonumber(band(phy, 0xFFFFFFFF))) + cxq.next_rx_wqeid = cxq.next_rx_wqeid + 1 + notify = true + end + if notify then + -- ring doorbell + cxq.doorbell.receive = bswap(cxq.next_rx_wqeid) end end - function self:receive (l) - while true do - -- Find the next completion entry. - local c = cqe[next_completion] - local owner = bit.band(1, c.u8[0x3F]) - if owner ~= mine then - -- Completion entry is not available yet. - break - end - -- Advance to next completion. - -- Note: assumes sqsize == cqsize - next_completion = (next_completion + 1) % wqsize - -- Toggle the ownership value if the CQ wraps around. - if next_completion == 0 then - mine = (mine + 1) % 2 - end - -- Decode the completion entry. - local opcode = shr(c.u8[0x3F], 4) - local len = bswap(c.u32[0x2C/4]) - local wqe = shr(bswap(c.u32[0x3C/4]), 16) - local idx = wqe % wqsize - if opcode == 0 or opcode == 2 then - -- Successful transmission. 
- assert(packets[idx] ~= nil) - link.transmit(l, packets[idx]) - packets[idx] = nil - elseif opcode == 13 or opcode == 14 then - local syndromes = { - [0x1] = "Local_Length_Error", - [0x4] = "Local_Protection_Error", - [0x5] = "Work_Request_Flushed_Error", - [0x6] = "Memory_Window_Bind_Error", - [0x10] = "Bad_Response_Error", - [0x11] = "Local_Access_Error", - [0x12] = "Remote_Invalid_Request_Error", - [0x13] = "Remote_Access_Error", - [0x14] = "Remote_Operation_Error" - } - local syndrome = c.u8[0x37] - print(("Got error. opcode=%d syndrome=0x%x message=%s"):format( - opcode, syndrome, syndromes[syndromes])) -- XXX - -- Error on transmission. - assert(packets[idx] ~= nil) - packet.free(packets[idx]) - packets[idx] = nil - else - error(("Unexpected CQE opcode: %d (0x%x)"):format(opcode, opcode)) + local function have_input () + local c = cxq.rcq[cxq.next_rx_cqeid] + local owner = bit.band(1, c.u8[0x3F]) + return owner == cxq.rx_mine + end + + function rq:receive (l) + if have_input() then + local limit = engine.pull_npackets + while limit > 0 and not link.full(l) do + -- Find the next completion entry. + local c = cxq.rcq[cxq.next_rx_cqeid] + local owner = bit.band(1, c.u8[0x3F]) + limit = limit - 1 + -- Advance to next completion. + -- Note: assumes sqsize == cqsize + cxq.next_rx_cqeid = slot(cxq.next_rx_cqeid + 1) + -- Toggle the ownership value if the CQ wraps around. + if cxq.next_rx_cqeid == 0 then + cxq.rx_mine = (cxq.rx_mine + 1) % 2 + end + -- Decode the completion entry. 
+ local opcode = shr(c.u8[0x3F], 4) + local len = bswap(c.u32[0x2C/4]) + local wqeid = shr(bswap(c.u32[0x3C/4]), 16) + local idx = slot(wqeid) + if opcode == 0 or opcode == 2 then + -- Successful receive + local p = cxq.rx[idx] + assert(p ~= nil) + p.length = len + link.transmit(l, p) + cxq.rx[idx] = nil + elseif opcode == 13 or opcode == 14 then + local syndromes = { + [0x1] = "Local_Length_Error", + [0x4] = "Local_Protection_Error", + [0x5] = "Work_Request_Flushed_Error", + [0x6] = "Memory_Window_Bind_Error", + [0x10] = "Bad_Response_Error", + [0x11] = "Local_Access_Error", + [0x12] = "Remote_Invalid_Request_Error", + [0x13] = "Remote_Access_Error", + [0x14] = "Remote_Operation_Error" + } + local syndrome = c.u8[0x37] + print(("Got error. opcode=%d syndrome=0x%x message=%s"):format( + opcode, syndrome, syndromes[syndromes])) -- XXX + -- Error on receive + assert(packets[idx] ~= nil) + packet.free(packets[idx]) + packets[idx] = nil + else + error(("Unexpected CQE opcode: %d (0x%x)"):format(opcode, opcode)) + end end end end - function self:ring_doorbell () + function rq:ring_doorbell () doorbell[0].receive = bswap(next_buffer) end - return self + return rq end --------------------------------------------------------------- @@ -920,31 +1073,37 @@ end SQ = {} -function SQ:new (sqn, swq, wqsize, doorbell, mmio, uar, rlkey, cq) - local self = {} +function SQ:new (cxq, mmio) + local sq = {} -- Cast pointers to expected types - mmio = ffi.cast("uint8_t*", mmio) - swq = ffi.cast(wqe_t, swq) - doorbell = ffi.cast(doorbell_t, doorbell) - -- Additional state - local packets = ffi.new("struct packet *[?]", wqsize) - local next_packet = 0 - local next_wqeid = 0 - -- Locate "blue flame" register areas for the UAR page - local bf_next = ffi.cast("uint64_t*", mmio + (uar * 4096) + 0x800) - local bf_alt = ffi.cast("uint64_t*", mmio + (uar * 4096) + 0x900) - local cqe = ffi.cast(cqe_t, cq) + local mmio = ffi.cast("uint8_t*", mmio) + cxq.bf_next = ffi.cast("uint64_t*", mmio + 
(cxq.uar * 4096) + 0x800) + cxq.bf_alt = ffi.cast("uint64_t*", mmio + (cxq.uar * 4096) + 0x900) + + local mask = cxq.sqsize - 1 + -- Return the transmit queue slot for the given WQE ID. + -- (Transmit queue is a smaller power of two than max WQE ID.) + local function slot (wqeid) + return band(wqeid, mask) + end -- Transmit packets from the link onto the send queue. - function self:transmit (l) - local start_wqeid = next_wqeid - while not link.empty(l) and packets[next_packet] == nil do + function sq:transmit (l) + local start_wqeid = cxq.next_tx_wqeid + local next_slot = slot(start_wqeid) + while not link.empty(l) and cxq.tx[next_slot] == nil do local p = link.receive(l) - local wqe = swq[next_packet] - packets[next_packet] = p + local wqe = cxq.swq[next_slot] + -- Store packet pointer so that we can free it later + cxq.tx[next_slot] = p + + -- Construct a 64-byte transmit descriptor. + -- This is in three parts: Control, Ethernet, Data. + -- The Ethernet part includes some inline data. + -- Control segment - wqe.u32[0] = bswap(shl(next_wqeid, 8) + 0x0A) - wqe.u32[1] = bswap(shl(sqn, 8) + 4) + wqe.u32[0] = bswap(shl(cxq.next_tx_wqeid, 8) + 0x0A) + wqe.u32[1] = bswap(shl(cxq.sqn, 8) + 4) wqe.u32[2] = bswap(shl(2, 2)) -- completion always -- Ethernet segment local ninline = 16 @@ -952,41 +1111,40 @@ function SQ:new (sqn, swq, wqsize, doorbell, mmio, uar, rlkey, cq) ffi.copy(wqe.u8 + 0x1E, p.data, ninline) -- Send Data Segment (inline data) wqe.u32[12] = bswap(p.length - ninline) - wqe.u32[13] = bswap(rlkey) + wqe.u32[13] = bswap(cxq.rlkey) local phy = memory.virtual_to_physical(p.data + ninline) wqe.u32[14] = bswap(tonumber(phy) / 2^32) wqe.u32[15] = bswap(tonumber(phy) % 2^32) -- Advance counters - next_wqeid = (next_wqeid + 1) % 65536 - next_packet = next_wqeid % wqsize + cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 + next_slot = slot(cxq.next_tx_wqeid) end -- Ring the doorbell if we enqueued new packets. 
- if next_wqeid ~= start_wqeid then - local current_packet = (next_packet + wqsize-1) % wqsize - doorbell.send = bswap(next_wqeid) - bf_next[0] = swq[current_packet].u64[0] + if cxq.next_tx_wqeid ~= start_wqeid then + local current_packet = slot(cxq.next_tx_wqeid + cxq.sqsize-1) + cxq.doorbell.send = bswap(cxq.next_tx_wqeid) + cxq.bf_next[0] = cxq.swq[current_packet].u64[0] -- Switch next/alternate blue flame register for next time - bf_next, bf_alt = bf_alt, bf_next + cxq.bf_next, cxq.bf_alt = cxq.bf_alt, cxq.bf_next end end local next_reclaim = 0 -- Free packets when their transmission is complete. - function self:reclaim () - local c = cqe[0] - local opcode = cqe.u8[0x38] - local wqeid = shr(bswap(cqe.u32[0x3C/4]), 16) + function sq:reclaim () + local opcode = cxq.scq[0].u8[0x38] if opcode == 0x0A then - while next_reclaim ~= wqeid % wqsize do - assert(packets[next_reclaim] ~= nil) - packet.free(packets[next_reclaim]) - packets[next_reclaim] = nil - next_reclaim = (next_reclaim + 1) % wqsize + local wqeid = shr(bswap(cxq.scq[0].u32[0x3C/4]), 16) + while next_reclaim ~= wqeid % cxq.sqsize do + assert(cxq.tx[next_reclaim] ~= nil) + packet.free(cxq.tx[next_reclaim]) + cxq.tx[next_reclaim] = nil + next_reclaim = tonumber(slot(next_reclaim + 1)) end end end - return self + return sq end NIC_RX = 0 -- Flow table type code for incoming packets @@ -997,7 +1155,7 @@ function HCA:create_root_flow_table (table_type) self:command("CREATE_FLOW_TABLE", 0x3C, 0x0C) :input("opcode", 0x00, 31, 16, 0x930) :input("table_type", 0x10, 31, 24, table_type) - :input("log_size", 0x18 + 0x00, 7, 0, 4) -- XXX make parameter + :input("log_size", 0x18 + 0x00, 7, 0, 10) -- XXX make parameter :execute() local table_id = self:output(0x08, 23, 0) return table_id @@ -1020,7 +1178,7 @@ function HCA:create_flow_group_wildcard (table_id, table_type, start_ix, end_ix) :input("table_id", 0x14, 23, 0, table_id) :input("start_ix", 0x1C, 31, 0, start_ix) :input("end_ix", 0x24, 31, 0, end_ix) -- 
(inclusive) - :input("match_criteria", 0x3C, 7, 0, 0) -- match outer headers + :input("match_criteria", 0x3C, 7, 0, 0) :execute() local group_id = self:output(0x08, 23, 0) return group_id @@ -1042,6 +1200,44 @@ function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, flow :execute() end +-- Create a DMAC+VLAN flow group. +function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, usevlan) + self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x933) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("start_ix", 0x1C, 31, 0, start_ix) + :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) + :input("match_criteria", 0x3C, 7, 0, 1) -- match outer headers + :input("dmac0", 0x40 + 0x08, 31, 0, 0xFFFFFFFF) + :input("dmac1", 0x40 + 0x0C, 31, 16, 0xFFFF) + if usevlan then + self:input("vlanid", 0x40 + 0x0C, 11, 0, 0xFFF) + end + self:execute() + local group_id = self:output(0x08, 23, 0) + return group_id +end + +-- Set a DMAC+VLAN flow table rule. 
+function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, flow_index, tir, dmac, vlanid) + self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) + :input("opcode", 0x00, 31, 16, 0x936) + :input("opmod", 0x04, 15, 0, 0) -- new entry + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("flow_index", 0x20, 31, 0, flow_index) + :input("group_id", 0x40 + 0x04, 31, 0, group_id) + :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST + :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size + :input("dmac0", 0x40 + 0x48, 31, 0, math.floor(dmac/2^16)) + :input("dmac1", 0x40 + 0x4C, 31, 16, band(dmac, 0xFFFF)) + :input("vlan", 0x40 + 0x4C, 11, 0, vlanid or 0) + :input("dest_type", 0x40 + 0x300, 31, 24, 2) + :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :execute() +end + --------------------------------------------------------------- -- PHY control access --------------------------------------------------------------- @@ -1051,6 +1247,7 @@ end PAOS = 0x5006 -- Port Administrative & Operational Status PPLR = 0x5018 -- Port Physical Loopback Register) +PMTU = 0x5003 -- Set the administrative status of the port (boolean up/down). 
function HCA:set_admin_status (admin_up) @@ -1064,6 +1261,16 @@ function HCA:set_admin_status (admin_up) :execute() end +function HCA:set_port_mtu (mtu) + self:command("ACCESS_REGISTER", 0x1C, 0x0C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 0) -- write + :input("register_id", 0x08, 15, 0, PMTU) + :input("local_port", 0x10, 23, 16, 1) + :input("admin_mtu", 0x18, 31, 16, mtu) + :execute() +end + function HCA:get_port_status () self:command("ACCESS_REGISTER", 0x10, 0x1C) :input("opcode", 0x00, 31, 16, 0x805) @@ -1552,14 +1759,16 @@ function selftest () os.exit(engine.test_skipped_code) end - local nic0 = ConnectX4:new{pciaddress = pcidev0, queues = {'a'}} - local nic1 = ConnectX4:new{pciaddress = pcidev1, queues = {'b'}} local io0 = IO:new({pciaddress = pcidev0, queue = 'a'}) local io1 = IO:new({pciaddress = pcidev1, queue = 'b'}) io0.input = { input = link.new('input0') } io0.output = { output = link.new('output0') } io1.input = { input = link.new('input1') } io1.output = { output = link.new('output1') } + -- Exercise the IO apps before the NIC is initialized. + io0:pull() io0:push() io1:pull() io1:push() + local nic0 = ConnectX4:new{pciaddress = pcidev0, queues = {{id='a'}}} + local nic1 = ConnectX4:new{pciaddress = pcidev1, queues = {{id='b'}}} print("selftest: waiting for both links up") while (nic0.hca:query_vport_state().oper_state ~= 1) or @@ -1567,16 +1776,18 @@ function selftest () C.usleep(1e6) end - local bursts = 10000 - local each = 1000 + local bursts = 1000 + local each = 100 local octets = 100 print(("Links up. 
Sending %s packets."):format(lib.comma_value(each*bursts))) for i = 1, bursts do - for _, app in ipairs({io0, io1}) do + for id, app in ipairs({io0, io1}) do for i = 1, each do local p = packet.allocate() ffi.fill(p.data, octets, 0) -- zero packet + local header = lib.hexundump("000000000001 000000000002 0800", 16) + ffi.copy(p.data, header, #header) p.data[12] = 0x08 -- ethertype = 0x0800 p.length = octets link.transmit(app.input.input, p) @@ -1585,6 +1796,27 @@ function selftest () app:push() end end + print("link", "txpkt", "txbyte", "txdrop") + local i0 = io0.input.input + local i1 = io1.input.input + local o0 = io0.output.output + local o1 = io1.output.output + print("send0", tonumber(counter.read(i0.stats.txpackets)), tonumber(counter.read(i0.stats.txbytes)), tonumber(counter.read(i0.stats.txdrop))) + print("send1", tonumber(counter.read(i1.stats.txpackets)), tonumber(counter.read(i1.stats.txbytes)), tonumber(counter.read(i1.stats.txdrop))) + print("recv0", tonumber(counter.read(o0.stats.txpackets)), tonumber(counter.read(o0.stats.txbytes)), tonumber(counter.read(o0.stats.txdrop))) + print("recv1", tonumber(counter.read(o1.stats.txpackets)), tonumber(counter.read(o1.stats.txbytes)), tonumber(counter.read(o1.stats.txdrop))) + + print("payload snippets of first 5 packets") + print("port0") + for i = 1, 5 do + local p = link.receive(o0) + if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end + end + print("port1") + for i = 1, 5 do + local p = link.receive(o1) + if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end + end print() print(("%-16s %20s %20s"):format("hardware counter", pcidev0, pcidev1)) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua new file mode 100644 index 0000000000..0e7c56e880 --- /dev/null +++ b/src/apps/mellanox/connectx_test.lua @@ -0,0 +1,180 @@ +-- Test suite for the Mellanox ConnectX-4 driver. 
+-- Use of this source code is governed by the Apache 2.0 license; see COPYING. +module(..., package.seeall) + +local ffi = require("ffi") +local C = ffi.C +local connectx4 = require("apps.mellanox.connectx4") +local counter = require("core.counter") +local lib = require("core.lib") + +-- Test scenarios: +-- unicast-multiqueue +-- number of queues + +-- Test sending traffic between two directly attached network interfaces. +-- +-- pci0, pci1: device PCI addresses +-- npackets: number of packets to transfer (lower bound) +-- ncores: number of CPU cores per network interface +-- minlen: minimum packet length (excl. ethernet FCS) +-- maxlen: maximum packet length +-- minburst: minimum burst size (packets) sent to the driver +-- maxburst: maximum burst size +-- macs: number of unique mac addresses +-- vlans: number of unique VLAN IDs +-- rss: number of RSS hash buckets. +-- +-- Hardware queue count will be macs*vlans*rss on each interface. +function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburst, macs, vlans, rss) + print("selftest: connectx4_test switch") + assert(rss == 1, "rss not yet handled") + assert(ncores == 1, "multicore not yet handled") + -- Create queue definitions + local queues = {} + for vlan = 1, vlans do + for mac = 1, macs do + local id = ("vlan%d.mac%d"):format(vlan, mac) + queues[#queues+1] = {id=id, vlan=vlan, mac="00:00:00:00:00:"..bit.tohex(mac, 2)} + end + end + -- Instantiate app network + local nic0 = connectx4.ConnectX4:new({pciaddress=pci0, queues=queues, macvlan=true}) + local nic1 = connectx4.ConnectX4:new({pciaddress=pci1, queues=queues, macvlan=true}) + local io0 = {} -- io apps on nic0 + local io1 = {} -- io apps on nic1 + print(("creating %d queues per device..."):format(#queues)) + for _, queue in ipairs(queues) do + local function ioapp (pci, queue) + local a = connectx4.IO:new({pciaddress=pci, queue=queue.id}) + a.input = { input = link.new(("input-%s-%s" ):format(pci, queue.id)) } + a.output = { output = 
link.new(("output-%s-%s"):format(pci, queue.id)) } + return a + end + io0[queue.id] = ioapp(pci0, queue) + io1[queue.id] = ioapp(pci1, queue) + end + -- Create diverse packet payload templates + print("creating payloads...") + local payload = {} + local npayloads = 1000 + for i = 1, npayloads do + local p = packet.allocate() + payload[i] = p + p.length = between(minlen, maxlen) + ffi.fill(p.data, p.length, 0) + + -- MAC destination + local r = math.random() + if r < 0.10 then -- 10% of packets are broadcast + ffi.fill(p.data, 6, 0xFF) + elseif r < 0.20 then -- 10% are unicast to random destinations + for i = 1, 5 do p.data[i] = math.random(256) - 1 end + else -- rest are unicast to known mac + p.data[5] = between(1, macs) + end + + p.data[12] = 0x08 -- ipv4 + + -- MAC source + for i = 7, 11 do p.data[i] = math.random(256) - 1 end + -- 802.1Q + p.data[12] = 0x81 + p.data[15] = between(1, vlans) -- vlan id can be out of expected range + p.data[16] = 0x08 -- ipv4 + -- Random payload + for i = 50, p.length-1 do + p.data[i] = math.random(256) - 1 + end + --print(lib.hexdump(ffi.string(p.data, 32))) + end + -- Wait for linkup on both ports + print("waiting for linkup...") + while not (nic0.hca:linkup() and nic1.hca:linkup()) do C.usleep(0.25e6) end + -- Send packets + print("sending packets...") + + local function dump (pci, id, app) + -- Dump received packets + while not link.empty(app.output.output) do + local p = link.receive(app.output.output) + --print(("recv %s %4d %s: %s"):format(pci, p.length, id, lib.hexdump(ffi.string(p.data, 32)))) + packet.free(p) + end + end + + local start = engine.now() + local remaining = npackets + require("lib.traceprof.traceprof").start() + while remaining > 0 do + -- Send packets + for id, _ in pairs(io0) do + for i = 1, between(minburst, maxburst) do + if remaining > 0 then + local p = payload[between(1, npayloads)] + --print(("send(%4d): %s"):format(p.length, lib.hexdump(ffi.string(p.data, 32)))) + link.transmit(io0[id].input.input, 
packet.clone(p)) + link.transmit(io1[id].input.input, packet.clone(p)) + remaining = remaining - 1 + end + end + end + -- Simulate breathing + --C.usleep(100) + for id, app in pairs(io0) do app:pull() app:push() dump(pci0, id, app) end + for id, app in pairs(io1) do app:pull() app:push() dump(pci1, id, app) end + -- Simulate breathing + end + require("lib.traceprof.traceprof").stop() + -- Receive any last packets + C.usleep(100) + for i = 1, 10 do + for id, app in pairs(io0) do app:pull() app:push() dump(pci0, id, app) end + for id, app in pairs(io1) do app:pull() app:push() dump(pci1, id, app) end + end + local finish = engine.now() + print("reporting...") + print(("%-16s %20s %20s"):format("hardware counter", pci0, pci1)) + print("---------------- -------------------- --------------------") + local stat0 = nic0.hca:query_vport_counter() + local stat1 = nic1.hca:query_vport_counter() + -- Sort into key order + local t = {} + for k in pairs(stat0) do table.insert(t, k) end + table.sort(t) + for _, k in pairs(t) do + print(("%-16s %20s %20s"):format(k, lib.comma_value(stat0[k]), lib.comma_value(stat1[k]))) + end + + print(("@@ %16s; %12s; %12s; %12s; %12s; %12s; %12s; %12s"):format( + "nic", "link", "txpkt", "txbyte", "txdrop", "rxpkt", "rxbyte", "rxdrop")) + for id in pairs(io0) do + local function prlink (nic, id, app) + local function count (cnt) return tonumber(counter.read(cnt)) end + local srx = app.input.input.stats + local stx = app.output.output.stats + print(("@@ %16s; %12s; %12d; %12d; %12d; %12d; %12d; %12d"):format( + nic, id, + count(srx.txpackets), count(srx.txbytes), count(srx.txdrop), + count(stx.txpackets), count(stx.txbytes), count(stx.txdrop))) + end + prlink(pci0, id, io0[id]) + prlink(pci1, id, io1[id]) + end + print(("time: %.1fs - Mpps: %.3f per NIC"):format(finish-start, npackets/1e6/(finish-start))) + print("selftest: done") +end + +-- Return a random number between min and max (inclusive.) 
+function between (min, max) + if min == max then + return min + else + return min + math.random(max-min+1) - 1 + end +end + +function selftest () + switch("02:00.0", "03:00.0", 10e6, 1, 60, 1500, 100, 100, 4, 4, 1) +end + From 44740000e74c1664693e14c0f2b52529b14f592b Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Tue, 10 Jul 2018 11:47:54 +0200 Subject: [PATCH 031/209] Uncommited changes from lukego --- src/apps/mellanox/connectx4.lua | 126 +++++++++++++++++--------------- 1 file changed, 67 insertions(+), 59 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index b9ef71f1a4..7b450792c3 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -49,7 +49,7 @@ local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot local cast, typeof = ffi.cast, ffi.typeof -local debug_trace = false -- Print trace messages +local debug_trace = true -- Print trace messages local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) -- Maximum size of a receive queue table. @@ -208,6 +208,8 @@ function ConnectX4:new (conf) local init_seg = InitializationSegment:new(mmio) local hca = HCA:new(init_seg) + -- Makes enable_hca() hang with ConnectX5 + -- init_seg:reset() init_seg:cmdq_phy_addr(memory.virtual_to_physical(hca.entry)) if debug_trace then init_seg:dump() end while not init_seg:ready() do @@ -1009,54 +1011,51 @@ function RQ:new (cxq) end function rq:receive (l) - if have_input() then - local limit = engine.pull_npackets - while limit > 0 and not link.full(l) do - -- Find the next completion entry. - local c = cxq.rcq[cxq.next_rx_cqeid] - local owner = bit.band(1, c.u8[0x3F]) - limit = limit - 1 - -- Advance to next completion. - -- Note: assumes sqsize == cqsize - cxq.next_rx_cqeid = slot(cxq.next_rx_cqeid + 1) - -- Toggle the ownership value if the CQ wraps around. 
- if cxq.next_rx_cqeid == 0 then - cxq.rx_mine = (cxq.rx_mine + 1) % 2 - end - -- Decode the completion entry. - local opcode = shr(c.u8[0x3F], 4) - local len = bswap(c.u32[0x2C/4]) - local wqeid = shr(bswap(c.u32[0x3C/4]), 16) - local idx = slot(wqeid) - if opcode == 0 or opcode == 2 then - -- Successful receive - local p = cxq.rx[idx] - assert(p ~= nil) - p.length = len - link.transmit(l, p) - cxq.rx[idx] = nil - elseif opcode == 13 or opcode == 14 then - local syndromes = { - [0x1] = "Local_Length_Error", - [0x4] = "Local_Protection_Error", - [0x5] = "Work_Request_Flushed_Error", - [0x6] = "Memory_Window_Bind_Error", - [0x10] = "Bad_Response_Error", - [0x11] = "Local_Access_Error", - [0x12] = "Remote_Invalid_Request_Error", - [0x13] = "Remote_Access_Error", - [0x14] = "Remote_Operation_Error" - } - local syndrome = c.u8[0x37] - print(("Got error. opcode=%d syndrome=0x%x message=%s"):format( - opcode, syndrome, syndromes[syndromes])) -- XXX - -- Error on receive - assert(packets[idx] ~= nil) - packet.free(packets[idx]) - packets[idx] = nil - else - error(("Unexpected CQE opcode: %d (0x%x)"):format(opcode, opcode)) - end + local limit = engine.pull_npackets + while have_input() and limit > 0 and not link.full(l) do + -- Find the next completion entry. + local c = cxq.rcq[cxq.next_rx_cqeid] + limit = limit - 1 + -- Advance to next completion. + -- Note: assumes sqsize == cqsize + cxq.next_rx_cqeid = slot(cxq.next_rx_cqeid + 1) + -- Toggle the ownership value if the CQ wraps around. + if cxq.next_rx_cqeid == 0 then + cxq.rx_mine = (cxq.rx_mine + 1) % 2 + end + -- Decode the completion entry. 
+ local opcode = shr(c.u8[0x3F], 4) + local len = bswap(c.u32[0x2C/4]) + local wqeid = shr(bswap(c.u32[0x3C/4]), 16) + local idx = slot(wqeid) + if opcode == 0 or opcode == 2 then + -- Successful receive + local p = cxq.rx[idx] + assert(p ~= nil) + p.length = len + link.transmit(l, p) + cxq.rx[idx] = nil + elseif opcode == 13 or opcode == 14 then + local syndromes = { + [0x1] = "Local_Length_Error", + [0x4] = "Local_Protection_Error", + [0x5] = "Work_Request_Flushed_Error", + [0x6] = "Memory_Window_Bind_Error", + [0x10] = "Bad_Response_Error", + [0x11] = "Local_Access_Error", + [0x12] = "Remote_Invalid_Request_Error", + [0x13] = "Remote_Access_Error", + [0x14] = "Remote_Operation_Error" + } + local syndrome = c.u8[0x37] + print(("Got error. opcode=%d syndrome=0x%x message=%s"):format( + opcode, syndrome, syndromes[syndromes])) -- XXX + -- Error on receive + assert(packets[idx] ~= nil) + packet.free(packets[idx]) + packets[idx] = nil + else + error(("Unexpected CQE opcode: %d (0x%x)"):format(opcode, opcode)) end end end @@ -1659,6 +1658,11 @@ function InitializationSegment:health_syndrome () return self:getbits(0x1010, 31, 24) end +function InitializationSegment:reset () + -- Not covered in PRM + self:setbits(0x14, 10, 8, 0x7) +end + function InitializationSegment:dump () print('fw_rev ', self:fw_rev()) print('cmd_interface_rev ', self:cmd_interface_rev()) @@ -1776,24 +1780,28 @@ function selftest () C.usleep(1e6) end - local bursts = 1000 + local bursts = 10000 local each = 100 local octets = 100 print(("Links up. 
Sending %s packets."):format(lib.comma_value(each*bursts))) - for i = 1, bursts do + for i = 1, bursts + 100 do for id, app in ipairs({io0, io1}) do - for i = 1, each do - local p = packet.allocate() - ffi.fill(p.data, octets, 0) -- zero packet - local header = lib.hexundump("000000000001 000000000002 0800", 16) - ffi.copy(p.data, header, #header) - p.data[12] = 0x08 -- ethertype = 0x0800 - p.length = octets - link.transmit(app.input.input, p) + if i <= bursts then + for i = 1, each do + local p = packet.allocate() + ffi.fill(p.data, octets, 0) -- zero packet + local header = lib.hexundump("000000000001 000000000002 0800", 16) + ffi.copy(p.data, header, #header) + p.data[12] = 0x08 -- ethertype = 0x0800 + p.length = octets + link.transmit(app.input.input, p) + end end app:pull() app:push() + while not link.empty(io0.output.output) do packet.free(link.receive(io0.output.output)) end + while not link.empty(io1.output.output) do packet.free(link.receive(io1.output.output)) end end end print("link", "txpkt", "txbyte", "txdrop") From 580cf9b00359d9759462e326210a10662ec9f95f Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Tue, 10 Jul 2018 21:42:25 +0200 Subject: [PATCH 032/209] connectx4.lua: minor changes Fix the opcode for DISABLE_HCA (currently unused), add assert to alloc_pages() method. --- src/apps/mellanox/connectx4.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 7b450792c3..c60de555d6 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -435,6 +435,7 @@ end -- Provide the NIC with freshly allocated memory. 
function HCA:alloc_pages (num_pages) + assert(num_pages > 0) self:command("MANAGE_PAGES", 0x14 + num_pages*8, 0x0C) :input("opcode", 0x00, 31, 16, 0x108) :input("opmod", 0x04, 15, 0, 1) -- allocate mode @@ -532,8 +533,7 @@ end function HCA:disable_hca () self:command("DISABLE_HCA", 0x0c, 0x0c) - :input("opcode", 0x00, 31, 16, 0x103) - :input("opmod", 0x04, 15, 0, mode) + :input("opcode", 0x00, 31, 16, 0x105) :execute() end From 10ceea423459d30a1a2785f11f3e6dd6009734f2 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Wed, 11 Jul 2018 09:56:39 +0200 Subject: [PATCH 033/209] connectx4.lua: fix alignment of HCA command queue --- src/apps/mellanox/connectx4.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index c60de555d6..e05d08fae6 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -1318,7 +1318,7 @@ local data_per_mailbox = 0x200 -- Bytes of input/output data in a mailbox -- Create a command queue with dedicated/reusable DMA memory. function HCA:new (init_seg) - local entry = ffi.cast("uint32_t*", memory.dma_alloc(0x40)) + local entry = ffi.cast("uint32_t*", memory.dma_alloc(0x40, 4096)) local inboxes, outboxes = {}, {} for i = 0, max_mailboxes-1 do -- XXX overpadding.. 0x240 alignment is not accepted? 
From 1b9cf2b799fa617ea2ee1bde79ebcc95ec572663 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Thu, 12 Jul 2018 12:56:40 +0200 Subject: [PATCH 034/209] connectx4.lua: fix RSS --- src/apps/mellanox/connectx4.lua | 113 ++++++++++++++++++++++++++++---- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index e05d08fae6..5729bb5c96 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -49,8 +49,8 @@ local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot local cast, typeof = ffi.cast, ffi.typeof -local debug_trace = true -- Print trace messages -local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) +local debug_trace = false -- Print trace messages +local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) -- Maximum size of a receive queue table. -- XXX This is hard-coded in the Linux mlx5 driver too. Could @@ -293,10 +293,34 @@ function ConnectX4:new (conf) rule = rule + 1 end else + -- Set up RSS accross all queues. Hashing is only performed for + -- IPv4/IPv6 and TCP/UDP, i.e. non-IP packets as well as non + -- TCP/UDP packets are mapped to Queue #1. Hashing is done by + -- the TIR for a specific combination of protocols, hence + -- separate flows are needed to provide each TIR with the + -- appropriate types of packets. 
+ local l3_protos = { 'v4', 'v6' } + local l4_protos = { 'udp', 'tcp' } local rqt = hca:create_rqt(rqlist) - local flow_group_id = hca:create_flow_group_wildcard(rxtable, NIC_RX, 0, 0) - local tir = hca:create_tir_indirect(rqt, tdomain) - hca:set_flow_table_entry_wildcard(rxtable, NIC_RX, flow_group_id, 0, tir) + local flow_group_ip = + hca:create_flow_group_ip(rxtable, NIC_RX, 0, + #l3_protos * #l4_protos - 1) + local index = 0 + for _, l3_proto in ipairs(l3_protos) do + for _, l4_proto in ipairs(l4_protos) do + local tir = hca:create_tir_indirect(rqt, tdomain, + l3_proto, l4_proto) + hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip, + index, tir, l3_proto, l4_proto) + index = index + 1 + end + end + + local flow_group_wildcard = + hca:create_flow_group_wildcard(rxtable, NIC_RX, index, index) + local tir_q1 = hca:create_tir_direct(rqlist[1], tdomain) + hca:set_flow_table_entry_wildcard(rxtable, NIC_RX, + flow_group_wildcard, index, tir_q1) end hca:set_flow_table_root(rxtable, NIC_RX) @@ -705,8 +729,19 @@ function HCA:create_tir_direct (rqn, transport_domain) return self:output(0x08, 23, 0) end --- Create a TIR with indirect dispatching (hashing) -function HCA:create_tir_indirect (rqt, transport_domain) +-- Create a TIR with indirect dispatching (hashing) for a particular +-- combination of IP protocol and TCP/UDP ports. 
+function HCA:create_tir_indirect (rqt, transport_domain, l3_proto, l4_proto) + local l3_protos = { + v4 = 0, + v6 = 1 + } + local l4_protos = { + tcp = 0, + udp = 1 + } + local l3_proto = assert(l3_protos[l3_proto or 'v4'], "invalid l3 proto") + local l4_proto = assert(l4_protos[l4_proto or 'tcp'], "invalid l4 proto") self:command("CREATE_TIR", 0x10C, 0x0C) :input("opcode", 0x00, 31, 16, 0x900) :input("disp_type", 0x20 + 0x04, 31, 28, 1) -- indirect @@ -714,6 +749,9 @@ function HCA:create_tir_indirect (rqt, transport_domain) :input("indirect_table", 0x20 + 0x20, 23, 0, rqt) :input("rx_hash_fn", 0x20 + 0x24, 31, 28, 2) -- toeplitz :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) + :input("l3_prot_type", 0x20 + 0x50, 31, 31, l3_proto) + :input("l4_prot_type", 0x20 + 0x50, 30, 30, l4_proto) + :input("selected_fields", 0x20 + 0x50, 29, 0, 15) -- SRC/DST/SPORT/DPORT -- XXX Is random hash key a good solution? for i = 0x28, 0x4C, 4 do self:input("toeplitz_key["..((i-0x28)/4).."]", 0x20 + i, 31, 0, math.random(2^32)) @@ -1172,19 +1210,20 @@ end -- Create a "wildcard" flow group that does not inspect any fields. function HCA:create_flow_group_wildcard (table_id, table_type, start_ix, end_ix) self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) - :input("opcode", 0x00, 31, 16, 0x933) - :input("table_type", 0x10, 31, 24, table_type) - :input("table_id", 0x14, 23, 0, table_id) - :input("start_ix", 0x1C, 31, 0, start_ix) - :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) - :input("match_criteria", 0x3C, 7, 0, 0) + :input("opcode", 0x00, 31, 16, 0x933) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("start_ix", 0x1C, 31, 0, start_ix) + :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) + :input("match_criteria_enable", 0x3C, 7, 0, 0) -- match outer headers :execute() local group_id = self:output(0x08, 23, 0) return group_id end -- Set a "wildcard" flow table entry that does not match on any fields. 
-function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, flow_index, tir) +function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, + flow_index, tir) self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1199,6 +1238,52 @@ function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, flow :execute() end +-- Create a flow group that inspects the ethertype and protocol fields. +function HCA:create_flow_group_ip (table_id, table_type, start_ix, end_ix) + self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) + :input("opcode", 0x00, 31, 16, 0x933) + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("start_ix", 0x1C, 31, 0, start_ix) + :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) + :input("match_criteria_enable", 0x3C, 7, 0, 1) -- match outer headers + :input("match_ether", 0x40 + 0x04, 15, 0, 0xFFFF) + :input("match_proto", 0x40 + 0x10, 31, 24, 0xFF) + :execute() + local group_id = self:output(0x08, 23, 0) + return group_id +end + +-- Set a flow table entry that matches on the ethertype for IPv4/IPv6 +-- as well as TCP/UDP protocol/next-header. 
+function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, + flow_index, tir, l3_proto, l4_proto) + local ethertypes = { + v4 = 0x0800, + v6 = 0x86dd + } + local l4_protos = { + udp = 17, + tcp = 6 + } + local type = assert(ethertypes[l3_proto], "invalid l3 proto") + local proto = assert(l4_protos[l4_proto], "invalid l4 proto") + self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) + :input("opcode", 0x00, 31, 16, 0x936) + :input("opmod", 0x04, 15, 0, 0) -- new entry + :input("table_type", 0x10, 31, 24, table_type) + :input("table_id", 0x14, 23, 0, table_id) + :input("flow_index", 0x20, 31, 0, flow_index) + :input("group_id", 0x40 + 0x04, 31, 0, group_id) + :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST + :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size + :input("match_ether", 0x40 + 0x40 + 0x04, 15, 0, type) + :input("match_proto", 0x40 + 0x40 + 0x10, 31, 24, proto) + :input("dest_type", 0x40 + 0x300, 31, 24, 2) -- TIR + :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :execute() +end + -- Create a DMAC+VLAN flow group. 
function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, usevlan) self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) From 83326715efad7471d57e786515458dd70cd5f0ab Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 13 Jul 2018 17:13:53 +0200 Subject: [PATCH 035/209] Register ConnectX5 device in lib.hardware.pci --- src/lib/hardware/pci.lua | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index 967927517e..7b74ab370f 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -84,14 +84,16 @@ local cards = { ["0x0903"] = {model = 'SFN7122F', driver = 'apps.solarflare.solarflare'} }, ["0x15b3"] = { - ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx4'} + ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx4'}, + ["0x1017" ] = {model = 'MT27800', driver = 'apps.mellanox.connectx4'}, }, } local link_names = { ['apps.solarflare.solarflare'] = { "rx", "tx" }, ['apps.intel_mp.intel_mp'] = { "input", "output" }, - ['apps.intel.intel_app'] = { "rx", "tx" } + ['apps.intel.intel_app'] = { "rx", "tx" }, + ['apps.mellanox.connectx4'] = { "input", "output" }, } -- Return the name of the Lua module that implements support for this device. 
From 263919adef9f9115b879460f396b93bb7d55a9c9 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 13 Jul 2018 17:24:54 +0200 Subject: [PATCH 036/209] connectx4.lua: identify driver for device_info --- src/apps/mellanox/connectx4.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 5729bb5c96..d2fe02824f 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -903,6 +903,9 @@ end IO = {} IO.__index = IO +-- The IO module is the device driver in the sense of +-- lib.hardware.pci.device_info +driver = IO function IO:new (conf) local self = setmetatable({}, self) From d57c67abc72516ceb6ae9a8d6148f32f66f76f15 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Wed, 18 Jul 2018 10:41:28 +0200 Subject: [PATCH 037/209] connectx4.lua: add statistics counters Add the same set of counters supported by the Intel driver. Add a pull() method to the ConnectX4 object to periodically sync the stats so the "control" app can be run as a regular Snabb app. 
--- src/apps/mellanox/connectx4.lua | 156 +++++++++++++++++++++++++++++++- 1 file changed, 153 insertions(+), 3 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index d2fe02824f..c34719d260 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -324,6 +324,39 @@ function ConnectX4:new (conf) end hca:set_flow_table_root(rxtable, NIC_RX) + self.shm = { + mtu = {counter, mtu}, + txdrop = {counter} + } + + local vport_context = hca:query_nic_vport_context() + local frame = { + dtime = {counter, C.get_unix_time()}, + -- Keep a copy of the mtu here to have all + -- data available in a single shm frame + mtu = {counter, mtu}, + speed = {counter}, + status = {counter, 2}, -- Link down + type = {counter, 0x1000}, -- ethernetCsmacd + promisc = {counter, vport_context.promisc_all}, + macaddr = {counter, + macaddress:new(vport_context.permanent_address).bits}, + rxbytes = {counter}, + rxpackets = {counter}, + rxmcast = {counter}, + rxbcast = {counter}, + rxdrop = {counter}, + rxerrors = {counter}, + txbytes = {counter}, + txpackets = {counter}, + txmcast = {counter}, + txbcast = {counter}, + txdrop = {counter}, + txerrors = {counter}, + } + self.stats = shm.create_frame("pci/"..pciaddress, frame) + self.sync_timer = lib.throttle(1) + function self:stop () pci.set_bus_master(pciaddress, false) pci.reset_device(pciaddress) @@ -331,6 +364,33 @@ function ConnectX4:new (conf) mmio, fd = nil end + function self:pull () + if self.sync_timer() then + self:sync_stats() + end + end + + function self:sync_stats () + local set, stats = counter.set, self.stats + local port_stats = self.hca:get_port_stats() + set(stats.rxbytes, port_stats.rxbytes) + set(stats.rxpackets, port_stats.rxpackets) + set(stats.rxmcast, port_stats.rxmcast) + set(stats.rxbcast, port_stats.rxbcast) + set(stats.rxdrop, port_stats.rxdrop) + set(stats.rxerrors, port_stats.rxerrors) + set(stats.txbytes, port_stats.txbytes) + set(stats.txpackets, 
port_stats.txpackets) + set(stats.txmcast, port_stats.txmcast) + set(stats.txbcast, port_stats.txbcast) + set(stats.txdrop, port_stats.txdrop) + set(stats.txerrors, port_stats.txerrors) + + set(stats.speed, self.hca:get_port_speed()) + set(stats.status, + (self.hca:get_port_status().oper_status == 1 and 1) or 2) + end + -- Save "instance variable" values. self.hca = hca @@ -1332,9 +1392,51 @@ end -- Note: portnumber is always 1 because the ConnectX-4 HCA is managing -- a single physical port. -PAOS = 0x5006 -- Port Administrative & Operational Status -PPLR = 0x5018 -- Port Physical Loopback Register) -PMTU = 0x5003 +PMTU = 0x5003 +PTYS = 0x5004 -- Port Type and Speed +PAOS = 0x5006 -- Port Administrative & Operational Status +PPCNT = 0x5008 -- Ports Performance Counters +PPLR = 0x5018 -- Port Physical Loopback Register + +-- Mapping of speed/protocols per 11.1.2 to speed in units of gbps +local port_speed = { + [0x00000002] = 1, -- 1000Base-KX + [0x00000004] = 10, -- 10GBase-CX4 + [0x00000008] = 10, -- 10GBase-KX4 + [0x00000010] = 10, -- 10GBase-KR + [0x00000040] = 40, -- 40GBase-CR4 + [0x00000080] = 40, -- 40GBase-KR4 + [0x00001000] = 10, -- 10GBase-CR + [0x00002000] = 10, -- 10GBase-SR + [0x00004000] = 10, -- 10GBase-ER/LR + [0x00008000] = 40, -- 40GBase-SR4 + [0x00010000] = 40, -- 40GBase-LR4/ER4 + [0x00040000] = 50, -- 50GBase-SR2 + [0x00100000] = 100, -- 100GBase-CR4 + [0x00200000] = 100, -- 100GBase-SR4 + [0x00400000] = 100, -- 100GBase-KR4 + -- Undocumented (from a ConnectX5 NIC with CWDM plugin) + [0x00800000] = 100, -- 100GBase-CWDM + [0x08000000] = 25, -- 25GBase-CR + [0x10000000] = 25, -- 25GBase-KR + [0x20000000] = 25, -- 25GBase-SR + [0x40000000] = 50, -- 50GBase-CR2 + [0x80000000] = 50, -- 50GBase-KR2 +} + +-- Get the speed of the port in bps +function HCA:get_port_speed () + self:command("ACCESS_REGISTER", 0x4C, 0x4C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PTYS) + 
:input("local_port", 0x10, 23, 16, 1) + :input("proto_mask", 0x10, 2, 0, 0x4) -- Ethernet + :execute() + -- This doesn' + local eth_proto_oper = self:output(0x10 + 0x24, 31, 0) + return (port_speed[eth_proto_oper] or 0) * 1e9 +end -- Set the administrative status of the port (boolean up/down). function HCA:set_admin_status (admin_up) @@ -1390,6 +1492,49 @@ function HCA:set_port_loopback (loopback_mode) :execute() end +local port_stats = { + rxbytes = 0ULL, + rxmcast = 0ULL, + rxbcast = 0ULL, + rxpackets = 0ULL, + rxdrop = 0ULL, + rxerrors = 0ULL, + txbytes = 0ULL, + txmcast = 0ULL, + txbcast = 0ULL, + txpackets = 0ULL, + txdrop = 0ULL, + txerrors = 0ULL, +} +function HCA:get_port_stats () + self:command("ACCESS_REGISTER", 0x14, 0x10C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PPCNT) + :input("local_port", 0x10, 23, 16, 1) + :input("grp", 0x10, 5, 0, 0x1) -- RFC 2863 + :execute() + + port_stats.rxbytes = self:output64(0x18 + 0x00) -- includes 4-byte CRC + local in_ucast_packets = self:output64(0x18 + 0x08) + port_stats.rxmcast = self:output64(0x18 + 0x48) + port_stats.rxbcast = self:output64(0x18 + 0x50) + port_stats.rxpackets = in_ucast_packets + port_stats.rxmcast + + port_stats.rxbcast + port_stats.rxdrop = self:output64(0x18 + 0x10) + port_stats.rxerrors = self:output64(0x18 + 0x18) + + port_stats.txbytes = self:output64(0x18 + 0x28) + local out_ucast_packets = self:output64(0x18 + 0x30) + port_stats.txmcast = self:output64(0x18 + 0x58) + port_stats.txbcast = self:output64(0x18 + 0x60) + port_stats.txpackets = out_ucast_packets + port_stats.txmcast + + port_stats.txbcast + port_stats.txdrop = self:output64(0x18 + 0x38) + port_stats.txerrors = self:output64(0x18 + 0x40) + return port_stats +end + --------------------------------------------------------------- -- Command Interface implementation. 
-- @@ -1526,6 +1671,11 @@ function HCA:output (offset, hi, lo) end end +function HCA:output64 (offset) + local high = self:output(offset, 31, 0) + 0ULL + local low = band(self:output(offset+4, 31, 0) + 0ULL, 0xFFFFFFFF) + return shl(high, 32) + low +end From 22af8471f605a0741b8f19d78eb17a42b1ca7833 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 20 Jul 2018 10:57:14 +0200 Subject: [PATCH 038/209] connectx4.lua: use async commands to read stats counters Polling for command completion in sync_stats() creates too much latency and causes packet drops. This is mitigated by creating separtate HCAs for each command that reads a statistics register and perfom polling for completion only in sync_stats(). --- src/apps/mellanox/connectx4.lua | 206 ++++++++++++++++++++++---------- 1 file changed, 146 insertions(+), 60 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index c34719d260..2d7f2fd8de 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -206,11 +206,11 @@ function ConnectX4:new (conf) -- local mmio, fd = pci.map_pci_memory(pciaddress, 0, true) local init_seg = InitializationSegment:new(mmio) - local hca = HCA:new(init_seg) + HCA:init(init_seg) + local hca = HCA:new() -- Makes enable_hca() hang with ConnectX5 -- init_seg:reset() - init_seg:cmdq_phy_addr(memory.virtual_to_physical(hca.entry)) if debug_trace then init_seg:dump() end while not init_seg:ready() do C.usleep(1000) @@ -355,6 +355,49 @@ function ConnectX4:new (conf) txerrors = {counter}, } self.stats = shm.create_frame("pci/"..pciaddress, frame) + + -- Create separate HCAs to retreive port statistics. Those + -- commands must be called asynchronously to reduce latency. 
+ self.stats_reqs = { + { + start_fn = HCA.get_port_stats_start, + finish_fn = HCA.get_port_stats_finish, + process_fn = function (r, stats) + local set = counter.set + set(stats.rxbytes, r.rxbytes) + set(stats.rxpackets, r.rxpackets) + set(stats.rxmcast, r.rxmcast) + set(stats.rxbcast, r.rxbcast) + set(stats.rxdrop, r.rxdrop) + set(stats.rxerrors, r.rxerrors) + set(stats.txbytes, r.txbytes) + set(stats.txpackets, r.txpackets) + set(stats.txmcast, r.txmcast) + set(stats.txbcast, r.txbcast) + set(stats.txdrop, r.txdrop) + set(stats.txerrors, r.txerrors) + end + }, + { + start_fn = HCA.get_port_speed_start, + finish_fn = HCA.get_port_speed_finish, + process_fn = function (r, stats) + counter.set(stats.speed, r) + end + }, + { + start_fn = HCA.get_port_status_start, + finish_fn = HCA.get_port_status_finish, + process_fn = function (r, stats) + counter.set(stats.status, (r.oper_status == 1 and 1) or 2) + end + }, + } + for _, req in ipairs(self.stats_reqs) do + req.hca = HCA:new() + -- Post command + req.start_fn(req.hca) + end self.sync_timer = lib.throttle(1) function self:stop () @@ -371,24 +414,13 @@ function ConnectX4:new (conf) end function self:sync_stats () - local set, stats = counter.set, self.stats - local port_stats = self.hca:get_port_stats() - set(stats.rxbytes, port_stats.rxbytes) - set(stats.rxpackets, port_stats.rxpackets) - set(stats.rxmcast, port_stats.rxmcast) - set(stats.rxbcast, port_stats.rxbcast) - set(stats.rxdrop, port_stats.rxdrop) - set(stats.rxerrors, port_stats.rxerrors) - set(stats.txbytes, port_stats.txbytes) - set(stats.txpackets, port_stats.txpackets) - set(stats.txmcast, port_stats.txmcast) - set(stats.txbcast, port_stats.txbcast) - set(stats.txdrop, port_stats.txdrop) - set(stats.txerrors, port_stats.txerrors) - - set(stats.speed, self.hca:get_port_speed()) - set(stats.status, - (self.hca:get_port_status().oper_status == 1 and 1) or 2) + for _, req in ipairs(self.stats_reqs) do + local hca = req.hca + if hca:completed() then + 
req.process_fn(req.finish_fn(hca), self.stats) + hca:post() + end + end end -- Save "instance variable" values. @@ -473,6 +505,22 @@ end -- hca object is the main interface towards the NIC firmware. HCA = {} +-- Allocate array of Command Queue Entries. Must be called prior to +-- HCA:new() +function HCA:init (init_seg, cmdq_size) + self.size = 2^init_seg:log_cmdq_size() + self.stride = 2^init_seg:log_cmdq_stride() + self.init_seg = init_seg + -- Next queue to be allocated by :new() + self.nextq = 0 + local cmdq_size = cmdq_size or self.size + assert(cmdq_size <= self.size, "command queue size limit exceeded") + local cmdq_t = ffi.typeof("uint8_t (*)[$]", self.stride) + local entries, entries_phy = memory.dma_alloc(cmdq_size * self.stride, 4096) + self.entries = ffi.cast(cmdq_t, entries) + init_seg:cmdq_phy_addr(entries_phy) +end + --------------------------------------------------------------- -- Startup & General commands --------------------------------------------------------------- @@ -1425,15 +1473,17 @@ local port_speed = { } -- Get the speed of the port in bps -function HCA:get_port_speed () +function HCA:get_port_speed_start () self:command("ACCESS_REGISTER", 0x4C, 0x4C) :input("opcode", 0x00, 31, 16, 0x805) :input("opmod", 0x04, 15, 0, 1) -- read :input("register_id", 0x08, 15, 0, PTYS) :input("local_port", 0x10, 23, 16, 1) :input("proto_mask", 0x10, 2, 0, 0x4) -- Ethernet - :execute() - -- This doesn' + :execute_async() +end + +function HCA:get_port_speed_finish () local eth_proto_oper = self:output(0x10 + 0x24, 31, 0) return (port_speed[eth_proto_oper] or 0) * 1e9 end @@ -1460,6 +1510,7 @@ function HCA:set_port_mtu (mtu) :execute() end +local port_status = { admin_status = 0, oper_status = 0 } function HCA:get_port_status () self:command("ACCESS_REGISTER", 0x10, 0x1C) :input("opcode", 0x00, 31, 16, 0x805) @@ -1467,8 +1518,24 @@ function HCA:get_port_status () :input("register_id", 0x08, 15, 0, PAOS) :input("local_port", 0x10, 23, 16, 1) :execute() - 
return {admin_status = self:output(0x10, 11, 8), - oper_status = self:output(0x10, 3, 0)} + port_status.admin_status = self:output(0x10, 11, 8) + port_status.oper_status = self:output(0x10, 3, 0) + return port_status +end + +function HCA:get_port_status_start () + self:command("ACCESS_REGISTER", 0x10, 0x1C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PAOS) + :input("local_port", 0x10, 23, 16, 1) + :execute() +end + +function HCA:get_port_status_finish () + port_status.admin_status = self:output(0x10, 11, 8) + port_status.oper_status = self:output(0x10, 3, 0) + return port_status end function HCA:get_port_loopback_capability () @@ -1506,15 +1573,17 @@ local port_stats = { txdrop = 0ULL, txerrors = 0ULL, } -function HCA:get_port_stats () +function HCA:get_port_stats_start () self:command("ACCESS_REGISTER", 0x14, 0x10C) :input("opcode", 0x00, 31, 16, 0x805) :input("opmod", 0x04, 15, 0, 1) -- read :input("register_id", 0x08, 15, 0, PPCNT) :input("local_port", 0x10, 23, 16, 1) :input("grp", 0x10, 5, 0, 0x1) -- RFC 2863 - :execute() - + :execute_async() +end + +function HCA:get_port_stats_finish () port_stats.rxbytes = self:output64(0x18 + 0x00) -- includes 4-byte CRC local in_ucast_packets = self:output64(0x18 + 0x08) port_stats.rxmcast = self:output64(0x18 + 0x48) @@ -1550,20 +1619,21 @@ local max_mailboxes = 1000 local data_per_mailbox = 0x200 -- Bytes of input/output data in a mailbox -- Create a command queue with dedicated/reusable DMA memory. -function HCA:new (init_seg) - local entry = ffi.cast("uint32_t*", memory.dma_alloc(0x40, 4096)) +function HCA:new () + local q = self.nextq + assert(q < self.size) + self.nextq = self.nextq + 1 + local inboxes, outboxes = {}, {} for i = 0, max_mailboxes-1 do -- XXX overpadding.. 0x240 alignment is not accepted? 
inboxes[i] = ffi.cast("uint32_t*", memory.dma_alloc(0x240, 4096)) outboxes[i] = ffi.cast("uint32_t*", memory.dma_alloc(0x240, 4096)) end - return setmetatable({entry = entry, + return setmetatable({entry = ffi.cast("uint32_t *", self.entries[q]), inboxes = inboxes, outboxes = outboxes, - init_seg = init_seg, - size = init_seg:log_cmdq_size(), - stride = init_seg:log_cmdq_stride()}, + q = q}, {__index = HCA}) end @@ -1746,7 +1816,12 @@ local command_errors = { [0x40] = 'BAD_SIZE: More outstanding CQEs in CQ than new CQ size', } -function HCA:execute () +function HCA:post () + self:setbits(0x3C, 0, 0, 1) + self.init_seg:ring_doorbell(self.q) +end + +function HCA:execute_async () local last_in_ofs = self.input_size local last_out_ofs = self.output_size if debug_hexdump then @@ -1761,38 +1836,49 @@ function HCA:execute () dumpoffset = hexdump(self.inboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) end end - assert(self:getbits(0x3C, 0, 0) == 1) - self.init_seg:ring_doorbell(0) --post command - - --poll for command completion - while self:getbits(0x3C, 0, 0) == 1 do + self:post() +end + +function HCA:completed () + if self:getbits(0x3C, 0, 0) == 0 then + if debug_hexdump then + local dumpoffset = 0 + print("command OUTPUT:") + dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) + local noutboxes = math.ceil((last_out_ofs + 4 - 16) / data_per_mailbox) + for i = 0, noutboxes-1 do + local blocknumber = getint(self.outboxes[i], 0x238, 31, 0) + local address = memory.virtual_to_physical(self.outboxes[i]) + print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") + dumpoffset = hexdump(self.outboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) + end + end + + local token = self:getbits(0x3C, 31, 24) + local signature = self:getbits(0x3C, 23, 16) + local status = self:getbits(0x3C, 7, 1) + + checkz(status) + self:checkstatus() + + return signature, token + else if self.init_seg:getbits(0x1010, 31, 24) ~= 0 then error("HCA health syndrome: " .. 
bit.tohex(self.init_seg:getbits(0x1010, 31, 24))) end - C.usleep(10000) + return nil, nil end +end - if debug_hexdump then - local dumpoffset = 0 - print("command OUTPUT:") - dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) - local noutboxes = math.ceil((last_out_ofs + 4 - 16) / data_per_mailbox) - for i = 0, noutboxes-1 do - local blocknumber = getint(self.outboxes[i], 0x238, 31, 0) - local address = memory.virtual_to_physical(self.outboxes[i]) - print("Block "..blocknumber.." @ "..bit.tohex(address, 12)..":") - dumpoffset = hexdump(self.outboxes[i], 0, ffi.sizeof(cmdq_mailbox_t), dumpoffset) - end +function HCA:execute () + self:execute_async() + local signature, token = self:completed() + --poll for command completion + while not signature do + C.usleep(10000) + signature, token = self:completed() end - - local token = self:getbits(0x3C, 31, 24) - local signature = self:getbits(0x3C, 23, 16) - local status = self:getbits(0x3C, 7, 1) - - checkz(status) - self:checkstatus() - return signature, token end From a94924561ca29b8aa84d0a4de1e12991045418b1 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 20 Jul 2018 14:45:39 +0200 Subject: [PATCH 039/209] connectx4.lua: use a factory to create HCAs A HCA is tied to an initialization segment (i.e. device). The previous commit mishandled the case when multiple devices are run in the same process. This is fixed by creating a factory for HCAs per device. 
--- src/apps/mellanox/connectx4.lua | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 2d7f2fd8de..99895d3b0b 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -206,8 +206,8 @@ function ConnectX4:new (conf) -- local mmio, fd = pci.map_pci_memory(pciaddress, 0, true) local init_seg = InitializationSegment:new(mmio) - HCA:init(init_seg) - local hca = HCA:new() + local hca_factory = HCA_factory(init_seg) + local hca = hca_factory:new() -- Makes enable_hca() hang with ConnectX5 -- init_seg:reset() @@ -394,7 +394,7 @@ function ConnectX4:new (conf) }, } for _, req in ipairs(self.stats_reqs) do - req.hca = HCA:new() + req.hca = hca_factory:new() -- Post command req.start_fn(req.hca) end @@ -505,9 +505,12 @@ end -- hca object is the main interface towards the NIC firmware. HCA = {} --- Allocate array of Command Queue Entries. Must be called prior to --- HCA:new() -function HCA:init (init_seg, cmdq_size) +-- Create a factory for HCAs for the given Initialization Segment +-- (i.e. device). Application of the new() method to the returned +-- object allocates a new HCA for the next available Command Queue +-- Entry. +function HCA_factory (init_seg, cmdq_size) + local self = {} self.size = 2^init_seg:log_cmdq_size() self.stride = 2^init_seg:log_cmdq_stride() self.init_seg = init_seg @@ -519,6 +522,7 @@ function HCA:init (init_seg, cmdq_size) local entries, entries_phy = memory.dma_alloc(cmdq_size * self.stride, 4096) self.entries = ffi.cast(cmdq_t, entries) init_seg:cmdq_phy_addr(entries_phy) + return setmetatable(self, { __index = HCA }) end --------------------------------------------------------------- @@ -1620,6 +1624,8 @@ local data_per_mailbox = 0x200 -- Bytes of input/output data in a mailbox -- Create a command queue with dedicated/reusable DMA memory. 
function HCA:new () + -- Must only be called from a factory created by HCA_factory() + assert(self ~= HCA) local q = self.nextq assert(q < self.size) self.nextq = self.nextq + 1 @@ -1634,7 +1640,7 @@ function HCA:new () inboxes = inboxes, outboxes = outboxes, q = q}, - {__index = HCA}) + {__index = self}) end -- Reset all data structures to zero values. From b4d9252d65c38135afb95f6a8f71d1e6505a9906 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 23 Jul 2018 12:36:34 +0200 Subject: [PATCH 040/209] connectx4.lua: fix bug with async commands when debug_hexdump is enabled --- src/apps/mellanox/connectx4.lua | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 99895d3b0b..2cd2345798 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -50,7 +50,7 @@ local band, bor, shl, shr, bswap, bnot = local cast, typeof = ffi.cast, ffi.typeof local debug_trace = false -- Print trace messages -local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) +local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format) -- Maximum size of a receive queue table. -- XXX This is hard-coded in the Linux mlx5 driver too. 
Could @@ -1828,13 +1828,11 @@ function HCA:post () end function HCA:execute_async () - local last_in_ofs = self.input_size - local last_out_ofs = self.output_size if debug_hexdump then local dumpoffset = 0 print("command INPUT:") dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) - local ninboxes = math.ceil((last_in_ofs + 4 - 16) / data_per_mailbox) + local ninboxes = math.ceil((self.input_size + 4 - 16) / data_per_mailbox) for i = 0, ninboxes-1 do local blocknumber = getint(self.inboxes[i], 0x238, 31, 0) local address = memory.virtual_to_physical(self.inboxes[i]) @@ -1852,7 +1850,7 @@ function HCA:completed () local dumpoffset = 0 print("command OUTPUT:") dumpoffset = hexdump(self.entry, 0, 0x40, dumpoffset) - local noutboxes = math.ceil((last_out_ofs + 4 - 16) / data_per_mailbox) + local noutboxes = math.ceil((self.output_size + 4 - 16) / data_per_mailbox) for i = 0, noutboxes-1 do local blocknumber = getint(self.outboxes[i], 0x238, 31, 0) local address = memory.virtual_to_physical(self.outboxes[i]) From e2eecff0455a4fc719f8646b2f601ca4cc4ee74a Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 23 Jul 2018 12:44:49 +0200 Subject: [PATCH 041/209] connectx4.lua: fix page sizes in CREATE_{C,R}Q Set the page size parameters for CQs and RQs so that the entire queue fits in a single page. The alternative would be to select a fixed page size and add as many PAS entries as necessary. It is unclear whether there is a difference between the methods (e.g. for perfomance). 
--- src/apps/mellanox/connectx4.lua | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 2cd2345798..4c9cf10728 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -916,7 +916,9 @@ end function HCA:create_cq (entries, uar_page, eqn, collapsed) local doorbell, doorbell_phy = memory.dma_alloc(16) -- Memory for completion queue entries - local cqe, cqe_phy = memory.dma_alloc(entries * 64, 4096) + local size = entries * 64 + local cqe, cqe_phy = memory.dma_alloc(size, 4096) + local log_page_size = log2size(math.ceil(size/4096)) ffi.fill(cqe, entries * 64, 0xFF) self:command("CREATE_CQ", 0x114, 0x0C) :input("opcode", 0x00, 31, 16, 0x400) @@ -925,7 +927,7 @@ function HCA:create_cq (entries, uar_page, eqn, collapsed) :input("log_cq_size", 0x10 + 0x0C, 28, 24, log2size(entries)) :input("uar_page", 0x10 + 0x0C, 23, 0, uar_page) :input("c_eqn", 0x10 + 0x14, 7, 0, eqn) - :input("log_page_size", 0x10 + 0x18, 28, 24, 4) + :input("log_page_size", 0x10 + 0x18, 28, 24, log_page_size) :input("db_addr high", 0x10 + 0x38, 31, 0, ptrbits(doorbell_phy, 63, 32)) :input("db_addr_low", 0x10 + 0x3C, 31, 0, ptrbits(doorbell_phy, 31, 0)) :input("pas[0] high", 0x110, 31, 0, ptrbits(cqe_phy, 63, 32)) @@ -941,6 +943,7 @@ function HCA:create_rq (cqn, pd, size, doorbell, rwq) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local rwq_phy = memory.virtual_to_physical(rwq) + local log_page_size = log2size(math.ceil(size * 64/4096)) self:command("CREATE_RQ", 0x20 + 0x30 + 0xC4, 0x0C) :input("opcode", 0x00, 31, 16, 0x908) :input("rlkey", 0x20 + 0x00, 31, 31, 1) @@ -951,7 +954,7 @@ function HCA:create_rq (cqn, pd, size, doorbell, rwq) :input("dbr_addr high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) :input("dbr_addr low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0)) :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, 4) - 
:input("page_size", 0x20 + 0x30 + 0x20, 12, 8, 4) -- XXX one big page? + :input("log_page_size", 0x20 + 0x30 + 0x20, 12, 8, log_page_size) :input("log_wq_size", 0x20 + 0x30 + 0x20, 4 , 0, log_wq_size) :input("pas[0] high", 0x20 + 0x30 + 0xC0, 63, 32, ptrbits(rwq_phy, 63, 32)) :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(rwq_phy, 31, 0)) From 77e548cab452d61ba962f545146440850512b13f Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 4 Nov 2019 17:50:00 +0100 Subject: [PATCH 042/209] Squashed 'lib/ljsyscall/' changes from d78235c50..e587f8c55 e587f8c55 Merge pull request #225 from vavrusa/master 5ea3a881e bpf: add missing constants for linux 4.10 - 4.15 2c691e5a7 Merge pull request #224 from wingo/lseek-syscall-tweak ae38bdbd7 Make "offset" arg to lseek a signed integer 5cb3b6950 Merge pull request #221 from wingo/util-ls-fix 8e0874609 Promptly close util.ls() dir fd; fix bug with deleted entries 277517436 Merge pull request #220 from sileht/master 8e48fd094 linux/nl.lua: Use ndmsg struct instead of rtmsg for neighbors 3e482bc4e Merge pull request #215 from qsn/gettid 57520cce3 expose gettid db1a88e94 Merge pull request #214 from jsitnicki/sof-flags-linux 270a6e611 Add missing type for struct scm_timestamping for Linux e49232047 Add missing SCM_* constants for Linux timestamping API 60fcc6b48 Add constants for SOF_* flags for Linux 26ac34851 Merge pull request #210 from alexandergall/linux-if-ioctls 3e6d3e27c Merge pull request #211 from fperrad/deb d425a22b2 dummy changelog b27eca538 update .gitignore 92292aa4b debian files be257a7e1 debian files generated by lua-create-gitbuildpackage-layout 50a02b94b Add some SIOC{G,S}IF* ioctls for Linux ee90324b0 Merge pull request #209 from justincormack/osx_clock ee17863e2 Add CLOCK_ constants for OSX 178d244a0 Merge pull request #208 from justincormack/holes 61450f5b2 Add SEEK_DATA and SEEK_HOLE constants 9a7b58438 Add memfd fnctl sealing support 99beaf522 more test fixes for memfd cc221e4ed fix ctest for 
new fcntl changes, typowq 56c4c7684 fix ctest for new fcntl changes 96073cc38 fix typo 7a73e8ad4 Add more constants for fcntl, memfd 8d3034cc6 Fix ppc64le syscall numbers for newer calls 62828f65a Merge pull request #206 from johnae/master c4002b640 The spook project is using ljsyscall. 0b266e8f4 Merge pull request #205 from lukego/close-fd-safely ad91aa902 Add more protection against fd double-close c8baf9e94 Merge pull request #202 from justincormack/dockerignore 66843c552 Use dockerignore to simplify Dockerfile 7b7211dc2 Merge pull request #200 from justincormack/redo-dockerfile a779caa7a Docker Cloud does not start processes at priority 0, remove from test b85382d38 Rework Dockerfile and tests 24f778901 Merge pull request #199 from vavrusa/master 2ecf48608 linux/constants: added new BPF map and prog types 00c194962 use Alpine 3.4 for docker build 0bcafc6d5 Merge pull request #196 from kbara/removecunused a4217a8f8 Merge pull request #195 from kbara/fixgetcpu ee6743017 Remove unused variables from c.lua 4b2e0b2d2 Fix getcpu: the cpu argument was incorrectly given the node variable previously 214550af0 update changelog and rockspec for 0.12 release 21f3fd81d Merge pull request #192 from vavrusa/master 0da437fcc linux/bpf_prog_load: support custom kernel builds 998119053 fix missing vhangup f2451148a Merge pull request #191 from vavrusa/linux-perf-open 93558c15d linux: added support for tracing/performance counters 0511fb800 Merge pull request #190 from vavrusa/master 1f141cab3 linux: added new constants (e.g. 
attach BPF to socket) 36274f3a8 linux/bpf: added strflag support 4fd3bd63b Merge pull request #189 from vavrusa/master aaa89cb93 linux: added support for eBPF e09529594 linux: added new syscall numbers (up to __nr_bpf) 1e079c45b test calls container so just needs file to run d92625e59 define a docker compose test 5f147111c Merge pull request #188 from aperezdc/fix-if-nameindex 8915a8376 Close socket immediately after error in if_nametoindex() 96c02869e use addons for travis, as learned at fosdem 71241e068 update changelog for unreleased changes 559b499e0 ignore audit arch constant 5d867d304 new architectures do not have open, will use openat 49d9ff9a6 test issues with new constants 7cd460ebd aarch64 audit constants for seccomp 6249e99cd more docker examples dd00af942 rockspec fix d20033b88 rockspec for new release fb172442f update copyright years 9d875971e more Changelog tweaks before release 53856b05e typo 188152611 sometimes winsz is 0, eg if terminal not set up ab0c08d69 add error message a1c207e71 update Changelog for forthcoming 0.11 release ce12fb209 addDocker hub to README 602a2b3b6 fix osx fstatat to use 32 bit stat type, as cannot find how to call to get 64 bit one 8a0a6ad75 Now have arm machines with working seccomp d14bd38ef Appears that setting maac address on lo often works 6e878a1ed remove debug print from test 30e9b5b3f allow skip on EPERM for adjtimex bff3e90c9 Add strace in Docker image for convenient debugging e67fa312f Use alpine 3.3 for Docker 5148bc3bc fix ipv6 tests 185c1a632 more failures with no ipv6 98cc9a289 more fixes for ipv4 only environments for netlink tests 248a935a8 fix bind errors in environments that do not support ipv6 0eacd6450 clean up travis file 5fb71b6cf switch to newer Ubuntu in travis 8235724df fix more constant checks not in headers 8dff4ef57 constants missing on travis db51b08f9 update Changelog 7540b04af add new rtnetlink values, so tests work under docker 22604b801 remove test that fails in some environments d43169389 
fix waitid test under docker 1595b7d71 fix swap test under docker 321fdd2e2 Now an alpine package available 984b533b4 Add Dockerfile, fix some of the tests that made unreasonable assumptions 9aeff8875 recent osx has *at functions 18cd82991 better handling for xattr errors b6bb892ee freebsd 11 now has utimensat 7065b0d20 on freebsd/zfs chflags will fail, skip git-subtree-dir: lib/ljsyscall git-subtree-split: e587f8c55aad3955dddab3a4fa6c1968037b5c6e --- .dockerignore | 12 + .gitignore | 9 + .travis.yml | 14 +- COPYRIGHT | 2 +- ChangeLog | 13 +- Dockerfile | 4 + README.md | 5 +- debian/changelog | 5 + debian/compat | 1 + debian/control | 25 ++ debian/copyright | 30 +++ debian/dh-lua.conf | 24 ++ debian/lua-ljsyscall-dev.docs | 2 + debian/patches/series | 0 debian/rules | 4 + debian/source/format | 1 + debian/tests/control | 3 + debian/tests/dh-lua-tests | 1 + debian/watch | 6 + docker-compose.test.yml | 5 + rockspec/ljsyscall-0.11-1.rockspec | 170 ++++++++++++++ rockspec/ljsyscall-0.12-1.rockspec | 170 ++++++++++++++ syscall/bsd/ffi.lua | 1 - syscall/bsd/syscalls.lua | 1 + syscall/freebsd/constants.lua | 8 + syscall/freebsd/ffi.lua | 1 + syscall/linux/arm/nr.lua | 9 + syscall/linux/c.lua | 29 ++- syscall/linux/constants.lua | 316 +++++++++++++++++++++++++- syscall/linux/fcntl.lua | 2 + syscall/linux/ffi.lua | 136 +++++++++++ syscall/linux/ioctl.lua | 16 ++ syscall/linux/nl.lua | 67 +++++- syscall/linux/ppc/nr.lua | 9 + syscall/linux/ppc64le/nr.lua | 13 ++ syscall/linux/syscalls.lua | 141 ++++++++++++ syscall/linux/types.lua | 32 +++ syscall/linux/util.lua | 5 +- syscall/linux/x64/nr.lua | 1 + syscall/linux/x86/nr.lua | 1 + syscall/methods.lua | 83 +++++++ syscall/netbsd/ffifunctions.lua | 2 + syscall/openbsd/ffi.lua | 1 + syscall/osx/c.lua | 3 +- syscall/osx/constants.lua | 24 +- syscall/osx/ffi.lua | 25 +- syscall/osx/syscalls.lua | 8 + syscall/osx/types.lua | 3 + syscall/syscalls.lua | 10 +- syscall/types.lua | 9 +- syscall/util.lua | 25 +- test/bsd.lua | 24 +- 
test/ctest-linux.lua | 98 ++++++++ test/freebsd.lua | 2 +- test/helpers.lua | 81 +++++++ test/linux-constants.lua | 64 ++++++ test/linux-structures.lua | 8 +- test/linux.lua | 188 +++++++++++++-- {include/luaunit => test}/luaunit.lua | 0 test/netbsd.lua | 2 +- test/openbsd.lua | 2 +- test/osx.lua | 2 +- test/rump.lua | 2 +- {include/strict => test}/strict.lua | 0 test/test.lua | 69 +++--- 65 files changed, 1903 insertions(+), 126 deletions(-) create mode 100644 .dockerignore create mode 100644 Dockerfile create mode 100644 debian/changelog create mode 100644 debian/compat create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/dh-lua.conf create mode 100644 debian/lua-ljsyscall-dev.docs create mode 100644 debian/patches/series create mode 100755 debian/rules create mode 100644 debian/source/format create mode 100644 debian/tests/control create mode 100644 debian/tests/dh-lua-tests create mode 100644 debian/watch create mode 100644 docker-compose.test.yml create mode 100644 rockspec/ljsyscall-0.11-1.rockspec create mode 100644 rockspec/ljsyscall-0.12-1.rockspec create mode 100644 test/helpers.lua rename {include/luaunit => test}/luaunit.lua (100%) rename {include/strict => test}/strict.lua (100%) diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000..fd1d8943f6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,12 @@ +.* +*.md +COPYRIGHT +ChangeLog +Dockerfile +INSTALL +doc +*.yml +examples +include +rockspec +test diff --git a/.gitignore b/.gitignore index ea3ef8819f..a6e4c5785e 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,12 @@ tmp/* *.core ktrace.out obj/* + +/5.1-ljsyscall +/debian/debhelper-build-stamp +/debian/files +/debian/lua-ljsyscall* +/debian/lua_versions +/debian/tmp +/debian/trash + diff --git a/.travis.yml b/.travis.yml index 6c90dc6e7c..e474901663 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,15 @@ language: c +sudo: required +dist: trusty + +addons: + apt: + packages: + - 
luajit + - luarocks + - strace before_install: - - sudo add-apt-repository ppa:mwild1/ppa -y - - sudo apt-get update -y - - sudo apt-get install luajit -y --force-yes - - sudo apt-get install luarocks -y - - sudo apt-get install strace -y - git submodule update --init --recursive env: diff --git a/COPYRIGHT b/COPYRIGHT index 9f187c3513..2f9256587c 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -6,7 +6,7 @@ Files under the include directory include their own copyright information. ljsyscall: System call interface for LuaJIT -Copyright (C) 2011-2014 Justin Cormack. All rights reserved. +Copyright (C) 2011-2016 Justin Cormack. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/ChangeLog b/ChangeLog index 0442771dfe..c56e549bb8 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,10 +1,19 @@ - unreleased + 0.12 release + ++ Fix seccomp on arm64 ++ Linux added support for eBPF ++ bug fixes + + 0.11 release + + OSX time functions + OSX Mach types ++ OSX fixes for Yosemite + arm64 support -+ OpenBSD 5.6 and 5.7 support ++ OpenBSD 5.6, 5.7 and 5.8 support + ppc64le support, by Gustavo Serra Scalet + mipsel support ++ added Dockerfile, now available on Docker Hub 0.10 release diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..50bfcdfd9b --- /dev/null +++ b/Dockerfile @@ -0,0 +1,4 @@ +FROM alpine:3.4 +RUN apk update && apk add luajit luajit-dev strace && mkdir -p /usr/share/lua/5.1 +COPY . /usr/share/lua/5.1/ +ENTRYPOINT ["luajit"] diff --git a/README.md b/README.md index a96203a40d..bcb016820e 100644 --- a/README.md +++ b/README.md @@ -16,13 +16,15 @@ The [video of my FOSDEM 2013 talk](http://www.myriabit.com/ljsyscall/) here, and ## Install +A Docker hub automated build (currently only for Linux) is available via `docker pull justincormack/ljsyscall`. 
You can run the test suite with `docker run justincormack/ljsyscall test/test.lua`, use in a scripted way eg `docker run justincormack/ljsyscall -e "print(require('syscall').nl.interfaces())"` or get an interactive session with `docker -it run justincormack/ljsyscall`. + The stable release is now available in the luarocks repository, so you should be able to run ```luarocks install ljsyscall```. There will be a ```ljsyscall-rump``` rock soon, but I need to fix the install for the rump libraries. For simple uses, you just need to put the ```.lua``` files somewhere that LuaJIT will find them, eg typically in ```/usr/local/share/lua/5.1/```. Keep the directory structure there is. You can safely remove files from architectures and operating systems you do not use. You can also install the head version using luarocks: ```luarocks install rockspec/ljsyscall-scm-1.rockspec``` . -It is also available as a package in [buildroot](http://buildroot.uclibc.org/), a build system for embedded systems, and in [pkgsrc](http://www.pkgsrc.org] the portable packaging system for many systems. +It is also available as a package in [buildroot](http://buildroot.uclibc.org/), a build system for embedded systems, and in [pkgsrc](http://www.pkgsrc.org] the portable packaging system for many systems. It is now packaged for [Alpine Linux](http://www.alpinelinux.org/), in the testing repository. If you are using Lua rather than LuaJIT you need to install [luaffi](https://github.com/jmckaskill/luaffi) first; this is largely working now, but there will be more support for standard Lua coming soon. @@ -66,6 +68,7 @@ This project is being used in a variety of places, such as for testing the Linux * [buildroot](http://buildroot.uclibc.org/) has an ljsyscall package. * [luatz](https://github.com/daurnimator/luatz) uses ljsyscall when available * [Snabb switch](https://github.com/SnabbCo/snabbswitch) a high performance networking toolkit. 
+* [Spook](https://github.com/johnae/spook) started out as an fs events based test runner similar to Rubys guard but grew into an event toolkit of sorts. ## Testing diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000000..f44a5b608a --- /dev/null +++ b/debian/changelog @@ -0,0 +1,5 @@ +lua-ljsyscall (0.12-1) unstable; urgency=medium + + * UNRELEASED + + -- John Doe Sun, 23 Jul 2017 19:43:15 +0200 diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000000..ec635144f6 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +9 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000000..154d215980 --- /dev/null +++ b/debian/control @@ -0,0 +1,25 @@ +Source: lua-ljsyscall +Section: interpreters +Priority: optional +Maintainer: nobody +Build-Depends: debhelper (>= 9), dh-lua +Standards-Version: 4.0.0 +Homepage: http://www.myriabit.com/ljsyscall/ + +Package: lua-ljsyscall +Architecture: all +Pre-Depends: ${misc:Pre-Depends} +Depends: luajit, ${misc:Depends} +Provides: ${lua:Provides} +XB-Lua-Versions: ${lua:Versions} +Description: LuaJIT Linux syscall FFI + +Package: lua-ljsyscall-dev +Section: libdevel +Architecture: all +Pre-Depends: ${misc:Pre-Depends} +Depends: ${misc:Depends} +Provides: ${lua:Provides} +XB-Lua-Versions: ${lua:Versions} +Description: ljsyscall doc + This package contains the documentation of the ljsyscall library. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000000..679922b416 --- /dev/null +++ b/debian/copyright @@ -0,0 +1,30 @@ +Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: ljsyscall +Source: https://github.com/justincormack/ljsyscall + +Files: * +Copyright: Copyright (C) 2011-2016 Justin Cormack. All rights reserved. +License: Expat + +Files: */doc +Copyright: Copyright (C) 2011-2016 Justin Cormack. All rights reserved. 
+License: CC0 + +License: Expat + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. 
diff --git a/debian/dh-lua.conf b/debian/dh-lua.conf new file mode 100644 index 0000000000..19d2316714 --- /dev/null +++ b/debian/dh-lua.conf @@ -0,0 +1,24 @@ +### mandatory fields +LUA_VERSION=5.1 +PKG_NAME=ljsyscall + +### things relative to the C library part +CLIB_CFLAGS= +CLIB_LDFLAGS= +CLIB_LDFLAGS_STATIC= +CLIB_OBJS= +LUA_MODNAME_CPART= + +### things relative to the lua library part +LUA_HEADER= +LUA_SOURCES=syscall.lua syscall/*.lua syscall/shared/*.lua syscall/linux/*.lua syscall/linux/*/*.lua +LUA_SOURCES_MANGLER= +LUA_MODNAME=syscall +LUA_TEST= + +### this part is relative to pkg-config +PKG_VERSION= +PKG_LIBS_PRIVATE= +PKG_URL= +PKG_REQUIRES= +PKG_CONFLICTS= diff --git a/debian/lua-ljsyscall-dev.docs b/debian/lua-ljsyscall-dev.docs new file mode 100644 index 0000000000..ea60385cf8 --- /dev/null +++ b/debian/lua-ljsyscall-dev.docs @@ -0,0 +1,2 @@ +doc +test diff --git a/debian/patches/series b/debian/patches/series new file mode 100644 index 0000000000..e69de29bb2 diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000000..4f36696ce5 --- /dev/null +++ b/debian/rules @@ -0,0 +1,4 @@ +#!/usr/bin/make -f + +%: + dh $@ --buildsystem=lua --with lua diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000000..163aaf8d82 --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/debian/tests/control b/debian/tests/control new file mode 100644 index 0000000000..cdb0fa9909 --- /dev/null +++ b/debian/tests/control @@ -0,0 +1,3 @@ +Tests: dh-lua-tests +Restrictions: rw-build-tree +Depends: @, dh-lua diff --git a/debian/tests/dh-lua-tests b/debian/tests/dh-lua-tests new file mode 100644 index 0000000000..738a2eb7ce --- /dev/null +++ b/debian/tests/dh-lua-tests @@ -0,0 +1 @@ +debian/rules autopkgtest diff --git a/debian/watch b/debian/watch new file mode 100644 index 0000000000..39da9e737d --- /dev/null +++ b/debian/watch @@ -0,0 +1,6 @@ +# test this watch file using: +# uscan --watchfile 
debian/watch --upstream-version 0.1 --package lua-ljsyscall +# +version=3 +opts=filenamemangle=s/.+\/v?(\d\S*)\.tar\.gz/ljsyscall-$1\.tar\.gz/ \ + https://github.com/justincormack/ljsyscall/releases .*/v?(\d\S*)\.tar\.gz diff --git a/docker-compose.test.yml b/docker-compose.test.yml new file mode 100644 index 0000000000..300cd4b981 --- /dev/null +++ b/docker-compose.test.yml @@ -0,0 +1,5 @@ +sut: + build: . + command: /test/test.lua + volumes: + - ./test:/test diff --git a/rockspec/ljsyscall-0.11-1.rockspec b/rockspec/ljsyscall-0.11-1.rockspec new file mode 100644 index 0000000000..2b66396d20 --- /dev/null +++ b/rockspec/ljsyscall-0.11-1.rockspec @@ -0,0 +1,170 @@ +package = "ljsyscall" +version = "0.11-1" +source = +{ + url = "https://github.com/justincormack/ljsyscall/archive/v0.11.tar.gz"; + dir = "ljsyscall-0.11"; +} + +description = +{ + summary = "LuaJIT Linux syscall FFI"; + homepage = "http://www.myriabit.com/ljsyscall/"; + license = "MIT"; +} +dependencies = +{ + "lua == 5.1"; -- In fact this should be "luajit >= 2.0.0" +} +build = +{ + type = "builtin"; + modules = + { + ["syscall"] = "syscall.lua"; + ["syscall.abi"] = "syscall/abi.lua"; + ["syscall.helpers"] = "syscall/helpers.lua"; + ["syscall.syscalls"] = "syscall/syscalls.lua"; + ["syscall.libc"] = "syscall/libc.lua"; + ["syscall.methods"] = "syscall/methods.lua"; + ["syscall.ffitypes"] = "syscall/ffitypes.lua"; + ["syscall.util"] = "syscall/util.lua"; + ["syscall.compat"] = "syscall/compat.lua"; + ["syscall.bit"] = "syscall/bit.lua"; + ["syscall.types"] = "syscall/types.lua"; + ["syscall.lfs"] = "syscall/lfs.lua"; + + ["syscall.shared.types"] = "syscall/shared/types.lua"; + }; + platforms = + { + linux = + { + modules = { + ["syscall.linux.syscalls"] = "syscall/linux/syscalls.lua"; + ["syscall.linux.c"] = "syscall/linux/c.lua"; + ["syscall.linux.constants"] = "syscall/linux/constants.lua"; + ["syscall.linux.ffi"] = "syscall/linux/ffi.lua"; + ["syscall.linux.ioctl"] = "syscall/linux/ioctl.lua"; + 
["syscall.linux.types"] = "syscall/linux/types.lua"; + ["syscall.linux.fcntl"] = "syscall/linux/fcntl.lua"; + ["syscall.linux.errors"] = "syscall/linux/errors.lua"; + ["syscall.linux.util"] = "syscall/linux/util.lua"; + ["syscall.linux.nr"] = "syscall/linux/nr.lua"; + + ["syscall.linux.nl"] = "syscall/linux/nl.lua"; + ["syscall.linux.netfilter"] = "syscall/linux/netfilter.lua"; + ["syscall.linux.sockopt"] = "syscall/linux/sockopt.lua"; + ["syscall.linux.cgroup"] = "syscall/linux/cgroup.lua"; + + ["syscall.linux.arm.constants"] = "syscall/linux/arm/constants.lua"; + ["syscall.linux.arm.ffi"] = "syscall/linux/arm/ffi.lua"; + ["syscall.linux.arm.ioctl"] = "syscall/linux/arm/ioctl.lua"; + ["syscall.linux.arm.nr"] = "syscall/linux/arm/nr.lua"; + ["syscall.linux.arm64.constants"] = "syscall/linux/arm64/constants.lua"; + ["syscall.linux.arm64.ffi"] = "syscall/linux/arm64/ffi.lua"; + ["syscall.linux.arm64.ioctl"] = "syscall/linux/arm64/ioctl.lua"; + ["syscall.linux.arm64.nr"] = "syscall/linux/arm64/nr.lua"; + ["syscall.linux.mips.constants"] = "syscall/linux/mips/constants.lua"; + ["syscall.linux.mips.ffi"] = "syscall/linux/mips/ffi.lua"; + ["syscall.linux.mips.ioctl"] = "syscall/linux/mips/ioctl.lua"; + ["syscall.linux.mips.nr"] = "syscall/linux/mips/nr.lua"; + ["syscall.linux.ppc.constants"] = "syscall/linux/ppc/constants.lua"; + ["syscall.linux.ppc.ffi"] = "syscall/linux/ppc/ffi.lua"; + ["syscall.linux.ppc.ioctl"] = "syscall/linux/ppc/ioctl.lua"; + ["syscall.linux.ppc.nr"] = "syscall/linux/ppc/nr.lua"; + ["syscall.linux.ppc64le.constants"] = "syscall/linux/ppc64le/constants.lua"; + ["syscall.linux.ppc64le.ffi"] = "syscall/linux/ppc64le/ffi.lua"; + ["syscall.linux.ppc64le.ioctl"] = "syscall/linux/ppc64le/ioctl.lua"; + ["syscall.linux.ppc64le.nr"] = "syscall/linux/ppc64le/nr.lua"; + ["syscall.linux.x64.constants"] = "syscall/linux/x64/constants.lua"; + ["syscall.linux.x64.ffi"] = "syscall/linux/x64/ffi.lua"; + ["syscall.linux.x64.ioctl"] = "syscall/linux/x64/ioctl.lua"; + 
["syscall.linux.x64.nr"] = "syscall/linux/x64/nr.lua"; + ["syscall.linux.x86.constants"] = "syscall/linux/x86/constants.lua"; + ["syscall.linux.x86.ffi"] = "syscall/linux/x86/ffi.lua"; + ["syscall.linux.x86.ioctl"] = "syscall/linux/x86/ioctl.lua"; + ["syscall.linux.x86.nr"] = "syscall/linux/x86/nr.lua"; + } + }; + macosx = + { + modules = + { + ["syscall.osx.syscalls"] = "syscall/osx/syscalls.lua"; + ["syscall.osx.c"] = "syscall/osx/c.lua"; + ["syscall.osx.constants"] = "syscall/osx/constants.lua"; + ["syscall.osx.ffi"] = "syscall/osx/ffi.lua"; + ["syscall.osx.ioctl"] = "syscall/osx/ioctl.lua"; + ["syscall.osx.types"] = "syscall/osx/types.lua"; + ["syscall.osx.fcntl"] = "syscall/osx/fcntl.lua"; + ["syscall.osx.errors"] = "syscall/osx/errors.lua"; + ["syscall.osx.util"] = "syscall/osx/util.lua"; + ["syscall.osx.sysctl"] = "syscall/osx/sysctl.lua"; + } + }; + freebsd = + { + modules = + { + ["syscall.freebsd.syscalls"] = "syscall/freebsd/syscalls.lua"; + ["syscall.freebsd.c"] = "syscall/freebsd/c.lua"; + ["syscall.freebsd.constants"] = "syscall/freebsd/constants.lua"; + ["syscall.freebsd.ffi"] = "syscall/freebsd/ffi.lua"; + ["syscall.freebsd.ioctl"] = "syscall/freebsd/ioctl.lua"; + ["syscall.freebsd.types"] = "syscall/freebsd/types.lua"; + ["syscall.freebsd.fcntl"] = "syscall/freebsd/fcntl.lua"; + ["syscall.freebsd.errors"] = "syscall/freebsd/errors.lua"; + ["syscall.freebsd.util"] = "syscall/freebsd/util.lua"; + ["syscall.freebsd.version"] = "syscall/freebsd/version.lua"; + ["syscall.freebsd.sysctl"] = "syscall/freebsd/sysctl.lua"; + } + }; + netbsd = + { + modules = + { + ["syscall.netbsd.syscalls"] = "syscall/netbsd/syscalls.lua"; + ["syscall.netbsd.c"] = "syscall/netbsd/c.lua"; + ["syscall.netbsd.constants"] = "syscall/netbsd/constants.lua"; + ["syscall.netbsd.ffitypes"] = "syscall/netbsd/ffitypes.lua"; + ["syscall.netbsd.ffifunctions"] = "syscall/netbsd/ffifunctions.lua"; + ["syscall.netbsd.ioctl"] = "syscall/netbsd/ioctl.lua"; + ["syscall.netbsd.types"] = 
"syscall/netbsd/types.lua"; + ["syscall.netbsd.fcntl"] = "syscall/netbsd/fcntl.lua"; + ["syscall.netbsd.errors"] = "syscall/netbsd/errors.lua"; + ["syscall.netbsd.util"] = "syscall/netbsd/util.lua"; + ["syscall.netbsd.nr"] = "syscall/netbsd/nr.lua"; + ["syscall.netbsd.init"] = "syscall/netbsd/init.lua"; + ["syscall.netbsd.version"] = "syscall/netbsd/version.lua"; + ["syscall.netbsd.sysctl"] = "syscall/netbsd/sysctl.lua"; + } + }; + openbsd = + { + modules = + { + ["syscall.openbsd.syscalls"] = "syscall/openbsd/syscalls.lua"; + ["syscall.openbsd.c"] = "syscall/openbsd/c.lua"; + ["syscall.openbsd.constants"] = "syscall/openbsd/constants.lua"; + ["syscall.openbsd.ffi"] = "syscall/openbsd/ffi.lua"; + ["syscall.openbsd.ioctl"] = "syscall/openbsd/ioctl.lua"; + ["syscall.openbsd.types"] = "syscall/openbsd/types.lua"; + ["syscall.openbsd.fcntl"] = "syscall/openbsd/fcntl.lua"; + ["syscall.openbsd.errors"] = "syscall/openbsd/errors.lua"; + ["syscall.openbsd.util"] = "syscall/openbsd/util.lua"; + ["syscall.openbsd.version"] = "syscall/openbsd/version.lua"; + ["syscall.openbsd.sysctl"] = "syscall/openbsd/sysctl.lua"; + } + }; + bsd = + { + modules = + { + ["syscall.bsd.syscalls"] = "syscall/bsd/syscalls.lua"; + ["syscall.bsd.ffi"] = "syscall/bsd/ffi.lua"; + ["syscall.bsd.types"] = "syscall/bsd/types.lua"; + } + }; + } +} diff --git a/rockspec/ljsyscall-0.12-1.rockspec b/rockspec/ljsyscall-0.12-1.rockspec new file mode 100644 index 0000000000..f614e71605 --- /dev/null +++ b/rockspec/ljsyscall-0.12-1.rockspec @@ -0,0 +1,170 @@ +package = "ljsyscall" +version = "0.12-1" +source = +{ + url = "https://github.com/justincormack/ljsyscall/archive/v0.12.tar.gz"; + dir = "ljsyscall-0.12"; +} + +description = +{ + summary = "LuaJIT Linux syscall FFI"; + homepage = "http://www.myriabit.com/ljsyscall/"; + license = "MIT"; +} +dependencies = +{ + "lua == 5.1"; -- In fact this should be "luajit >= 2.0.0" +} +build = +{ + type = "builtin"; + modules = + { + ["syscall"] = "syscall.lua"; + 
["syscall.abi"] = "syscall/abi.lua"; + ["syscall.helpers"] = "syscall/helpers.lua"; + ["syscall.syscalls"] = "syscall/syscalls.lua"; + ["syscall.libc"] = "syscall/libc.lua"; + ["syscall.methods"] = "syscall/methods.lua"; + ["syscall.ffitypes"] = "syscall/ffitypes.lua"; + ["syscall.util"] = "syscall/util.lua"; + ["syscall.compat"] = "syscall/compat.lua"; + ["syscall.bit"] = "syscall/bit.lua"; + ["syscall.types"] = "syscall/types.lua"; + ["syscall.lfs"] = "syscall/lfs.lua"; + + ["syscall.shared.types"] = "syscall/shared/types.lua"; + }; + platforms = + { + linux = + { + modules = { + ["syscall.linux.syscalls"] = "syscall/linux/syscalls.lua"; + ["syscall.linux.c"] = "syscall/linux/c.lua"; + ["syscall.linux.constants"] = "syscall/linux/constants.lua"; + ["syscall.linux.ffi"] = "syscall/linux/ffi.lua"; + ["syscall.linux.ioctl"] = "syscall/linux/ioctl.lua"; + ["syscall.linux.types"] = "syscall/linux/types.lua"; + ["syscall.linux.fcntl"] = "syscall/linux/fcntl.lua"; + ["syscall.linux.errors"] = "syscall/linux/errors.lua"; + ["syscall.linux.util"] = "syscall/linux/util.lua"; + ["syscall.linux.nr"] = "syscall/linux/nr.lua"; + + ["syscall.linux.nl"] = "syscall/linux/nl.lua"; + ["syscall.linux.netfilter"] = "syscall/linux/netfilter.lua"; + ["syscall.linux.sockopt"] = "syscall/linux/sockopt.lua"; + ["syscall.linux.cgroup"] = "syscall/linux/cgroup.lua"; + + ["syscall.linux.arm.constants"] = "syscall/linux/arm/constants.lua"; + ["syscall.linux.arm.ffi"] = "syscall/linux/arm/ffi.lua"; + ["syscall.linux.arm.ioctl"] = "syscall/linux/arm/ioctl.lua"; + ["syscall.linux.arm.nr"] = "syscall/linux/arm/nr.lua"; + ["syscall.linux.arm64.constants"] = "syscall/linux/arm64/constants.lua"; + ["syscall.linux.arm64.ffi"] = "syscall/linux/arm64/ffi.lua"; + ["syscall.linux.arm64.ioctl"] = "syscall/linux/arm64/ioctl.lua"; + ["syscall.linux.arm64.nr"] = "syscall/linux/arm64/nr.lua"; + ["syscall.linux.mips.constants"] = "syscall/linux/mips/constants.lua"; + ["syscall.linux.mips.ffi"] = 
"syscall/linux/mips/ffi.lua"; + ["syscall.linux.mips.ioctl"] = "syscall/linux/mips/ioctl.lua"; + ["syscall.linux.mips.nr"] = "syscall/linux/mips/nr.lua"; + ["syscall.linux.ppc.constants"] = "syscall/linux/ppc/constants.lua"; + ["syscall.linux.ppc.ffi"] = "syscall/linux/ppc/ffi.lua"; + ["syscall.linux.ppc.ioctl"] = "syscall/linux/ppc/ioctl.lua"; + ["syscall.linux.ppc.nr"] = "syscall/linux/ppc/nr.lua"; + ["syscall.linux.ppc64le.constants"] = "syscall/linux/ppc64le/constants.lua"; + ["syscall.linux.ppc64le.ffi"] = "syscall/linux/ppc64le/ffi.lua"; + ["syscall.linux.ppc64le.ioctl"] = "syscall/linux/ppc64le/ioctl.lua"; + ["syscall.linux.ppc64le.nr"] = "syscall/linux/ppc64le/nr.lua"; + ["syscall.linux.x64.constants"] = "syscall/linux/x64/constants.lua"; + ["syscall.linux.x64.ffi"] = "syscall/linux/x64/ffi.lua"; + ["syscall.linux.x64.ioctl"] = "syscall/linux/x64/ioctl.lua"; + ["syscall.linux.x64.nr"] = "syscall/linux/x64/nr.lua"; + ["syscall.linux.x86.constants"] = "syscall/linux/x86/constants.lua"; + ["syscall.linux.x86.ffi"] = "syscall/linux/x86/ffi.lua"; + ["syscall.linux.x86.ioctl"] = "syscall/linux/x86/ioctl.lua"; + ["syscall.linux.x86.nr"] = "syscall/linux/x86/nr.lua"; + } + }; + macosx = + { + modules = + { + ["syscall.osx.syscalls"] = "syscall/osx/syscalls.lua"; + ["syscall.osx.c"] = "syscall/osx/c.lua"; + ["syscall.osx.constants"] = "syscall/osx/constants.lua"; + ["syscall.osx.ffi"] = "syscall/osx/ffi.lua"; + ["syscall.osx.ioctl"] = "syscall/osx/ioctl.lua"; + ["syscall.osx.types"] = "syscall/osx/types.lua"; + ["syscall.osx.fcntl"] = "syscall/osx/fcntl.lua"; + ["syscall.osx.errors"] = "syscall/osx/errors.lua"; + ["syscall.osx.util"] = "syscall/osx/util.lua"; + ["syscall.osx.sysctl"] = "syscall/osx/sysctl.lua"; + } + }; + freebsd = + { + modules = + { + ["syscall.freebsd.syscalls"] = "syscall/freebsd/syscalls.lua"; + ["syscall.freebsd.c"] = "syscall/freebsd/c.lua"; + ["syscall.freebsd.constants"] = "syscall/freebsd/constants.lua"; + ["syscall.freebsd.ffi"] = 
"syscall/freebsd/ffi.lua"; + ["syscall.freebsd.ioctl"] = "syscall/freebsd/ioctl.lua"; + ["syscall.freebsd.types"] = "syscall/freebsd/types.lua"; + ["syscall.freebsd.fcntl"] = "syscall/freebsd/fcntl.lua"; + ["syscall.freebsd.errors"] = "syscall/freebsd/errors.lua"; + ["syscall.freebsd.util"] = "syscall/freebsd/util.lua"; + ["syscall.freebsd.version"] = "syscall/freebsd/version.lua"; + ["syscall.freebsd.sysctl"] = "syscall/freebsd/sysctl.lua"; + } + }; + netbsd = + { + modules = + { + ["syscall.netbsd.syscalls"] = "syscall/netbsd/syscalls.lua"; + ["syscall.netbsd.c"] = "syscall/netbsd/c.lua"; + ["syscall.netbsd.constants"] = "syscall/netbsd/constants.lua"; + ["syscall.netbsd.ffitypes"] = "syscall/netbsd/ffitypes.lua"; + ["syscall.netbsd.ffifunctions"] = "syscall/netbsd/ffifunctions.lua"; + ["syscall.netbsd.ioctl"] = "syscall/netbsd/ioctl.lua"; + ["syscall.netbsd.types"] = "syscall/netbsd/types.lua"; + ["syscall.netbsd.fcntl"] = "syscall/netbsd/fcntl.lua"; + ["syscall.netbsd.errors"] = "syscall/netbsd/errors.lua"; + ["syscall.netbsd.util"] = "syscall/netbsd/util.lua"; + ["syscall.netbsd.nr"] = "syscall/netbsd/nr.lua"; + ["syscall.netbsd.init"] = "syscall/netbsd/init.lua"; + ["syscall.netbsd.version"] = "syscall/netbsd/version.lua"; + ["syscall.netbsd.sysctl"] = "syscall/netbsd/sysctl.lua"; + } + }; + openbsd = + { + modules = + { + ["syscall.openbsd.syscalls"] = "syscall/openbsd/syscalls.lua"; + ["syscall.openbsd.c"] = "syscall/openbsd/c.lua"; + ["syscall.openbsd.constants"] = "syscall/openbsd/constants.lua"; + ["syscall.openbsd.ffi"] = "syscall/openbsd/ffi.lua"; + ["syscall.openbsd.ioctl"] = "syscall/openbsd/ioctl.lua"; + ["syscall.openbsd.types"] = "syscall/openbsd/types.lua"; + ["syscall.openbsd.fcntl"] = "syscall/openbsd/fcntl.lua"; + ["syscall.openbsd.errors"] = "syscall/openbsd/errors.lua"; + ["syscall.openbsd.util"] = "syscall/openbsd/util.lua"; + ["syscall.openbsd.version"] = "syscall/openbsd/version.lua"; + ["syscall.openbsd.sysctl"] = 
"syscall/openbsd/sysctl.lua"; + } + }; + bsd = + { + modules = + { + ["syscall.bsd.syscalls"] = "syscall/bsd/syscalls.lua"; + ["syscall.bsd.ffi"] = "syscall/bsd/ffi.lua"; + ["syscall.bsd.types"] = "syscall/bsd/types.lua"; + } + }; + } +} diff --git a/syscall/bsd/ffi.lua b/syscall/bsd/ffi.lua index a09b59816f..a0fed50a4f 100644 --- a/syscall/bsd/ffi.lua +++ b/syscall/bsd/ffi.lua @@ -147,7 +147,6 @@ int mkfifoat(int dirfd, const char *pathname, mode_t mode); int fchmodat(int dirfd, const char *pathname, mode_t mode, int flags); int readlinkat(int dirfd, const char *pathname, char *buf, size_t bufsiz); int faccessat(int dirfd, const char *pathname, int mode, int flags); -int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); int futimens(int fd, const struct timespec times[2]); int utimensat(int dirfd, const char *pathname, const struct timespec times[2], int flags); diff --git a/syscall/bsd/syscalls.lua b/syscall/bsd/syscalls.lua index eb98ed8065..f4fad72288 100644 --- a/syscall/bsd/syscalls.lua +++ b/syscall/bsd/syscalls.lua @@ -31,6 +31,7 @@ if C.getdirentries then basep = basep or t.long1() local ret, err = C.getdirentries(getfd(fd), buf, size, basep) if ret == -1 then return nil, t.error(err or errno()) end + if ret == 0 then return nil, nil end return t.dirents(buf, ret) end end diff --git a/syscall/freebsd/constants.lua b/syscall/freebsd/constants.lua index 305a8cb151..b1a703da03 100644 --- a/syscall/freebsd/constants.lua +++ b/syscall/freebsd/constants.lua @@ -1335,5 +1335,13 @@ c.CAP_RIGHTS_VERSION = 0 -- we do not understand others end -- freebsd >= 10 +if version >= 11 then +-- for utimensat +c.UTIME = strflag { + NOW = -1, + OMIT = -2, +} +end + return c diff --git a/syscall/freebsd/ffi.lua b/syscall/freebsd/ffi.lua index 872237e771..fecc9f509b 100644 --- a/syscall/freebsd/ffi.lua +++ b/syscall/freebsd/ffi.lua @@ -297,6 +297,7 @@ int cap_ioctls_limit(int fd, const unsigned long *cmds, size_t ncmds); ssize_t cap_ioctls_get(int fd, 
unsigned long *cmds, size_t maxcmds); int cap_fcntls_limit(int fd, uint32_t fcntlrights); int cap_fcntls_get(int fd, uint32_t *fcntlrightsp); +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); int __sys_utimes(const char *filename, const struct timeval times[2]); int __sys_futimes(int, const struct timeval times[2]); diff --git a/syscall/linux/arm/nr.lua b/syscall/linux/arm/nr.lua index 058c8158d1..7fdb66d58a 100644 --- a/syscall/linux/arm/nr.lua +++ b/syscall/linux/arm/nr.lua @@ -338,6 +338,15 @@ local nr = { setns = 375, process_vm_readv = 376, process_vm_writev= 377, + kcmp = 378, + finit_module = 379, + sched_setattr = 380, + sched_getattr = 381, + renameat2 = 382, + seccomp = 383, + getrandom = 384, + memfd_create = 385, + bpf = 386, } } diff --git a/syscall/linux/c.lua b/syscall/linux/c.lua index f914df60e4..18eca30982 100644 --- a/syscall/linux/c.lua +++ b/syscall/linux/c.lua @@ -7,12 +7,8 @@ Note a fair number are being deprecated, see include/uapi/asm-generic/unistd.h u Some of these we already don't use, but some we do, eg use open not openat etc. 
]] -local require, error, assert, tonumber, tostring, -setmetatable, pairs, ipairs, unpack, rawget, rawset, -pcall, type, table, string, select = -require, error, assert, tonumber, tostring, -setmetatable, pairs, ipairs, unpack, rawget, rawset, -pcall, type, table, string, select +local require, tonumber, pcall, select = +require, tonumber, pcall, select local abi = require "syscall.abi" @@ -34,7 +30,6 @@ local uint, ulong = ffi.typeof("unsigned int"), ffi.typeof("unsigned long") local h = require "syscall.helpers" local err64 = h.err64 -local errpointer = h.errpointer local i6432, u6432 = bit.i6432, bit.u6432 @@ -53,7 +48,6 @@ else arg64u = function(val) return u6432(val) end end -- _llseek very odd, preadv -local function llarg64u(val) return u6432(val) end local function llarg64(val) return i6432(val) end local C = {} @@ -69,7 +63,6 @@ local u64 = ffi.typeof("uint64_t") -- TODO could make these return errno here, also are these best casts? local syscall_long = ffi.C.syscall -- returns long local function syscall(...) return tonumber(syscall_long(...)) end -- int is default as most common -local function syscall_uint(...) return uint(syscall_long(...)) end local function syscall_void(...) return void(syscall_long(...)) end local function syscall_off(...) 
return u64(syscall_long(...)) end -- off_t @@ -171,7 +164,7 @@ else -- 64 bit function C.fstatfs(fd, buf) return syscall(sys.fstatfs, int(fd), void(buf)) end function C.preadv(fd, iov, iovcnt, offset) return syscall_long(sys.preadv, int(fd), void(iov), long(iovcnt), ulong(offset)) end function C.pwritev(fd, iov, iovcnt, offset) return syscall_long(sys.pwritev, int(fd), void(iov), long(iovcnt), ulong(offset)) end - function C.lseek(fd, offset, whence) return syscall_off(sys.lseek, int(fd), ulong(offset), int(whence)) end + function C.lseek(fd, offset, whence) return syscall_off(sys.lseek, int(fd), long(offset), int(whence)) end function C.sendfile(outfd, infd, offset, count) return syscall_long(sys.sendfile, int(outfd), int(infd), void(offset), ulong(count)) end @@ -182,6 +175,7 @@ end -- glibc caches pid, but this fails to work eg after clone(). function C.getpid() return syscall(sys.getpid) end +function C.gettid() return syscall(sys.gettid) end -- underlying syscalls function C.exit_group(status) return syscall(sys.exit_group, int(status)) end -- void return really @@ -630,6 +624,7 @@ end function C.timer_gettime(timerid, curr_value) return syscall(sys.timer_gettime, int(timerid), void(curr_value)) end function C.timer_delete(timerid) return syscall(sys.timer_delete, int(timerid)) end function C.timer_getoverrun(timerid) return syscall(sys.timer_getoverrun, int(timerid)) end +function C.vhangup() return syscall(sys.vhangup) end -- only on some architectures if sys.waitpid then @@ -684,12 +679,24 @@ C.gettimeofday = ffi.C.gettimeofday --function C.gettimeofday(tv, tz) return syscall(sys.gettimeofday, void(tv), void(tz)) end -- glibc does not provide getcpu; it is however VDSO -function C.getcpu(cpu, node, tcache) return syscall(sys.getcpu, void(node), void(node), void(tcache)) end +function C.getcpu(cpu, node, tcache) return syscall(sys.getcpu, void(cpu), void(node), void(tcache)) end -- time is VDSO but not really performance critical; does not exist for some 
architectures if sys.time then function C.time(t) return syscall(sys.time, void(t)) end end +-- bpf syscall that is only on Linux 3.19+ +if sys.bpf then + function C.bpf(cmd, attr) + return syscall(sys.bpf, int(cmd), void(attr), u64(ffi.sizeof('union bpf_attr'))) + end +end +if sys.perf_event_open then + function C.perf_event_open(attr, pid, cpu, group_fd, flags) + return syscall(sys.perf_event_open, void(attr), int(pid), int(cpu), int(group_fd), ulong(flags)) + end +end + -- socketcalls if not sys.socketcall then function C.socket(domain, tp, protocol) return syscall(sys.socket, int(domain), int(tp), int(protocol)) end diff --git a/syscall/linux/constants.lua b/syscall/linux/constants.lua index 23c68af3f3..a9dcc1c3a8 100644 --- a/syscall/linux/constants.lua +++ b/syscall/linux/constants.lua @@ -161,9 +161,12 @@ c.F = strflag(arch.F or { SETLEASE = 1024, GETLEASE = 1025, NOTIFY = 1026, + CANCELLK = 1029, + DUPFD_CLOEXEC = 1030, SETPIPE_SZ = 1031, GETPIPE_SZ = 1032, - DUPFD_CLOEXEC = 1030, + ADD_SEALS = 1033, + GET_SEALS = 1034, }) -- messy @@ -208,6 +211,14 @@ c.LOCK = multiflags { RW = 192, } +-- for memfd +c.F_SEAL = multiflags { + SEAL = 0x0001, + SHRINK = 0x0002, + GROW = 0x0004, + WRITE = 0x0008, +} + --mmap c.PROT = multiflags { NONE = 0x0, @@ -300,6 +311,8 @@ c.SEEK = strflag { SET = 0, CUR = 1, END = 2, + DATA = 3, + HOLE = 4, } -- exit @@ -393,6 +406,12 @@ c.SOCK = multiflags(arch.SOCK or { c.SCM = strflag { RIGHTS = 0x01, CREDENTIALS = 0x02, + + TSTAMP_SND = 0, + TSTAMP_SCHED = 1, + TSTAMP_ACK = 2, + + TIMESTAMPING_OPT_STATS = 54, } -- setsockopt @@ -426,7 +445,7 @@ c.SO = strflag(arch.SO or { PRIORITY = 12, LINGER = 13, BSDCOMPAT = 14, ---REUSEPORT = 15, -- new, may not be defined yet + REUSEPORT = 15, -- new, may not be defined yet PASSCRED = 16, PEERCRED = 17, RCVLOWAT = 18, @@ -455,13 +474,49 @@ c.SO = strflag(arch.SO or { WIFI_STATUS = 41, PEEK_OFF = 42, NOFCS = 43, + LOCK_FILTER = 44, + SELECT_ERR_QUEUE = 45, + BUSY_POLL = 46, + MAX_PACING_RATE = 
47, + BPF_EXTENSIONS = 48, + INCOMING_CPU = 49, + ATTACH_BPF = 50, + ATTACH_REUSEPORT_CBPF = 51, + ATTACH_REUSEPORT_EBPF = 52, }) c.SO.GET_FILTER = c.SO.ATTACH_FILTER +c.SO.DETACH_BPF = c.SO.DETACH_FILTER + +c.SCM.TIMESTAMP = c.SO.TIMESTAMP +c.SCM.TIMESTAMPNS = c.SO.TIMESTAMPNS +c.SCM.TIMESTAMPING = c.SO.TIMESTAMPING -- Maximum queue length specifiable by listen. c.SOMAXCONN = 128 +c.SOF = strflag { + TIMESTAMPING_TX_HARDWARE = bit.lshift(1, 0), + TIMESTAMPING_TX_SOFTWARE = bit.lshift(1, 1), + TIMESTAMPING_RX_HARDWARE = bit.lshift(1, 2), + TIMESTAMPING_RX_SOFTWARE = bit.lshift(1, 3), + TIMESTAMPING_SOFTWARE = bit.lshift(1, 4), + TIMESTAMPING_SYS_HARDWARE = bit.lshift(1, 5), + TIMESTAMPING_RAW_HARDWARE = bit.lshift(1, 6), + TIMESTAMPING_OPT_ID = bit.lshift(1, 7), + TIMESTAMPING_TX_SCHED = bit.lshift(1, 8), + TIMESTAMPING_TX_ACK = bit.lshift(1, 9), + TIMESTAMPING_OPT_CMSG = bit.lshift(1, 10), + TIMESTAMPING_OPT_TSONLY = bit.lshift(1, 11), + TIMESTAMPING_OPT_STATS = bit.lshift(1, 12), + TIMESTAMPING_OPT_PKTINFO = bit.lshift(1, 13), + TIMESTAMPING_OPT_TX_SWHW = bit.lshift(1, 14), +} + +c.SOF.TIMESTAMPING_LAST = c.SOF.TIMESTAMPING_OPT_TX_SWHW +c.SOF.TIMESTAMPING_MASK = bit.bor(c.SOF.TIMESTAMPING_LAST - 1, + c.SOF.TIMESTAMPING_LAST) + -- shutdown c.SHUT = strflag { RD = 0, @@ -1162,6 +1217,10 @@ c.RTA = strflag { MP_ALGO = 14, TABLE = 15, MARK = 16, + MFC_STATS = 17, + VIA = 18, + NEWDST = 19, + PREF = 20, } -- route flags @@ -1954,6 +2013,7 @@ c.EM = strflag { MN10300 = 89, BLACKFIN = 106, TI_C6000 = 140, + AARCH64 = 183, FRV = 0x5441, AVR32 = 0x18ad, ALPHA = 0x9026, @@ -1970,6 +2030,7 @@ local __AUDIT_ARCH_64BIT = 0x80000000 local __AUDIT_ARCH_LE = 0x40000000 c.AUDIT_ARCH = strflag { + AARCH64 = c.EM.AARCH64 + __AUDIT_ARCH_64BIT + __AUDIT_ARCH_LE, ALPHA = c.EM.ALPHA + __AUDIT_ARCH_64BIT + __AUDIT_ARCH_LE, ARM = c.EM.ARM + __AUDIT_ARCH_LE, ARMEB = c.EM.ARM, @@ -2007,6 +2068,7 @@ c.BPF = multiflags { ST = 0x02, STX = 0x03, ALU = 0x04, + ALU64 = 0x07, JMP = 0x05, RET = 
0x06, MISC = 0x07, @@ -2014,6 +2076,7 @@ c.BPF = multiflags { W = 0x00, H = 0x08, B = 0x10, + DW = 0x18, -- mode IMM = 0x00, ABS = 0x20, @@ -2030,12 +2093,23 @@ c.BPF = multiflags { AND = 0x50, LSH = 0x60, RSH = 0x70, + ARSH = 0xc0, NEG = 0x80, + MOD = 0x90, + XOR = 0xa0, + MOV = 0xb0, + XADD = 0xc0, + END = 0xd0, JA = 0x00, JEQ = 0x10, JGT = 0x20, JGE = 0x30, JSET = 0x40, + JNE = 0x50, + JSGT = 0x60, + JSGE = 0x70, + CALL = 0x80, + EXIT = 0x90, -- src K = 0x00, X = 0x08, @@ -2044,6 +2118,244 @@ c.BPF = multiflags { -- miscop TAX = 0x00, TXA = 0x80, + TO_LE = 0x00, + TO_BE = 0x08, +-- flags + ANY = 0, + NOEXIST = 1, + EXIST = 2, +} + +-- BPF map type +c.BPF_MAP = strflag { + UNSPEC = 0, + HASH = 1, + ARRAY = 2, + PROG_ARRAY = 3, + PERF_EVENT_ARRAY = 4, + PERCPU_HASH = 5, + PERCPU_ARRAY = 6, + STACK_TRACE = 7, + CGROUP_ARRAY = 8, + LRU_HASH = 9, + LRU_PERCPU_HASH = 10, + LPM_TRIE = 11, + ARRAY_OF_MAPS = 12, + HASH_OF_MAPS = 13, + DEVMAP = 14, + SOCKMAP = 15, + CPUMAP = 16, +} + +-- BPF syscall commands +c.BPF_CMD = strflag { + MAP_CREATE = 0, + MAP_LOOKUP_ELEM = 1, + MAP_UPDATE_ELEM = 2, + MAP_DELETE_ELEM = 3, + MAP_GET_NEXT_KEY = 4, + PROG_LOAD = 5, + OBJ_PIN = 6, + OBJ_GET = 7, + PROG_ATTACH = 8, + PROG_DETACH = 9, + PROG_TEST_RUN = 10, + PROG_GET_NEXT_ID = 11, + MAP_GET_NEXT_ID = 12, + PROG_GET_FD_BY_ID = 13, + MAP_GET_FD_BY_ID = 14, + OBJ_GET_INFO_BY_FD = 15, + PROG_QUERY = 16, + RAW_TRACEPOINT_OPEN = 17, +} + +-- BPF program types +c.BPF_PROG = strflag { + UNSPEC = 0, + SOCKET_FILTER = 1, + KPROBE = 2, + SCHED_CLS = 3, + SCHED_ACT = 4, + TRACEPOINT = 5, + XDP = 6, + PERF_EVENT = 7, + CGROUP_SKB = 8, + CGROUP_SOCK = 9, + LWT_IN = 10, + LWT_OUT = 11, + LWT_XMIT = 12, + SOCK_OPS = 13, + SK_SKB = 14, + CGROUP_DEVICE = 15, + SK_MSG = 16, + RAW_TRACEPOINT = 17, + CGROUP_SOCK_ADDR = 18, +} + +-- BPF attach type +c.BPF_ATTACH_TYPE = strflag { + CGROUP_INET_INGRESS = 0, + CGROUP_INET_EGRESS = 1, + CGROUP_INET_SOCK_CREATE = 2, + CGROUP_SOCK_OPS = 3, + 
SK_SKB_STREAM_PARSER = 4, + SK_SKB_STREAM_VERDICT = 5, + CGROUP_DEVICE = 6, + SK_MSG_VERDICT = 7, + CGROUP_INET4_BIND = 8, + CGROUP_INET6_BIND = 9, + CGROUP_INET4_CONNECT = 10, + CGROUP_INET6_CONNECT = 11, + CGROUP_INET4_POST_BIND = 12, + CGROUP_INET6_POST_BIND = 13, +} + +-- Linux performance monitoring +-- perf_event_attr.type +c.PERF_TYPE = strflag { + HARDWARE = 0, + SOFTWARE = 1, + TRACEPOINT = 2, + HW_CACHE = 3, + RAW = 4, + BREAKPOINT = 5, +} + +-- perf_event_attr.event_id +c.PERF_COUNT = strflag { + -- Generalized performance event event_id types + HW_CPU_CYCLES = 0, + HW_INSTRUCTIONS = 1, + HW_CACHE_REFERENCES = 2, + HW_CACHE_MISSES = 3, + HW_BRANCH_INSTRUCTIONS = 4, + HW_BRANCH_MISSES = 5, + HW_BUS_CYCLES = 6, + HW_STALLED_CYCLES_FRONTEND = 7, + HW_STALLED_CYCLES_BACKEND = 8, + HW_REF_CPU_CYCLES = 9, + -- Generalized hardware cache events + HW_CACHE_L1D = 0, + HW_CACHE_L1I = 1, + HW_CACHE_LL = 2, + HW_CACHE_DTLB = 3, + HW_CACHE_ITLB = 4, + HW_CACHE_BPU = 5, + HW_CACHE_NODE = 6, + HW_CACHE_OP_READ = 0, + HW_CACHE_OP_WRITE = 1, + HW_CACHE_OP_PREFETCH = 2, + HW_CACHE_RESULT_ACCESS = 0, + HW_CACHE_RESULT_MISS = 1, + -- Special "software" events provided by the kernel + SW_CPU_CLOCK = 0, + SW_TASK_CLOCK = 1, + SW_PAGE_FAULTS = 2, + SW_CONTEXT_SWITCHES = 3, + SW_CPU_MIGRATIONS = 4, + SW_PAGE_FAULTS_MIN = 5, + SW_PAGE_FAULTS_MAJ = 6, + SW_ALIGNMENT_FAULTS = 7, + SW_EMULATION_FAULTS = 8, + SW_DUMMY = 9, + SW_BPF_OUTPUT = 10, +} + +-- Bits that can be set in perf_event_attr.sample_type to request information +c.PERF_SAMPLE = multiflags { + IP = bit.lshift(1, 0), + TID = bit.lshift(1, 1), + TIME = bit.lshift(1, 2), + ADDR = bit.lshift(1, 3), + READ = bit.lshift(1, 4), + CALLCHAIN = bit.lshift(1, 5), + ID = bit.lshift(1, 6), + CPU = bit.lshift(1, 7), + PERIOD = bit.lshift(1, 8), + STREAM_ID = bit.lshift(1, 9), + RAW = bit.lshift(1, 10), + BRANCH_STACK = bit.lshift(1, 11), + REGS_USER = bit.lshift(1, 12), + STACK_USER = bit.lshift(1, 13), + WEIGHT = bit.lshift(1, 
14), + DATA_SRC = bit.lshift(1, 15), + IDENTIFIER = bit.lshift(1, 16), + TRANSACTION = bit.lshift(1, 17), + REGS_INTR = bit.lshift(1, 18), +} + +-- values to program into perf_event_attr.branch_sample_type when PERF_SAMPLE_BRANCH is set +c.PERF_SAMPLE_BRANCH = multiflags { + USER_SHIFT = 0, + KERNEL_SHIFT = 1, + HV_SHIFT = 2, + ANY_SHIFT = 3, + ANY_CALL_SHIFT = 4, + ANY_RETURN_SHIFT = 5, + IND_CALL_SHIFT = 6, + ABORT_TX_SHIFT = 7, + IN_TX_SHIFT = 8, + NO_TX_SHIFT = 9, + COND_SHIFT = 10, + CALL_STACK_SHIFT = 11, + IND_JUMP_SHIFT = 12, + CALL_SHIFT = 13, + NO_FLAGS_SHIFT = 14, + NO_CYCLES_SHIFT = 15, +} +c.PERF_SAMPLE_BRANCH.USER = bit.lshift(1, c.PERF_SAMPLE_BRANCH.USER_SHIFT) +c.PERF_SAMPLE_BRANCH.KERNEL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.KERNEL_SHIFT) +c.PERF_SAMPLE_BRANCH.HV = bit.lshift(1, c.PERF_SAMPLE_BRANCH.HV_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY_CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.ANY_RETURN = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ANY_RETURN_SHIFT) +c.PERF_SAMPLE_BRANCH.IND_CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IND_CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.ABORT_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.ABORT_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.IN_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IN_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_TX = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_TX_SHIFT) +c.PERF_SAMPLE_BRANCH.COND = bit.lshift(1, c.PERF_SAMPLE_BRANCH.COND_SHIFT) +c.PERF_SAMPLE_BRANCH.CALL_STACK = bit.lshift(1, c.PERF_SAMPLE_BRANCH.CALL_STACK_SHIFT) +c.PERF_SAMPLE_BRANCH.IND_JUMP = bit.lshift(1, c.PERF_SAMPLE_BRANCH.IND_JUMP_SHIFT) +c.PERF_SAMPLE_BRANCH.CALL = bit.lshift(1, c.PERF_SAMPLE_BRANCH.CALL_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_FLAGS = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_FLAGS_SHIFT) +c.PERF_SAMPLE_BRANCH.NO_CYCLES = bit.lshift(1, c.PERF_SAMPLE_BRANCH.NO_CYCLES_SHIFT) + +-- Flags for perf_attr.read_format +c.PERF_READ_FORMAT = multiflags { + TOTAL_TIME_ENABLED = 
bit.lshift(1, 0), + TOTAL_TIME_RUNNING = bit.lshift(1, 1), + ID = bit.lshift(1, 2), + GROUP = bit.lshift(1, 3), +} + +-- Flags for perf_event_open +c.PERF_FLAG = multiflags { + FD_NO_GROUP = bit.lshift(1, 0), + FD_OUTPUT = bit.lshift(1, 1), + PID_CGROUP = bit.lshift(1, 2), + FD_CLOEXEC = bit.lshift(1, 3), +} + + +-- If perf_event_attr.sample_id_all is set then all event types will +-- have the sample_type selected fields related to where/when +-- (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, IDENTIFIER) +c.PERF_RECORD = strflag { + MMAP = 1, + LOST = 2, + COMM = 3, + EXIT = 4, + THROTTLE = 5, + UNTHROTTLE = 6, + FORK = 7, + READ = 8, + SAMPLE = 9, + MMAP2 = 10, + AUX = 11, + ITRACE_START = 12, + LOST_SAMPLES = 13, + SWITCH = 14, + SWITCH_CPU_WIDE= 15, } -- termios - c_cc characters diff --git a/syscall/linux/fcntl.lua b/syscall/linux/fcntl.lua index dd6621ffc6..67567c25be 100644 --- a/syscall/linux/fcntl.lua +++ b/syscall/linux/fcntl.lua @@ -22,6 +22,7 @@ local fcntl = { [c.F.GETLK] = t.flock, [c.F.SETLK] = t.flock, [c.F.SETLKW] = t.flock, + [c.F.ADD_SEALS] = function(arg) return c.F_SEAL[arg] end, }, ret = { [c.F.DUPFD] = function(ret) return t.fd(ret) end, @@ -33,6 +34,7 @@ local fcntl = { [c.F.GETSIG] = function(ret) return tonumber(ret) end, [c.F.GETPIPE_SZ] = function(ret) return tonumber(ret) end, [c.F.GETLK] = function(ret, arg) return arg end, + [c.F.GET_SEALS] = function(ret) return tonumber(ret) end, } } diff --git a/syscall/linux/ffi.lua b/syscall/linux/ffi.lua index 084fa7cdcf..86026c9141 100644 --- a/syscall/linux/ffi.lua +++ b/syscall/linux/ffi.lua @@ -498,10 +498,143 @@ struct sock_filter { uint8_t jf; uint32_t k; }; +struct bpf_insn { + uint8_t code; /* opcode */ + uint8_t dst_reg:4; /* dest register */ + uint8_t src_reg:4; /* source register */ + uint16_t off; /* signed offset */ + uint32_t imm; /* signed immediate constant */ +}; struct sock_fprog { unsigned short len; struct sock_filter *filter; }; +union bpf_attr { + struct { + 
uint32_t map_type; + uint32_t key_size; + uint32_t value_size; + uint32_t max_entries; + }; + struct { + uint32_t map_fd; + uint64_t key __attribute__((aligned(8))); + union { + uint64_t value __attribute__((aligned(8))); + uint64_t next_key __attribute__((aligned(8))); + }; + uint64_t flags; + }; + struct { + uint32_t prog_type; + uint32_t insn_cnt; + uint64_t insns __attribute__((aligned(8))); + uint64_t license __attribute__((aligned(8))); + uint32_t log_level; + uint32_t log_size; + uint64_t log_buf __attribute__((aligned(8))); + uint32_t kern_version; + }; + struct { + uint64_t pathname __attribute__((aligned(8))); + uint32_t bpf_fd; + }; +} __attribute__((aligned(8))); +struct perf_event_attr { + uint32_t pe_type; + uint32_t size; + uint64_t pe_config; + union { + uint64_t sample_period; + uint64_t sample_freq; + }; + uint64_t pe_sample_type; + uint64_t read_format; + uint32_t disabled:1, + inherit:1, + pinned:1, + exclusive:1, + exclude_user:1, + exclude_kernel:1, + exclude_hv:1, + exclude_idle:1, + mmap:1, + comm:1, + freq:1, + inherit_stat:1, + enable_on_exec:1, + task:1, + watermark:1, + precise_ip:2, + mmap_data:1, + sample_id_all:1, + exclude_host:1, + exclude_guest:1, + exclude_callchain_kernel:1, + exclude_callchain_user:1, + mmap2:1, + comm_exec:1, + use_clockid:1, + __reserved_1a:6; + uint32_t __reserved_1b; + union { + uint32_t wakeup_events; + uint32_t wakeup_watermark; + }; + uint32_t bp_type; + union { + uint64_t bp_addr; + uint64_t config1; + }; + union { + uint64_t bp_len; + uint64_t config2; + }; + uint64_t branch_sample_type; + uint64_t sample_regs_user; + uint32_t sample_stack_user; + int32_t clockid; + uint64_t sample_regs_intr; + uint32_t aux_watermark; + uint32_t __reserved_2; +}; +struct perf_event_mmap_page { + uint32_t version; + uint32_t compat_version; + uint32_t lock; + uint32_t index; + int64_t offset; + uint64_t time_enabled; + uint64_t time_running; + union { + uint64_t capabilities; + struct { + uint32_t cap_bit0 : 1, + 
cap_bit0_is_deprecated : 1, + cap_user_rdpmc : 1, + cap_user_time : 1, + cap_user_time_zero : 1; + }; + }; + uint16_t pmc_width; + uint16_t time_shift; + uint32_t time_mult; + uint64_t time_offset; + uint64_t __reserved[120]; + volatile uint64_t data_head; + volatile uint64_t data_tail; + volatile uint64_t data_offset; + volatile uint64_t data_size; + uint64_t aux_head; + uint64_t aux_tail; + uint64_t aux_offset; + uint64_t aux_size; +}; +struct perf_event_header { + uint32_t type; + uint16_t misc; + uint16_t size; +}; struct mq_attr { long mq_flags, mq_maxmsg, mq_msgsize, mq_curmsgs, __unused[4]; }; @@ -703,6 +836,9 @@ struct rusage { long ru_nvcsw; long ru_nivcsw; }; +struct scm_timestamping { + struct timespec ts[3]; +}; ]] append(arch.nsig or [[ diff --git a/syscall/linux/ioctl.lua b/syscall/linux/ioctl.lua index ec05d5e5ca..c695dca2c3 100644 --- a/syscall/linux/ioctl.lua +++ b/syscall/linux/ioctl.lua @@ -189,6 +189,12 @@ local ioctl = strflag { SIOCDELRT = 0x890C, SIOCRTMSG = 0x890D, + SIOCGIFFLAGS = 0x8913, + SIOCSIFFLAGS = 0x8914, + SIOCGIFMTU = 0x8921, + SIOCSIFMTU = 0x8922, + SIOCSIFHWADDR = 0x8924, + SIOCGIFHWADDR = 0x8927, SIOCGIFINDEX = 0x8933, SIOCDARP = 0x8953, @@ -266,6 +272,16 @@ local ioctl = strflag { -- from linux/vfio.h type is ';' base is 100 VFIO_GET_API_VERSION = vfio('NONE', 0), VFIO_CHECK_EXTENSION = vfio('WRITE', 1, "uint32"), +-- from linux/perf_event.h + PERF_EVENT_IOC_ENABLE = _IO('$', 0), + PERF_EVENT_IOC_DISABLE = _IO('$', 1), + PERF_EVENT_IOC_REFRESH = _IO('$', 2), + PERF_EVENT_IOC_RESET = _IO('$', 3), + PERF_EVENT_IOC_PERIOD = _IOW('$', 4, "uint64"), + PERF_EVENT_IOC_SET_OUTPUT= _IO('$', 5), + PERF_EVENT_IOC_SET_FILTER= _IOW('$', 6, "uintptr"), + PERF_EVENT_IOC_ID = _IOR('$', 7, "uint64_1"), + PERF_EVENT_IOC_SET_BPF = _IOW('$', 8, "uint32"), -- allow user defined ioctls _IO = _IO, diff --git a/syscall/linux/nl.lua b/syscall/linux/nl.lua index fad5625ed8..a7da48a831 100644 --- a/syscall/linux/nl.lua +++ b/syscall/linux/nl.lua @@ 
-173,6 +173,10 @@ local rta_decode = { ir.cacheinfo = t.rta_cacheinfo() ffi.copy(ir.cacheinfo, buf, s.rta_cacheinfo) end, + [c.RTA.PREF] = function(ir, buf, len) + local i = pt.uint8(buf) + ir.pref = tonumber(i[0]) + end, -- TODO some missing } @@ -340,6 +344,44 @@ mt.iflink = { end } +meth.ndmsg = { + index = { + family = function(i) return tonumber(i.ndmsg.ndm_family) end, + ifindex = function(i) return tonumber(i.ndmsg.ndm_ifindex) end, + state = function(i) return tonumber(i.ndmsg.ndm_state) end, + flags = function(i) return tonumber(i.ndmsg.ndm_flags) end, + type = function(i) return tonumber(i.ndmsg.ndm_type) end, + dest = function(i) return i.dst or addrtype(i.family) end, + -- might not be set in Lua table, so return nil + dst = function() return nil end, + lladdr = function() return nil end, + }, + flags = { + [c.NTF.PROXY] = "proxy", + [c.NTF.ROUTER] = "router", + }, + state = { + [c.NUD.INCOMPLETE] = "incomplete", + [c.NUD.REACHABLE] = "reachable", + [c.NUD.STALE] = "stale", + [c.NUD.DELAY] = "delay", + [c.NUD.PROBE] = "probe", + [c.NUD.FAILED] = "failed", + [c.NUD.NOARP] = "noarp", + [c.NUD.PERMANENT] = "permanent", + } +} + +mt.ndmsg = { + __index = function(i, k) + if meth.ndmsg.index[k] then return meth.ndmsg.index[k](i) end + end, + __tostring = function(i) -- TODO make more like output of ip route + local s = "dst: " .. tostring(i.dest) .. " lladdr: " .. tostring(i.lladdr) .. " if: " .. 
i.ifindex + return s + end, +} + meth.rtmsg = { index = { family = function(i) return tonumber(i.rtmsg.rtm_family) end, @@ -378,6 +420,17 @@ mt.rtmsg = { end, } + +mt.neighs = { + __tostring = function(is) + local s = {} + for k, v in ipairs(is) do + s[#s + 1] = tostring(v) + end + return table.concat(s, '\n') + end, +} + meth.routes = { fn = { match = function(rs, addr, len) -- exact match @@ -496,12 +549,12 @@ local function decode_route(buf, len) end local function decode_neigh(buf, len) - local rt = pt.rtmsg(buf) - buf = buf + nlmsg_align(s.rtmsg) - len = len - nlmsg_align(s.rtmsg) + local rt = pt.ndmsg(buf) + buf = buf + nlmsg_align(s.ndmsg) + len = len - nlmsg_align(s.ndmsg) local rtattr = pt.rtattr(buf) - local ir = setmetatable({rtmsg = t.rtmsg()}, mt.rtmsg) - ffi.copy(ir.rtmsg, rt, s.rtmsg) + local ir = setmetatable({ndmsg = t.ndmsg()}, mt.ndmsg) + ffi.copy(ir.ndmsg, rt, s.ndmsg) while rta_ok(rtattr, len) do if nda_decode[rtattr.rta_type] then nda_decode[rtattr.rta_type](ir, buf + rta_length(0), rta_align(rtattr.rta_len) - rta_length(0)) @@ -1051,7 +1104,9 @@ function nl.getneigh(index, tab, ...) if type(index) == 'table' then index = index.index end tab.ifindex = index local ndm = t.ndmsg(tab) - return nlmsg("getneigh", "request, dump", ndm.family, t.ndmsg, ndm, ...) + local n, err = nlmsg("getneigh", "request, dump", ndm.family, t.ndmsg, ndm, ...) + if not n then return nil, err end + return setmetatable(n, mt.neighs) end function nl.newneigh(index, tab, ...) 
diff --git a/syscall/linux/ppc/nr.lua b/syscall/linux/ppc/nr.lua index 010fdb4c24..1d712ebf03 100644 --- a/syscall/linux/ppc/nr.lua +++ b/syscall/linux/ppc/nr.lua @@ -354,6 +354,15 @@ local nr = { setns = 350, process_vm_readv = 351, process_vm_writev = 352, + kcmp = 353, + finit_module = 354, + sched_setattr = 355, + sched_getattr = 356, + renameat2 = 357, + seccomp = 358, + getrandom = 359, + memfd_create = 360, + bpf = 361, } } diff --git a/syscall/linux/ppc64le/nr.lua b/syscall/linux/ppc64le/nr.lua index bd0df08fc1..0aa6ca6a01 100644 --- a/syscall/linux/ppc64le/nr.lua +++ b/syscall/linux/ppc64le/nr.lua @@ -349,6 +349,19 @@ local nr = { kcmp = 354, sched_setattr = 355, sched_getattr = 356, + renameat2 = 357, + seccomp = 358, + getrandom = 359, + memfd_create = 360, + bpf = 361, + execveat = 362, + switch_endian = 363, + userfaultfd = 364, + membarrier = 365, + mlock2 = 378, + copy_file_range = 379, + preadv2 = 380, + pwritev2 = 381, } } diff --git a/syscall/linux/syscalls.lua b/syscall/linux/syscalls.lua index 335144080b..261040fac2 100644 --- a/syscall/linux/syscalls.lua +++ b/syscall/linux/syscalls.lua @@ -774,6 +774,147 @@ function S.sysctl(name, new) return old end +-- BPF syscall has a complex semantics with one union serving for all purposes +-- The interface exports both raw syscall and helper functions based on libbpf +if C.bpf then + local function ptr_to_u64(p) return ffi.cast('uint64_t', ffi.cast('void *', p)) end + function S.bpf(cmd, attr) + return C.bpf(cmd, attr) + end + function S.bpf_prog_load(type, insns, len, license, version, log_level) + if not license then license = "GPL" end -- Must stay alive during the syscall + local bpf_log_buf = ffi.new('char [?]', 64*1024) -- Must stay alive during the syscall + if not version then + -- We have no better way to extract current kernel hex-string other + -- than parsing headers, compiling a helper function or reading /proc + local ver_str, count = S.sysctl('kernel.osrelease'):match('%d+.%d+.%d+'), 2 + 
version = 0 + for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ + version = bit.bor(version, bit.lshift(tonumber(i), 8*count)) + count = count - 1 + end + end + local attr = t.bpf_attr1() + attr[0].prog_type = type + attr[0].insns = ptr_to_u64(insns) + attr[0].insn_cnt = len + attr[0].license = ptr_to_u64(license) + attr[0].log_buf = ptr_to_u64(bpf_log_buf) + attr[0].log_size = ffi.sizeof(bpf_log_buf) + attr[0].log_level = log_level or 1 + attr[0].kern_version = version -- MUST match current kernel version + local fd = S.bpf(c.BPF_CMD.PROG_LOAD, attr) + if fd < 0 then + return nil, t.error(errno()), ffi.string(bpf_log_buf) + end + return retfd(fd), ffi.string(bpf_log_buf) + end + function S.bpf_map_create(type, key_size, value_size, max_entries) + local attr = t.bpf_attr1() + attr[0].map_type = type + attr[0].key_size = key_size + attr[0].value_size = value_size + attr[0].max_entries = max_entries + local fd = S.bpf(c.BPF_CMD.MAP_CREATE, attr) + if fd < 0 then + return nil, t.error(errno()) + end + return retfd(fd) + end + function S.bpf_map_op(op, fd, key, val_or_next, flags) + local attr = t.bpf_attr1() + attr[0].map_fd = fd + attr[0].key = ptr_to_u64(key) + attr[0].value = ptr_to_u64(val_or_next) + attr[0].flags = flags or 0 + local ret = S.bpf(op, attr) + if ret ~= 0 then + return nil, t.error(errno()) + end + return ret + end +end + +-- Linux performance monitoring +if C.perf_event_open then + -- Open perf event fd + -- @note see man 2 perf_event_open + -- @return fd, err + function S.perf_event_open(attr, pid, cpu, group_fd, flags) + if attr[0].size == 0 then attr[0].size = ffi.sizeof(attr[0]) end + local fd = C.perf_event_open(attr, pid or 0, cpu or -1, group_fd or -1, c.PERF_FLAG[flags or 0]) + if fd < 0 then + return nil, t.error(errno()) + end + return retfd(fd) + end + -- Read the tracepoint configuration (see "/sys/kernel/debug/tracing/available_events") + -- @param event_path path to tracepoint (e.g. 
"/sys/kernel/debug/tracing/events/syscalls/sys_enter_write") + -- @return tp, err (e.g. 538, nil) + function S.perf_tracepoint(event_path) + local config = nil + event_path = event_path.."/id" + local fd, err = S.open(event_path, c.O.RDONLY) + if fd then + local ret, err = fd:read(nil, 256) + if ret then + config = tonumber(ret) + end + fd:close() + end + return config, err + end + -- Attach or detach a probe, same semantics as Lua tables. + -- See https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt + -- (When the definition is not nil, it will be created, otherwise it will be detached) + -- @param probe_type either "kprobe" or "uprobe", no other probe types are supported + -- @param name chosen probe name (e.g. "myprobe") + -- @param definition (set to nil to disable probe) (e.g. "do_sys_open $retval") + -- @param retval true/false if this should be entrypoint probe or return probe + -- @return tp, err (e.g. 1099, nil) + function S.perf_probe(probe_type, name, definition, retval) + local event_path = string.format('/sys/kernel/debug/tracing/%s_events', probe_type) + local probe_path = string.format('/sys/kernel/debug/tracing/events/%ss/%s', probe_type, name) + -- Check if probe already exists + if definition and S.statfs(probe_path) then return nil, t.error(c.E.EEXIST) end + local fd, err = S.open(event_path, "wronly, append") + if not fd then return nil, err end + -- Format a probe definition + if not definition then + definition = "-:"..name -- Detach + else + definition = string.format("%s:%s %s", retval and "r" or "p", name, definition) + end + local ok, err = fd:write(definition) + fd:close() + -- Return tracepoint or success + if ok and definition then + return S.perf_tracepoint(probe_path) + end + return ok, err + end + -- Attach perf event reader to tracepoint (see "/sys/kernel/debug/tracing/available_events") + -- @param tp tracepoint identifier (e.g.: 538, use `S.perf_tracepoint()`) + -- @param type perf_attr.sample_type (default: "raw") + -- 
@param attrs table of attributes (e.g. {sample_type="raw, callchain"}, see `struct perf_event_attr`) + -- @return reader, err + function S.perf_attach_tracepoint(tp, pid, cpu, group_fd, attrs) + local pe = t.perf_event_attr1() + pe[0].type = "tracepoint" + pe[0].config = tp + pe[0].sample_type = "raw" + pe[0].sample_period = 1 + pe[0].wakeup_events = 1 + if attrs then + for k,v in pairs(attrs) do pe[0][k] = v end + end + -- Open perf event reader with given parameters + local fd, err = S.perf_event_open(pe, pid, cpu, group_fd, "fd_cloexec") + if not fd then return nil, err end + return t.perf_reader(fd) + end +end + return S end diff --git a/syscall/linux/types.lua b/syscall/linux/types.lua index 89bcd14da8..68ba25192d 100644 --- a/syscall/linux/types.lua +++ b/syscall/linux/types.lua @@ -115,6 +115,7 @@ local addstructs = { ff_rumble_effect = "struct ff_rumble_effect", ff_effect = "struct ff_effect", sock_fprog = "struct sock_fprog", + bpf_attr = "union bpf_attr", user_cap_header = "struct user_cap_header", user_cap_data = "struct user_cap_data", xt_get_revision = "struct xt_get_revision", @@ -128,6 +129,7 @@ local addstructs = { vhost_vring_addr = "struct vhost_vring_addr", vhost_memory_region = "struct vhost_memory_region", vhost_memory = "struct vhost_memory", + scm_timestamping = "struct scm_timestamping", } for k, v in pairs(addtypes) do addtype(types, k, v) end @@ -136,9 +138,12 @@ for k, v in pairs(addstructs) do addtype(types, k, v, lenmt) end -- these ones not in table as not helpful with vararg or arrays TODO add more addtype variants t.inotify_event = ffi.typeof("struct inotify_event") pt.inotify_event = ptt("struct inotify_event") -- still need pointer to this +pt.perf_event_header = ptt("struct perf_event_header") t.aio_context1 = ffi.typeof("aio_context_t[1]") t.sock_fprog1 = ffi.typeof("struct sock_fprog[1]") +t.bpf_attr1 = ffi.typeof("union bpf_attr[1]") +t.perf_event_attr1 = ffi.typeof("struct perf_event_attr[1]") t.user_cap_data2 = 
ffi.typeof("struct user_cap_data[2]") @@ -147,6 +152,8 @@ local iocbs = ffi.typeof("struct iocb[?]") t.iocbs = function(n, ...) return ffi.new(iocbs, n, ...) end local sock_filters = ffi.typeof("struct sock_filter[?]") t.sock_filters = function(n, ...) return ffi.new(sock_filters, n, ...) end +local bpf_insns = ffi.typeof("struct bpf_insn[?]") +t.bpf_insns = function(n, ...) return ffi.new(bpf_insns, n, ...) end local iocb_ptrs = ffi.typeof("struct iocb *[?]") t.iocb_ptrs = function(n, ...) return ffi.new(iocb_ptrs, n, ...) end @@ -760,6 +767,14 @@ mt.sock_filter = { addtype(types, "sock_filter", "struct sock_filter", mt.sock_filter) +mt.bpf_insn = { + __new = function(tp, code, dst_reg, src_reg, off, imm) + return ffi.new(tp, c.BPF[code], dst_reg or 0, src_reg or 0, off or 0, imm or 0) + end +} + +addtype(types, "bpf_insn", "struct bpf_insn", mt.bpf_insn) + -- capabilities data is an array so cannot put metatable on it. Also depends on version, so combine into one structure. -- TODO maybe add caching @@ -1163,6 +1178,23 @@ mt.mmsghdrs = { addtype_var(types, "mmsghdrs", "struct {int count; struct mmsghdr msg[?];}", mt.mmsghdrs) +addtype(types, "bpf_attr", "union bpf_attr") + +-- Metatype for Linux perf events +mt.perf_event_attr = { + index = { + type = function(self) return self.pe_type end, + config = function(self) return self.pe_config end, + sample_type = function(self) return self.pe_sample_type end, + }, + newindex = { + type = function(self, v) self.pe_type = c.PERF_TYPE[v] end, + config = function(self, v) self.pe_config = c.PERF_COUNT[v] end, + sample_type = function(self, v) self.pe_sample_type = c.PERF_SAMPLE[v] end, + }, +} +addtype(types, "perf_event_attr", "struct perf_event_attr", mt.perf_event_attr) + -- this is declared above samap_pt = { [c.AF.UNIX] = pt.sockaddr_un, diff --git a/syscall/linux/util.lua b/syscall/linux/util.lua index dd9d298e34..65f4e7b632 100644 --- a/syscall/linux/util.lua +++ b/syscall/linux/util.lua @@ -46,7 +46,10 @@ function 
util.if_nametoindex(name) -- standard function in some libc versions local s, err = S.socket(c.AF.LOCAL, c.SOCK.STREAM, 0) if not s then return nil, err end local i, err = if_nametoindex(name, s) - if not i then return nil, err end + if not i then + S.close(s) + return nil, err + end local ok, err = S.close(s) if not ok then return nil, err end return i diff --git a/syscall/linux/x64/nr.lua b/syscall/linux/x64/nr.lua index 7309565435..0a91a2d2c1 100644 --- a/syscall/linux/x64/nr.lua +++ b/syscall/linux/x64/nr.lua @@ -323,6 +323,7 @@ local nr = { getrandom = 318, memfd_create = 319, kexec_file_load = 320, + bpf = 321, } } diff --git a/syscall/linux/x86/nr.lua b/syscall/linux/x86/nr.lua index 9757aa2e6a..deb7551239 100644 --- a/syscall/linux/x86/nr.lua +++ b/syscall/linux/x86/nr.lua @@ -350,6 +350,7 @@ local nr = { seccomp = 354, getrandom = 355, memfd_create = 356, + bpf = 357, } } diff --git a/syscall/methods.lua b/syscall/methods.lua index fb2dcd36f5..51b04e51bf 100644 --- a/syscall/methods.lua +++ b/syscall/methods.lua @@ -205,6 +205,89 @@ t.timer = metatype("struct {timer_t timerid[1];}", { --__gc = S.timer_delete, }) +if abi.os == "linux" then + -- Linux performance monitoring reader + t.perf_reader = metatype("struct {int fd; char *map; size_t map_pages; }", { + __new = function (ct, fd) + if not fd then return ffi.new(ct) end + if istype(t.fd, fd) then fd = fd:nogc():getfd() end + return ffi.new(ct, fd) + end, + __len = function(t) return ffi.sizeof(t) end, + __gc = function (t) t:close() end, + __index = { + close = function(t) + t:munmap() + if t.fd > 0 then S.close(t.fd) end + end, + munmap = function (t) + if t.map_pages > 0 then + S.munmap(t.map, (t.map_pages + 1) * S.getpagesize()) + t.map_pages = 0 + end + end, + -- read(2) interface, see `perf_attr.read_format` + -- @return u64 or an array of u64 + read = function (t, len) + local rvals = ffi.new('uint64_t [4]') + local nb, err = S.read(t.fd, rvals, len or ffi.sizeof(rvals)) + if not nb then return 
nil, err end + return nb == 8 and rvals[0] or rvals + end, + -- mmap(2) interface, see sampling interface (`perf_attr.sample_type` and `perf_attr.mmap`) + -- first page is metadata page, the others are sample_type dependent + mmap = function (t, pages) + t:munmap() + pages = pages or 8 + local map, err = S.mmap(nil, (pages + 1) * S.getpagesize(), "read, write", "shared", t.fd, 0) + if not map then return nil, err end + t.map = map + t.map_pages = pages + return pages + end, + meta = function (t) + return t.map_pages > 0 and ffi.cast("struct perf_event_mmap_page *", t.map) or nil + end, + -- next() function for __ipairs returning (len, event) pairs + -- it only retires read events when current event length is passed + next = function (t, curlen) + local buffer_size = S.getpagesize() * t.map_pages + local base = t.map + S.getpagesize() + local meta = t:meta() + -- Retire last read event or start iterating + if curlen then + meta.data_tail = meta.data_tail + curlen + end + -- End of ring buffer, yield + -- TODO: + if meta.data_head == meta.data_tail then + return + end + local e = pt.perf_event_header(base + (meta.data_tail % buffer_size)) + local e_end = base + (meta.data_tail + e.size) % buffer_size; + -- If the perf event wraps around the ring, we need to make a contiguous copy + if ffi.cast("uintptr_t", e_end) < ffi.cast("uintptr_t", e) then + local tmp_e = ffi.new("char [?]", e.size) + local len = (base + buffer_size) - ffi.cast('char *', e) + ffi.copy(tmp_e, e, len) + ffi.copy(tmp_e + len, base, e.size - len) + e = ffi.cast(ffi.typeof(e), tmp_e) + end + return e.size, e + end, + -- Various ioctl() wrappers + ioctl = function(t, cmd, val) return S.ioctl(t.fd, cmd, val or 0) end, + start = function(t) return t:ioctl("PERF_EVENT_IOC_ENABLE") end, + stop = function(t) return t:ioctl("PERF_EVENT_IOC_DISABLE") end, + refresh = function(t) return t:ioctl("PERF_EVENT_IOC_REFRESH") end, + reset = function(t) return t:ioctl("PERF_EVENT_IOC_RESET") end, + setfilter = 
function(t, val) return t:ioctl("PERF_EVENT_IOC_SET_FILTER", val) end, + setbpf = function(t, fd) return t:ioctl("PERF_EVENT_IOC_SET_BPF", pt.void(fd)) end, + }, + __ipairs = function(t) return t.next, t, nil end + }) +end + -- TODO reinstate this, more like fd is, hence changes to destroy --[[ t.aio_context = metatype("struct {aio_context_t ctx;}", { diff --git a/syscall/netbsd/ffifunctions.lua b/syscall/netbsd/ffifunctions.lua index 2a6741b425..25b32278ca 100644 --- a/syscall/netbsd/ffifunctions.lua +++ b/syscall/netbsd/ffifunctions.lua @@ -78,5 +78,7 @@ int __nanosleep50(const struct timespec *req, struct timespec *rem); int __timer_settime50(timer_t timerid, int flags, const struct itimerspec *new_value, struct itimerspec * old_value); int __timer_gettime50(timer_t timerid, struct itimerspec *curr_value); int __adjtime50(const struct timeval *delta, struct timeval *olddelta); + +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); ]] diff --git a/syscall/openbsd/ffi.lua b/syscall/openbsd/ffi.lua index d89b277f27..a53fc2b5b2 100644 --- a/syscall/openbsd/ffi.lua +++ b/syscall/openbsd/ffi.lua @@ -295,6 +295,7 @@ struct sigaction { append [[ int reboot(int howto); int ioctl(int d, unsigned long request, void *arg); +int fstatat(int dirfd, const char *pathname, struct stat *buf, int flags); /* not syscalls, but using for now */ int grantpt(int fildes); diff --git a/syscall/osx/c.lua b/syscall/osx/c.lua index 08d6a0339e..82d077f53b 100644 --- a/syscall/osx/c.lua +++ b/syscall/osx/c.lua @@ -42,6 +42,7 @@ local C = setmetatable({}, { }) -- new stat structure, else get legacy one; could use syscalls instead +-- does not work for fstatat C.stat = C.stat64 C.fstat = C.fstat64 C.lstat = C.lstat64 @@ -56,7 +57,7 @@ function C.getdirentries(fd, buf, len, basep) end ]] --- cannot find these anywhere! +-- cannot find these anywhere! Apparently not there since 64 bit inodes? 
--C.getdirentries = ffi.C._getdirentries --C.sigaction = ffi.C._sigaction diff --git a/syscall/osx/constants.lua b/syscall/osx/constants.lua index 9a6ec0714d..40108a2003 100644 --- a/syscall/osx/constants.lua +++ b/syscall/osx/constants.lua @@ -1109,5 +1109,27 @@ c.CLOCKTYPE = { c.CLOCKTYPE.REALTIME = c.CLOCKTYPE.SYSTEM -return c +c.CLOCK = strflag { + REALTIME = 0, + MONOTONIC_RAW = 4, + MONOTONIC_RAW_APPROX = 5, + MONOTONIC = 6, + UPTIME_RAW = 8, + UPTIME_RAW_APPROX = 9, + PROCESS_CPUTIME_ID = 12, + THREAD_CPUTIME_ID = 16, +} + +-- AT constants only in recent versions, should check when added +c.AT_FDCWD = atflag { + FDCWD = -2, +} +c.AT = multiflags { + EACCESS = 0x0010, + SYMLINK_NOFOLLOW = 0x0020, + SYMLINK_FOLLOW = 0x0040, + REMOVEDIR = 0x0080, +} + +return c diff --git a/syscall/osx/ffi.lua b/syscall/osx/ffi.lua index ae6fedc1b0..831e5621bf 100644 --- a/syscall/osx/ffi.lua +++ b/syscall/osx/ffi.lua @@ -30,7 +30,8 @@ typedef int64_t blkcnt_t; typedef int32_t blksize_t; typedef int32_t suseconds_t; typedef uint16_t nlink_t; -typedef uint64_t ino_t; // at least on recent desktop; TODO define as ino64_t +typedef uint64_t ino64_t; +typedef uint32_t ino_t; typedef long time_t; typedef int32_t daddr_t; typedef unsigned long clock_t; @@ -158,7 +159,7 @@ struct stat { dev_t st_dev; mode_t st_mode; nlink_t st_nlink; - ino_t st_ino; + ino64_t st_ino; uid_t st_uid; gid_t st_gid; dev_t st_rdev; @@ -174,6 +175,25 @@ struct stat { int32_t st_lspare; int64_t st_qspare[2]; }; +struct stat32 { + dev_t st_dev; + ino_t st_ino; + mode_t st_mode; + nlink_t st_nlink; + uid_t st_uid; + gid_t st_gid; + dev_t st_rdev; + struct timespec st_atimespec; + struct timespec st_mtimespec; + struct timespec st_ctimespec; + off_t st_size; + blkcnt_t st_blocks; + blksize_t st_blksize; + uint32_t st_flags; + uint32_t st_gen; + int32_t st_lspare; + int64_t st_qspare[2]; +}; union sigval { int sival_int; void *sival_ptr; @@ -292,6 +312,7 @@ int mount(const char *type, const char *dir, int flags, 
void *data); int stat64(const char *path, struct stat *sb); int lstat64(const char *path, struct stat *sb); int fstat64(int fd, struct stat *sb); +int fstatat(int dirfd, const char *pathname, struct stat32 *buf, int flags); int _getdirentries(int fd, char *buf, int nbytes, long *basep); int _sigaction(int signum, const struct sigaction *act, struct sigaction *oldact); diff --git a/syscall/osx/syscalls.lua b/syscall/osx/syscalls.lua index 47d7918db7..67949c1905 100644 --- a/syscall/osx/syscalls.lua +++ b/syscall/osx/syscalls.lua @@ -53,6 +53,14 @@ function S.clock_get_time(clock_serv, cur_time) return cur_time end +-- cannot find out how to get new stat type from fstatat +function S.fstatat(fd, path, buf, flags) + if not buf then buf = t.stat32() end + local ret, err = C.fstatat(c.AT_FDCWD[fd], path, buf, c.AT[flags]) + if ret == -1 then return nil, t.error(err or errno()) end + return buf +end + return S end diff --git a/syscall/osx/types.lua b/syscall/osx/types.lua index 4b3304cde0..204ad5aecd 100644 --- a/syscall/osx/types.lua +++ b/syscall/osx/types.lua @@ -120,6 +120,9 @@ end addtype(types, "stat", "struct stat", mt.stat) +-- for fstatat where we can'tseem to get 64 bit version at present +addtype(types, "stat32", "struct stat32", mt.stat) + local signames = {} local duplicates = {LWT = true, IOT = true, CLD = true, POLL = true} for k, v in pairs(c.SIG) do diff --git a/syscall/syscalls.lua b/syscall/syscalls.lua index 8ff4c5358c..1a29cd24d0 100644 --- a/syscall/syscalls.lua +++ b/syscall/syscalls.lua @@ -75,7 +75,13 @@ local function retiter(ret, err, array) end -- generic system calls -function S.close(fd) return retbool(C.close(getfd(fd))) end +function S.close(fd) + if fd == getfd(fd) then -- fd number + return retbool(C.close(getfd(fd))) + else -- fd object: avoid mulitple close + return fd:close() + end +end function S.chdir(path) return retbool(C.chdir(path)) end function S.fchdir(fd) return retbool(C.fchdir(getfd(fd))) end function S.fchmod(fd, mode) 
return retbool(C.fchmod(getfd(fd), c.MODE[mode])) end @@ -422,6 +428,7 @@ function S.getpid() return C.getpid() end function S.getppid() return C.getppid() end function S.getgid() return C.getgid() end function S.getegid() return C.getegid() end +function S.gettid() return C.gettid() end function S.setuid(uid) return retbool(C.setuid(uid)) end function S.setgid(gid) return retbool(C.setgid(gid)) end function S.seteuid(uid) return retbool(C.seteuid(uid)) end @@ -734,6 +741,7 @@ if C.getdents then buf = buf or t.buffer(size) local ret, err = C.getdents(getfd(fd), buf, size) if ret == -1 then return nil, t.error(err or errno()) end + if ret == 0 then return nil, nil end return t.dirents(buf, ret) end end diff --git a/syscall/types.lua b/syscall/types.lua index 17c74b1fec..539ab3c715 100644 --- a/syscall/types.lua +++ b/syscall/types.lua @@ -601,15 +601,10 @@ if bsdtypes then types = bsdtypes.init(c, types) end -- define dents type if dirent is defined if t.dirent then t.dirents = function(buf, size) -- buf should be char* - local d, i = nil, 0 + local i = 0 return function() -- TODO work out if possible to make stateless - if size > 0 and not d then - d = pt.dirent(buf) - i = i + d.d_reclen - return d - end while i < size do - d = pt.dirent(pt.char(d) + d.d_reclen) + local d = pt.dirent(buf + i) i = i + d.d_reclen if d.ino ~= 0 then return d end -- some systems use ino = 0 for deleted files before removed eg OSX; it is never valid end diff --git a/syscall/util.lua b/syscall/util.lua index 8fb543edf4..3f3515853f 100644 --- a/syscall/util.lua +++ b/syscall/util.lua @@ -56,22 +56,19 @@ function util.ls(name, buf, size) if err then return nil, err end local di return function() - local d, first - repeat + while true do + if di then + local d = di() + if d then return d.name, d end + end + -- Fetch more entries. 
+ local err + di, err = fd:getdents(buf, size) if not di then - local err - di, err = fd:getdents(buf, size) - if not di then - fd:close() - error(err) - end - first = true + fd:close() + if err then error(err) else return nil end end - d = di() - if not d then di = nil end - if not d and first then return nil end - until d - return d.name, d + end end end diff --git a/test/bsd.lua b/test/bsd.lua index 70cba01b39..5fcb4a3b74 100644 --- a/test/bsd.lua +++ b/test/bsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi @@ -89,7 +89,9 @@ test.filesystem_bsd = { test_chflags = function() local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.chflags(tmpfile, "uf_append")) + local ok, err = S.chflags(tmpfile, "uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -102,7 +104,9 @@ test.filesystem_bsd = { if not S.lchflags then error "skipped" end local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(S.lchflags(tmpfile, "uf_append")) + local ok, err = S.lchflags(tmpfile, "uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -114,7 +118,9 @@ test.filesystem_bsd = { test_fchflags = function() local fd = assert(S.creat(tmpfile, "RWXU")) assert(fd:write("append")) - assert(fd:chflags("uf_append")) + local ok, err = fd:chflags("uf_append") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -127,7 +133,9 @@ test.filesystem_bsd = { if not S.chflagsat then error "skipped" end local fd = assert(S.creat(tmpfile, "RWXU")) 
assert(fd:write("append")) - assert(S.chflagsat("fdcwd", tmpfile, "uf_append", "symlink_nofollow")) + local ok, err = S.chflagsat("fdcwd", tmpfile, "uf_append", "symlink_nofollow") + if not ok and err.OPNOTSUPP then error "skipped" end + assert(ok, err) assert(fd:write("append")) assert(fd:seek(0, "set")) local n, err = fd:write("not append") @@ -258,7 +266,8 @@ test.bsd_extattr = { assert(S.unlink(tmpfile)) local n, err = fd:extattr_get("user", "myattr", false) -- false does raw call with no buffer to return length if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support extattr - assert(not n and err.NOATTR) + assert(not n, "expected to fail") + assert(err.NOATTR, err) assert(fd:close()) end, test_extattr_getsetdel_fd = function() @@ -267,7 +276,8 @@ test.bsd_extattr = { assert(S.unlink(tmpfile)) local n, err = fd:extattr_get("user", "myattr", false) -- false does raw call with no buffer to return length if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support extattr - assert(not n and err.NOATTR) + assert(not n, "expected to fail") + assert(err.NOATTR, err) local n, err = fd:extattr_set("user", "myattr", "myvalue") if not n and err.OPNOTSUPP then error "skipped" end -- fs does not support setting extattr assert(n, err) diff --git a/test/ctest-linux.lua b/test/ctest-linux.lua index 4433013fcd..a531542231 100644 --- a/test/ctest-linux.lua +++ b/test/ctest-linux.lua @@ -66,6 +66,81 @@ ctypes["struct termios"] = nil -- not defined by glibc ctypes["struct k_sigaction"] = nil +-- eBPF not available on Travis / opaque types +ctypes["struct bpf_insn"] = nil +ctypes["union bpf_attr"] = nil +c.BPF_MAP = {} +c.BPF_CMD = {} +c.BPF_PROG = {} +c.BPF_ATTACH_TYPE = {} +c.BPF.ALU64 = nil +c.BPF.DW = nil +c.BPF.JSGT = nil +c.BPF.JSGE = nil +c.BPF.CALL = nil +c.BPF.EXIT = nil +c.BPF.TO_LE = nil +c.BPF.TO_BE = nil +c.BPF.ANY = nil +c.BPF.NOEXIST = nil +c.BPF.EXIST = nil +c.BPF.END = nil +c.BPF.ARSH = nil +c.BPF.XADD = nil +c.BPF.JNE = nil 
+c.BPF.MOV = nil +c.SYS.bpf = nil + +-- no perf_event_open on Travis CI +ctypes["struct perf_event_attr"] = nil +ctypes["struct perf_event_reader"] = nil +ctypes["struct perf_event_header"] = nil +ctypes["struct perf_event_mmap_page"] = nil +c.PERF_TYPE = {} +c.PERF_COUNT = {} +c.PERF_SAMPLE = {} +c.PERF_FLAG = {} +c.PERF_SAMPLE_REGS = {} +c.PERF_SAMPLE_BRANCH = {} +c.PERF_READ_FORMAT = {} +c.PERF_RECORD = {} +-- no perf_event_open ioctls on Travis CI +c.IOCTL.PERF_EVENT_IOC_ENABLE = nil +c.IOCTL.PERF_EVENT_IOC_DISABLE = nil +c.IOCTL.PERF_EVENT_IOC_REFRESH = nil +c.IOCTL.PERF_EVENT_IOC_RESET = nil +c.IOCTL.PERF_EVENT_IOC_PERIOD = nil +c.IOCTL.PERF_EVENT_IOC_SET_OUTPUT = nil +c.IOCTL.PERF_EVENT_IOC_SET_FILTER = nil +c.IOCTL.PERF_EVENT_IOC_ID = nil +c.IOCTL.PERF_EVENT_IOC_SET_BPF = nil + +-- not in kernel headers used by Travis CI +ctypes["struct scm_timestamping"] = nil +c.SCM.TSTAMP_ACK = nil +c.SCM.TSTAMP_SCHED = nil +c.SCM.TSTAMP_SND = nil +c.SCM.TIMESTAMPING_OPT_STATS = nil + +-- not in kernel headers used by Travis CI +c.SOF.TIMESTAMPING_LAST = nil +c.SOF.TIMESTAMPING_MASK = nil +c.SOF.TIMESTAMPING_OPT_CMSG = nil +c.SOF.TIMESTAMPING_OPT_ID = nil +c.SOF.TIMESTAMPING_OPT_PKTINFO = nil +c.SOF.TIMESTAMPING_OPT_STATS = nil +c.SOF.TIMESTAMPING_OPT_TSONLY = nil +c.SOF.TIMESTAMPING_OPT_TX_SWHW = nil +c.SOF.TIMESTAMPING_RAW_HARDWARE = nil +c.SOF.TIMESTAMPING_RX_HARDWARE = nil +c.SOF.TIMESTAMPING_RX_SOFTWARE = nil +c.SOF.TIMESTAMPING_SOFTWARE = nil +c.SOF.TIMESTAMPING_SYS_HARDWARE = nil +c.SOF.TIMESTAMPING_TX_ACK = nil +c.SOF.TIMESTAMPING_TX_HARDWARE = nil +c.SOF.TIMESTAMPING_TX_SCHED = nil +c.SOF.TIMESTAMPING_TX_SOFTWARE = nil + if abi.arch == "arm" then ctypes["struct statfs64"] = nil end -- padding difference, not that important for k, v in pairs(c.IOCTL) do if type(v) == "table" then c.IOCTL[k] = v.number end end @@ -223,6 +298,10 @@ c.TCP.QUEUE_SEQ = nil c.TCP.TIMESTAMP = nil c.TCP.USER_TIMEOUT = nil c.TCP.REPAIR_QUEUE = nil +c.RTA.NEWDST = nil +c.RTA.PREF = nil 
+c.RTA.VIA = nil +c.RTA.MFC_STATS = nil -- these are not in Musl at present TODO send patches to get them in c.IPPROTO.UDPLITE = nil @@ -279,6 +358,23 @@ c.SO.PEEK_OFF = nil c.SO.GET_FILTER = nil c.SO.NOFCS = nil c.SO.WIFI_STATUS = nil +c.SO.REUSEPORT = nil +c.SO.LOCK_FILTER = nil +c.SO.SELECT_ERR_QUEUE = nil +c.SO.BUSY_POLL = nil +c.SO.MAX_PACING_RATE = nil +c.SO.BPF_EXTENSIONS = nil +c.SO.INCOMING_CPU = nil +c.SO.ATTACH_BPF = nil +c.SO.DETACH_BPF = nil +c.SO.ATTACH_REUSEPORT_CBPF = nil +c.SO.ATTACH_REUSEPORT_EBPF = nil + +-- new fcntl +c.F.CANCELLK = nil +c.F.ADD_SEALS = nil +c.F.GET_SEALS = nil +c.F_SEAL = nil -- Musl changes some of the syscall constants in its 32/64 bit handling c.SYS.getdents = nil @@ -318,6 +414,8 @@ c.CBAUDEX = nil -- missing on my mips box c.AUDIT_ARCH.H8300 = nil +-- missing on CI +c.AUDIT_ARCH.AARCH64 = nil -- defined only in linux/termios.h which we cannot include on mips c.TIOCM.OUT1 = nil diff --git a/test/freebsd.lua b/test/freebsd.lua index 4a57af07ad..e296017f4b 100644 --- a/test/freebsd.lua +++ b/test/freebsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/test/helpers.lua b/test/helpers.lua new file mode 100644 index 0000000000..6f29c0a1d3 --- /dev/null +++ b/test/helpers.lua @@ -0,0 +1,81 @@ +-- misc helper functions + +local require, error, assert, tonumber, tostring, +setmetatable, pairs, ipairs, unpack, rawget, rawset, +pcall, type, table, string, math = +require, error, assert, tonumber, tostring, +setmetatable, pairs, ipairs, unpack, rawget, rawset, +pcall, type, table, string, math + +local debug, collectgarbage = require "debug", collectgarbage + +local ffi = require "ffi" +local bit = require "bit" + +local h = {} + +-- generic assert helper, mainly for tests +function h.assert(cond, err, ...) 
+ if not cond then + error(tostring(err or "unspecified error")) -- annoyingly, assert does not call tostring! + end + collectgarbage("collect") -- force gc, to test for bugs + if type(cond) == "function" then return cond, err, ... end + if cond == true then return ... end + return cond, ... +end + +-- endian conversion +if ffi.abi("be") then -- nothing to do + function h.htonl(b) return b end + function h.htons(b) return b end + function h.convle32(b) return bit.bswap(b) end -- used by file system capabilities, always stored as le +else + function h.htonl(b) return bit.bswap(b) end + function h.htons(b) return bit.rshift(bit.bswap(b), 16) end + function h.convle32(b) return b end -- used by file system capabilities, always stored as le +end +h.ntohl = h.htonl -- reverse is the same +h.ntohs = h.htons -- reverse is the same + +function h.octal(s) return tonumber(s, 8) end +local octal = h.octal + +function h.split(delimiter, text) + if delimiter == "" then return {text} end + if #text == 0 then return {} end + local list = {} + local pos = 1 + while true do + local first, last = text:find(delimiter, pos) + if first then + list[#list + 1] = text:sub(pos, first - 1) + pos = last + 1 + else + list[#list + 1] = text:sub(pos) + break + end + end + return list +end + +function h.trim(s) -- TODO should replace underscore with space + return (s:gsub("^%s*(.-)%s*$", "%1")) +end + +local split, trim = h.split, h.trim + +h.divmod = function(a, b) + return math.floor(a / b), a % b +end + +h.booltoc = setmetatable({ + [0] = 0, + [1] = 1, + [false] = 0, + [true] = 1, +}, {__call = function(tb, arg) return tb[arg or 0] end}) -- allow nil as false + +function h.ctobool(i) return tonumber(i) ~= 0 end + +return h diff --git a/test/linux-constants.lua b/test/linux-constants.lua index cb2de6e968..b25947f639 100644 --- a/test/linux-constants.lua +++ b/test/linux-constants.lua @@ -155,6 +155,21 @@ local function fixup_constants(abi, c) c.SECCOMP_MODE = nil c.SECCOMP_RET = nil c.MFD = 
nil + c.RTA.NEWDST = nil + c.RTA.PREF = nil + c.RTA.VIA = nil + c.RTA.MFC_STATS = nil + c.AUDIT_ARCH.AARCH64 = nil + c.SO.MAX_PACING_RATE = nil + c.SO.BPF_EXTENSIONS = nil + c.SO.INCOMING_CPU = nil + c.SO.ATTACH_BPF = nil + c.SO.DETACH_BPF = nil + c.SO.ATTACH_REUSEPORT_CBPF = nil + c.SO.ATTACH_REUSEPORT_EBPF = nil + c.F_SEAL = nil + c.F.ADD_SEALS = nil + c.F.GET_SEALS = nil -- these are not even in linux git head headers or names wrong c.O.ASYNC = nil @@ -202,9 +217,58 @@ local function fixup_constants(abi, c) c.SYS.getrandom = nil c.SYS.memfd_create = nil c.SYS.kexec_file_load = nil + c.SYS.bpf = nil -- new constants c.GRND = nil + -- requires Linux 3.19+, not supported on Travis + c.BPF_MAP = {} + c.BPF_CMD = {} + c.BPF_PROG = {} + c.BPF_ATTACH_TYPE = {} + c.BPF.ALU64 = nil + c.BPF.DW = nil + c.BPF.JSGT = nil + c.BPF.JSGE = nil + c.BPF.CALL = nil + c.BPF.EXIT = nil + c.BPF.TO_LE = nil + c.BPF.TO_BE = nil + c.BPF.END = nil + c.BPF.ARSH = nil + c.BPF.XADD = nil + c.BPF.JNE = nil + c.BPF.MOV = nil + c.BPF.ANY = nil + c.BPF.EXIST = nil + c.BPF.NOEXIST = nil + -- no perf_event_open on Travis CI + c.PERF_TYPE = {} + c.PERF_COUNT = {} + c.PERF_SAMPLE = {} + c.PERF_FLAG = {} + c.PERF_SAMPLE_REGS = {} + c.PERF_SAMPLE_BRANCH = {} + c.PERF_READ_FORMAT = {} + c.PERF_RECORD = {} + + c.SOF.TIMESTAMPING_LAST = nil + c.SOF.TIMESTAMPING_MASK = nil + c.SOF.TIMESTAMPING_OPT_CMSG = nil + c.SOF.TIMESTAMPING_OPT_ID = nil + c.SOF.TIMESTAMPING_OPT_PKTINFO = nil + c.SOF.TIMESTAMPING_OPT_STATS = nil + c.SOF.TIMESTAMPING_OPT_TSONLY = nil + c.SOF.TIMESTAMPING_OPT_TX_SWHW = nil + c.SOF.TIMESTAMPING_RAW_HARDWARE = nil + c.SOF.TIMESTAMPING_RX_HARDWARE = nil + c.SOF.TIMESTAMPING_RX_SOFTWARE = nil + c.SOF.TIMESTAMPING_SOFTWARE = nil + c.SOF.TIMESTAMPING_SYS_HARDWARE = nil + c.SOF.TIMESTAMPING_TX_ACK = nil + c.SOF.TIMESTAMPING_TX_HARDWARE = nil + c.SOF.TIMESTAMPING_TX_SCHED = nil + c.SOF.TIMESTAMPING_TX_SOFTWARE = nil return c end diff --git a/test/linux-structures.lua b/test/linux-structures.lua 
index 811d723dcb..0ab4a68a6c 100644 --- a/test/linux-structures.lua +++ b/test/linux-structures.lua @@ -28,6 +28,7 @@ local function fixup_structs(abi, ctypes) ctypes["struct capabilities"] = nil ctypes["struct cap"] = nil ctypes["struct {dev_t dev;}"] = nil + ctypes["struct perf_event_reader"] = nil -- standard headers use __kernel types for these or just fixed sizes ctypes.ino_t = nil @@ -68,7 +69,12 @@ local function fixup_structs(abi, ctypes) ctypes["struct sockaddr_storage"] = nil -- uses __kernel_ ctypes["struct k_sigaction"] = nil -- seems to be incorrect in headers ctypes["struct mmsghdr"] = nil -- too new for our headers - + ctypes["union bpf_attr"] = nil -- too new for our headers + ctypes["struct bpf_insn"] = nil -- too new for our headers + ctypes["struct perf_event_attr"] = nil -- too new for our headers + ctypes["struct perf_event_header"] = nil -- too new for our headers + ctypes["struct perf_event_mmap_page"] = nil -- too new for our headers + ctypes["struct scm_timestamping"] = nil -- too new for our headers ctypes["sigset_t"] = nil -- still some issues return ctypes diff --git a/test/linux.lua b/test/linux.lua index 40e4d48506..953e5ea859 100644 --- a/test/linux.lua +++ b/test/linux.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local abi = S.abi local types = S.types local c = S.c @@ -269,7 +269,9 @@ test.misc_linux = { end, ]] test_adjtimex = function() - local tt = assert(S.adjtimex()) + local tt, err = S.adjtimex() + if not tt and err.PERM then error "skipped" end + assert(tt, err) end, test_prctl = function() local n @@ -346,9 +348,12 @@ test.misc_linux = { end, test_memfd = function() if not S.memfd_create then error "skipped" end - local fd, err = S.memfd_create("", "cloexec") + local fd, err = S.memfd_create("", "cloexec, allow_sealing") if not fd and err.NOSYS then error "skipped" end assert(fd, err) + local seals = assert(fd:fcntl("get_seals")) + assert(seals 
== 0) + assert(fd:fcntl("add_seals", "shrink, grow, write, seal")) assert(fd:close()) end, } @@ -431,10 +436,6 @@ test.netlink = { end, test_getlink = function() local i = assert(nl.getlink()) - local st, err = S.stat("/sys/class/net") -- just in case sysfs not mounted - if not st then error "skipped" end - local df = assert(util.dirtable("/sys/class/net", true)) - assert_equal(#df, #i, "expect same number of interfaces as /sys/class/net") assert(i.lo, "expect a loopback interface") local lo = i.lo assert(lo.flags.up, "loopback interface should be up") @@ -486,7 +487,9 @@ test.netlink = { test_interfaces = function() local i = assert(nl.interfaces()) assert_equal(tostring(i.lo.inet[1].addr), "127.0.0.1", "loopback ipv4 on lo") - assert_equal(tostring(i.lo.inet6[1].addr), "::1", "loopback ipv6 on lo") + if i.lo.inet6[1] then + assert_equal(tostring(i.lo.inet6[1].addr), "::1", "loopback ipv6 on lo") + end end, test_newlink_flags_root = function() local p = assert(S.clone()) @@ -566,12 +569,6 @@ test.netlink = { assert(i.dummy0:down()) assert(i.dummy0:delete()) end, - test_interface_set_macaddr_fail = function() - local i = assert(nl.interfaces()) - assert(i.lo, "expect to find lo") - local ok, err = nl.newlink(i.lo.index, 0, 0, 0, "address", "46:9d:c9:06:dd:dd") - assert(not ok and err and (err.PERM or err.OPNOTSUPP), "should not be able to change macaddr on lo") - end, test_newlink_error_root = function() local ok, err = nl.newlink(-1, 0, "up", "up") assert(not ok, "expect bogus newlink to fail") @@ -670,6 +667,7 @@ test.netlink = { test_getroute_inet6 = function() local r = assert(nl.routes("inet6", "unspec")) local nr = r:match("::1/128") + if #nr == 0 then error "skipped" end -- no ipv6 support assert(#nr >= 1, "expect at least one matched route") -- one of my machines has two local lor = nr[1] assert_equal(tostring(lor.source), "::", "expect empty source route") @@ -766,6 +764,10 @@ test.netlink = { assert_equal(#n, 1) assert_equal(tostring(n[1].lladdr), 
"46:9d:c9:06:dd:dd") assert_equal(tostring(n[1].dst), "10.0.0.2") + assert_equal(tostring(n[1].dest), "10.0.0.2") + assert_equal(n[1].ifindex, i.dummy0.index) + assert_equal(n[1].state, c.NUD.PERMANENT) + assert_equal(n[1].flags, 0) assert(nl.delneigh(i.dummy0, {family = "inet"}, "dst", "10.0.0.2", "lladdr", "46:9d:c9:06:dd:dd")) assert(i.dummy0:delete()) end, @@ -1315,9 +1317,152 @@ test.bpf = { end, } +-- test eBPF filters +if S.bpf and not S.__rump then + test.bpf_root = {} + test.bpf_root.test_bpf_map_create = function() + local bpf = t.sock_filters(1, { + t.sock_filter("RET,K", 0) + }) + -- Update + local key, klen = ffi.new('int [1]', 0xdead), ffi.sizeof('int') + local fd, err = assert(S.bpf_map_create(c.BPF_MAP.HASH, klen, klen, 10)) + assert(S.bpf_map_op(c.BPF_CMD.MAP_UPDATE_ELEM, fd, key, key) == 0) + -- Retrieve + local val = ffi.new('int [1]', 0xbeef) + local ok, err = S.bpf_map_op(c.BPF_CMD.MAP_LOOKUP_ELEM, fd, key, val) + assert(ok and key[0] == val[0]) + S.close(fd) + end + test.bpf_root.test_bpf_prog_load = function() + local bpf = t.bpf_insns(2, { + t.bpf_insn("ALU64,MOV,K", 0, 0, 0, 1), + t.bpf_insn("JMP,EXIT"), + }) + local fd, err, log = S.bpf_prog_load(c.BPF_PROG.SOCKET_FILTER, bpf, 2) + if not fd then assert(false, err..': '..log) end + S.close(fd) + end +end + +-- test perf_event_open +if S.perf_event_open and not S.__rump then + test.perf_root = {} + test.perf_root.test_perf_open = function () + -- Create perf event attribute with dummy config + local pe = t.perf_event_attr1() + pe[0].type = "software" + pe[0].config = "sw_dummy" + pe[0].disabled = 1 + pe[0].exclude_kernel = 1 + pe[0].exclude_hv = 1 + -- Open event and read a dummy value + local fd = S.perf_event_open(pe) + fd:ioctl("PERF_EVENT_IOC_ENABLE", 0) + local count = t.buffer(ffi.sizeof('int64_t')) + local rb = fd:read(count, ffi.sizeof(count)) + fd:ioctl("PERF_EVENT_IOC_DISABLE", 0) + fd:close() + -- Check just the size of read count + assert(rb == ffi.sizeof(count)) + end + 
test.perf_root.test_perf_sw = function () + -- Read out a software perf counter + local pe = t.perf_event_attr1() + pe[0].type = "software" + pe[0].config = "sw_cpu_clock" + pe[0].exclude_kernel = 1 + pe[0].exclude_hv = 1 + -- Open event and read a dummy value + -- @note perf event fd has CLO_EXEC, must not fork + local reader = t.perf_reader(S.perf_event_open(pe)) + reader:start() + local ticks = reader:read() + reader:close() + -- Check just the size of read count + assert(ticks > 0) + end + test.perf_root.test_perf_attach = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + -- Get tracepoint id + local tp = assert(S.perf_tracepoint("/sys/kernel/debug/tracing/events/syscalls/sys_enter_getcwd")) + local reader = S.perf_attach_tracepoint(tp) + -- Trace getcwd() syscall + reader:start() + S.getcwd() + S.getcwd() + local cnt = reader:read() + reader:stop() + reader:close() + -- Check value + assert(cnt == 2) + end + test.perf_root.test_perf_sampling = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + local sample_t = ffi.typeof [[ + struct { + struct perf_event_header header; + uint32_t size; + struct { + uint16_t id; + uint8_t flags; + uint8_t preempt_count; + int pid; + }; + uint64_t ip; + } * + ]] + -- Get tracepoint id + local tp = assert(S.perf_tracepoint("/sys/kernel/debug/tracing/events/syscalls/sys_enter_getcwd")) + local reader = S.perf_attach_tracepoint(tp) + -- Trace getcwd() syscall + reader:mmap() + reader:start() + for i = 1,10 do S.getcwd() end + reader:stop() + -- Read samples from mmap + local cnt = 0; + for len,e in ipairs(reader) do + if e.type ~= c.PERF_RECORD.SAMPLE then break end + -- Check if we're the caller + e = ffi.cast(sample_t, e) + if e.pid == S.getpid() then + cnt = cnt + 1 + end + end + reader:close() + -- Check if we got all samples + assert(cnt == 10) + end + 
test.perf_root.test_perf_kprobe = function () + if not S.statfs("/sys/kernel/debug/tracing/events") then + print('skipping') -- debugfs must be mounted + return + end + -- Attach a kprobe to open() + local tp = assert(S.perf_probe("kprobe", "myprobe", "do_sys_open $retval", true)) + local reader = S.perf_attach_tracepoint(tp) + reader:start() + S.open("/tmp", "rdonly") + local cnt = reader:read() + reader:stop() + reader:close() + -- Detach probe + S.perf_probe("kprobe", "myprobe", false) + -- See if we hit the probe + assert(cnt == 1) + end +end + -- TODO remove arch tests. Unclear if my ppc/arm does not support or a bug, retest later with newer kernel -- still ppc issues with 3.12.6 ppc, need to debug more, and mips issues -if not (abi.arch == "ppc64le" or abi.arch == "ppc" or abi.arch == "arm" or abi.arch == "mips" or S.__rump) then -- cannot test on rump as uses clone() +if not (abi.arch == "ppc64le" or abi.arch == "ppc" or abi.arch == "mips" or S.__rump) then -- cannot test on rump as uses clone() test.seccomp = { test_no_new_privs = function() -- this must be done for non root to call type 2 seccomp local p = assert(S.clone()) @@ -1481,7 +1626,7 @@ test.seccomp = { local pid = S.getpid() local ofd, err = S.open("/dev/null", "rdonly") -- not allowed fork_assert(not ofd, "should not run open") - fork_assert(err.errno == nr.SYS.open, "syscall that did not work should be open") + fork_assert(err.errno == nr.SYS.open or err.errno == nr.SYS.openat, "syscall that did not work should be open[at]") local pid = S.getpid() S._exit() else @@ -1764,7 +1909,7 @@ test.processes_linux = { fork_assert(S.getppid() == pid0, "parent pid should be previous pid") S.exit(23) else -- parent - local infop, rusage = assert(S.waitid("all", 0, "exited, stopped, continued")) + local infop, rusage = assert(S.waitid("pid", pid, "exited, stopped, continued")) assert_equal(infop.signo, c.SIG.CHLD, "waitid to return SIGCHLD") assert_equal(infop.status, 23, "exit should be 23") 
assert_equal(infop.code, c.SIGCLD.EXITED, "normal exit expected") @@ -1784,6 +1929,9 @@ test.processes_linux = { assert(status.EXITSTATUS == 23, "exit should be 23") end end, + test_tid = function() + assert(S.getpid() == S.gettid(), "PID should be the same as TID") + end, } test.scheduler = { test_getcpu = function() @@ -1850,13 +1998,11 @@ test.swap = { assert_equal(c.SWAP_FLAG["23, discard"], c.SWAP_FLAG["prefer, discard"] + bit.lshift(23, c.SWAP_FLAG["prio_shift"])) end, test_swap_fail = function() - local ex = "PERM" -- EPERM if not root - if S.geteuid() == 0 then ex = "INVAL" end local ok, err = S.swapon("/dev/null", "23, discard") if not ok and err.NOSYS then return end -- Android does not implement swap, so skip test - assert(not ok and err[ex], "should not create swap on /dev/null") + assert(not ok and (err.PERM or err.INVAL), "should not create swap on /dev/null") local ok, err = S.swapoff("/dev/null") - assert(not ok and err[ex], "no swap on /dev/null") + assert(not ok and (err.PERM or err.INVAL), "no swap on /dev/null") end, -- TODO need mkswap to test success } diff --git a/include/luaunit/luaunit.lua b/test/luaunit.lua similarity index 100% rename from include/luaunit/luaunit.lua rename to test/luaunit.lua diff --git a/test/netbsd.lua b/test/netbsd.lua index 9570d393d4..3039ed49f9 100644 --- a/test/netbsd.lua +++ b/test/netbsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/test/openbsd.lua b/test/openbsd.lua index ce2b4a7125..755c32094a 100644 --- a/test/openbsd.lua +++ b/test/openbsd.lua @@ -2,7 +2,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/test/osx.lua b/test/osx.lua index 75ff6b932c..5346ccdbce 100644 --- a/test/osx.lua +++ b/test/osx.lua @@ -2,7 +2,7 @@ local function 
init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/test/rump.lua b/test/rump.lua index d46a47cb3a..afd161209d 100644 --- a/test/rump.lua +++ b/test/rump.lua @@ -3,7 +3,7 @@ local function init(S) -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local types = S.types local c = S.c local abi = S.abi diff --git a/include/strict/strict.lua b/test/strict.lua similarity index 100% rename from include/strict/strict.lua rename to test/strict.lua diff --git a/test/test.lua b/test/test.lua index 64c875dc70..fddee0a6be 100644 --- a/test/test.lua +++ b/test/test.lua @@ -4,12 +4,9 @@ arg = arg or {} --- only use this installation for tests -package.path = "./?.lua;" +local strict = require "test.strict" -local strict = require "include.strict.strict" - -local helpers = require "syscall.helpers" +local helpers = require "test.helpers" local assert = helpers.assert @@ -108,7 +105,7 @@ local function assert_equal(...) 
end USE_EXPECTED_ACTUAL_IN_ASSERT_EQUALS = true -- strict wants this to be set -local luaunit = require "include.luaunit.luaunit" +local luaunit = require "test.luaunit" local sysfile = debug.getinfo(S.open).source local cov = {active = {}, cov = {}} @@ -953,7 +950,7 @@ test_file_operations_at = { local fd = assert(S.open(".")) assert(util.writefile(tmpfile, teststring, "RWXU")) local stat = assert(fd:fstatat(tmpfile)) - assert(stat.size == #teststring, "expect length to br what was written") + assert(stat.size == #teststring, "expect length to be what was written") assert(fd:close()) assert(S.unlink(tmpfile)) end, @@ -1356,7 +1353,9 @@ test_sockets_pipes = { assert(ss:nonblock()) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1410,6 +1409,7 @@ test_sockets_pipes = { local ok, err = cs:connect(ba6) local as = ss:accept() local ok, err = cs:connect(ba6) + if err.ADDRNOTAVAIL or err.NETUNREACH then error "skipped" end assert(ok or err.ISCONN, "unexpected error " .. 
tostring(err)); assert(ss:block()) -- force accept to wait as = as or assert(ss:accept()) @@ -1455,7 +1455,9 @@ test_sockets_pipes = { assert(ss:setsockopt(c.IPPROTO.IPV6, c.IPV6.V6ONLY, 1)) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1497,7 +1499,9 @@ test_sockets_pipes = { assert(ss:setsockopt(c.IPPROTO.IPV6, c.IPV6.V6ONLY, 1)) local sa = assert(t.sockaddr_in6(0, "loopback")) assert_equal(sa.family, c.AF.INET6) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local ba = assert(ss:getsockname()) assert_equal(ba.family, c.AF.INET6) assert(ss:listen()) -- will fail if we did not bind @@ -1534,7 +1538,9 @@ test_sockets_pipes = { local loop6 = "::1" local cs = assert(S.socket("inet6", "dgram")) local sa = assert(t.sockaddr_in6(0, loop6)) - assert(ss:bind(sa)) + ok, err = ss:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) local bsa = ss:getsockname() -- find bound address local n = assert(cs:sendto(teststring, nil, c.MSG.NOSIGNAL or 0, bsa)) -- got a sigpipe here on MIPS local f = assert(ss:recv(buf, size)) @@ -1645,7 +1651,9 @@ test_sockets_pipes = { assert(s, err) local s = assert(S.socket("inet6", "stream")) local sa = t.sockaddr_in6(0, "loopback") - assert(s:bind(sa)) + ok, err = s:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) assert_equal(s:getsockopt("socket", "keepalive"), 0) assert(s:setsockopt("socket", "keepalive", 1)) assert(s:getsockopt("socket", "keepalive") ~= 0) @@ -1668,7 +1676,9 @@ test_sockets_pipes = { assert(s, err) local s = assert(S.socket("inet6", "stream")) local sa = t.sockaddr_in6(0, "loopback") - 
assert(s:bind(sa)) + ok, err = s:bind(sa) + if not ok and err.ADDRNOTAVAIL then error "skipped" end + assert(ok, err) assert_equal(s:getsockopt(c.IPPROTO.TCP, c.TCP.NODELAY), 0) assert(s:setsockopt(c.IPPROTO.TCP, c.TCP.NODELAY, 1)) assert(s:getsockopt(c.IPPROTO.TCP, c.TCP.NODELAY) ~= 0) @@ -1881,7 +1891,8 @@ test_termios = { local ws, err = S.stdout:ioctl("TIOCGWINSZ") if not ws and err.NOTTY then error "skipped" end -- stdout might not be a tty in test env assert(ws, err) - assert(ws.row > 0 and ws.col > 0) + if ws.row == 0 and ws.col == 0 then error "skipped" end + assert(ws.row > 0 and ws.col > 0, "expect positive winsz") end, } @@ -1946,9 +1957,6 @@ test_raw_socket = { assert(cs == expected, "expect correct ip checksum: got " .. string.format("%%%04X", cs) .. " expected " .. string.format("%%%04X", expected)) end, test_raw_udp_root = function() -- TODO create some helper functions, this is not very nice - - local h = require "syscall.helpers" -- TODO should not have to use later - local loop = "127.0.0.1" local raw = assert(S.socket("inet", "raw", "raw")) -- needed if not on Linux @@ -1972,8 +1980,8 @@ test_raw_socket = { local ca = cl:getsockname() -- TODO iphdr should have __index helpers for endianness etc (note use raw s_addr) - iphdr[0] = {ihl = 5, version = 4, tos = 0, id = 0, frag_off = h.htons(0x4000), ttl = 64, protocol = c.IPPROTO.UDP, check = 0, - saddr = sa.sin_addr.s_addr, daddr = ca.sin_addr.s_addr, tot_len = h.htons(len)} + iphdr[0] = {ihl = 5, version = 4, tos = 0, id = 0, frag_off = helpers.htons(0x4000), ttl = 64, protocol = c.IPPROTO.UDP, check = 0, + saddr = sa.sin_addr.s_addr, daddr = ca.sin_addr.s_addr, tot_len = helpers.htons(len)} --udphdr[0] = {src = sport, dst = ca.port, length = udplen} -- doesnt work with metamethods udphdr[0].src = sport @@ -2215,9 +2223,6 @@ test_proc = { local found = false if #ps == 0 then error "skipped" end -- not mounted but mount point exists for i = 1, #ps do - if ps[i].pid == 1 then - 
assert(ps[i].cmdline:find("init") or ps[i].cmdline:find("systemd"), "expect init or systemd to be process 1 usually") - end if ps[i].pid == me then found = true end end assert(found, "expect to find my process in ps") @@ -2234,7 +2239,6 @@ test_proc = { local p = util.proc(1) if not p.cmdline then error "skipped" end -- no files found, /proc not mounted assert(p and p.cmdline, "expect init to have cmdline") - assert(p.cmdline:find("init") or p.cmdline:find("systemd"), "expect init or systemd to be process 1 usually") end, } @@ -2305,15 +2309,13 @@ test_mmap = { test_processes = { test_nice = function() local n = assert(S.getpriority("process")) - assert_equal(n, 0, "process should start at priority 0") - local nn = assert(S.nice(1)) - assert_equal(nn, 1) - local nn = assert(S.setpriority("process", 0, 1)) -- sets to 1, which it already is + --assert_equal(n, 0, "process should start at priority 0") + --local nn = assert(S.nice(1)) + --assert_equal(nn, 1) + --local nn = assert(S.setpriority("process", 0, n)) -- sets to 1, which it already is end, test_fork_wait = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2327,8 +2329,6 @@ test_processes = { end, test_fork_waitpid = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2357,8 +2357,6 @@ test_processes = { end, test_fork_wait4 = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 
0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2373,8 +2371,6 @@ test_processes = { end, test_fork_wait3 = function() local pid0 = S.getpid() - assert(pid0 > 1, "expecting my pid to be larger than 1") - assert(S.getppid() > 1, "expecting my parent pid to be larger than 1") local pid = assert(S.fork()) if pid == 0 then -- child fork_assert(S.getppid() == pid0, "parent pid should be previous pid") @@ -2492,7 +2488,8 @@ if S.geteuid() == 0 then local i = assert(nl.interfaces()) local lo = assert(i.lo) assert(lo:up()) - assert(S.mount("none", "/sys", "sysfs")) + -- Do not destroy "/sys" if it is mounted + assert(S.statfs("/sys/kernel") or S.mount("none", "/sys", "sysfs")) end else -- not Linux -- run all tests, no namespaces available From 09b29d986810113f85f84e76b6f75bfdd5cbf126 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 6 Nov 2019 11:44:24 +0100 Subject: [PATCH 043/209] ljsyscall: add some XDP constants --- lib/ljsyscall/syscall/linux/constants.lua | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/ljsyscall/syscall/linux/constants.lua b/lib/ljsyscall/syscall/linux/constants.lua index dcbf608c66..de723e2522 100644 --- a/lib/ljsyscall/syscall/linux/constants.lua +++ b/lib/ljsyscall/syscall/linux/constants.lua @@ -426,6 +426,7 @@ c.SOL = strflag { ATM = 264, AAL = 265, IRDA = 266, + XDP = 283 } if arch.SOLSOCKET then c.SOL.SOCKET = arch.SOLSOCKET else c.SOL.SOCKET = 1 end @@ -483,6 +484,14 @@ c.SO = strflag(arch.SO or { ATTACH_BPF = 50, ATTACH_REUSEPORT_CBPF = 51, ATTACH_REUSEPORT_EBPF = 52, + XDP_MMAP_OFFSETS = 1, + XDP_RX_RING = 2, + XDP_TX_RING = 3, + XDP_UMEM_REG = 4, + XDP_UMEM_FILL_RING = 5, + XDP_UMEM_COMPLETION_RING = 6, + XDP_STATISTICS = 7, + XDP_OPTIONS = 8 }) c.SO.GET_FILTER = c.SO.ATTACH_FILTER @@ -1433,6 +1442,7 @@ c.AF = strflag { CAIF = 37, ALG = 38, NFC = 39, + XDP = 44 } c.AF.UNIX = c.AF.LOCAL From 3f820107143c026ea72f486a474cdcf307735ce4 Mon Sep 17 00:00:00 2001 From: Max 
Rottenkolber Date: Fri, 8 Nov 2019 17:19:25 +0000 Subject: [PATCH 044/209] ljsyscall: fix a bug in getsockopt when querying for structs --- lib/ljsyscall/syscall/syscalls.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/ljsyscall/syscall/syscalls.lua b/lib/ljsyscall/syscall/syscalls.lua index 1a29cd24d0..c6d3417028 100644 --- a/lib/ljsyscall/syscall/syscalls.lua +++ b/lib/ljsyscall/syscall/syscalls.lua @@ -327,7 +327,9 @@ function S.getsockopt(fd, level, optname, optval, optlen) local ret, err = C.getsockopt(getfd(fd), c.SOL[level], c.SO[optname], optval, len) if ret == -1 then return nil, t.error(err or errno()) end if len[0] ~= optlen then error("incorrect optlen for getsockopt: set " .. optlen .. " got " .. len[0]) end - return optval[0] -- TODO will not work if struct, eg see netfilter + local ok, ret = pcall(function () return optval[0] end) + if ok then return ret + else return optval end end function S.bind(sockfd, addr, addrlen) local saddr = pt.sockaddr(addr) From 1e176cb1ce3f8fd77e5b0728402e93901887f2cd Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 8 Nov 2019 17:20:38 +0000 Subject: [PATCH 045/209] ljsyscall: lookup strflags in linux bpf(2) wrappers --- lib/ljsyscall/syscall/linux/syscalls.lua | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/ljsyscall/syscall/linux/syscalls.lua b/lib/ljsyscall/syscall/linux/syscalls.lua index 0921241608..6e6e7d93cb 100644 --- a/lib/ljsyscall/syscall/linux/syscalls.lua +++ b/lib/ljsyscall/syscall/linux/syscalls.lua @@ -836,7 +836,7 @@ end if C.bpf then local function ptr_to_u64(p) return ffi.cast('uint64_t', ffi.cast('void *', p)) end function S.bpf(cmd, attr) - return C.bpf(cmd, attr) + return C.bpf(c.BPF_CMD[cmd], attr) end function S.bpf_prog_load(type, insns, len, license, version, log_level) if not license then license = "GPL" end -- Must stay alive during the syscall @@ -852,7 +852,7 @@ if C.bpf then end end local attr = t.bpf_attr1() - 
attr[0].prog_type = type + attr[0].prog_type = c.BPF_PROG[type] attr[0].insns = ptr_to_u64(insns) attr[0].insn_cnt = len attr[0].license = ptr_to_u64(license) @@ -868,7 +868,7 @@ if C.bpf then end function S.bpf_map_create(type, key_size, value_size, max_entries) local attr = t.bpf_attr1() - attr[0].map_type = type + attr[0].map_type = c.BPF_MAP[type] attr[0].key_size = key_size attr[0].value_size = value_size attr[0].max_entries = max_entries @@ -880,7 +880,7 @@ if C.bpf then end function S.bpf_map_op(op, fd, key, val_or_next, flags) local attr = t.bpf_attr1() - attr[0].map_fd = fd + attr[0].map_fd = getfd(fd) attr[0].key = ptr_to_u64(key) attr[0].value = ptr_to_u64(val_or_next) attr[0].flags = flags or 0 From 573a29b053dac437b97d43b84d9fedb1a0626e17 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 8 Nov 2019 17:21:33 +0000 Subject: [PATCH 046/209] ljsyscall: add BPF_MAP type XSKMAP --- lib/ljsyscall/syscall/linux/constants.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/ljsyscall/syscall/linux/constants.lua b/lib/ljsyscall/syscall/linux/constants.lua index de723e2522..ec0c32807b 100644 --- a/lib/ljsyscall/syscall/linux/constants.lua +++ b/lib/ljsyscall/syscall/linux/constants.lua @@ -2155,6 +2155,7 @@ c.BPF_MAP = strflag { DEVMAP = 14, SOCKMAP = 15, CPUMAP = 16, + XSKMAP = 17 } -- BPF syscall commands From 63f0cfeb205eed3e610bc15771ee97b721d93590 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 14 Aug 2017 22:47:50 +0200 Subject: [PATCH 047/209] apps.test.synth: add packets option --- src/apps/test/README.md | 5 +++++ src/apps/test/synth.lua | 36 ++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/apps/test/README.md b/src/apps/test/README.md index 6d4ef5bb31..28dda5b634 100644 --- a/src/apps/test/README.md +++ b/src/apps/test/README.md @@ -79,6 +79,11 @@ Generate a random payload for each packet in `sizes`. Insert the packet number (32bit uint) directly after the ethertype. 
The packet number starts at 0 and is sequential on each output link. +— Key **packets** + +Emit *packets* (an array of *packets*) instead of synthesizing packets. When +this option is used *src*, *dst*, *sizes*, and *random_payload* are ignored. + ## Npackets (apps.test.npackets) The `Npackets` app allows are most N packets to flow through it. Any further diff --git a/src/apps/test/synth.lua b/src/apps/test/synth.lua index c75ebd2e93..6bf18ed1c9 100644 --- a/src/apps/test/synth.lua +++ b/src/apps/test/synth.lua @@ -15,28 +15,32 @@ Synth = { dst = {default='00:00:00:00:00:00'}, random_payload = { default = false }, packet_id = { default = false }, + packets = {} } } function Synth:new (conf) assert(#conf.sizes >= 1, "Needs at least one size.") - local packets = {} - for i, size in ipairs(conf.sizes) do - local payload_size = size - ethernet:sizeof() - assert(payload_size >= 0 and payload_size <= 1536, - "Invalid payload size: "..payload_size) - local data - if conf.random_payload then - data = lib.random_bytes(payload_size) - else - data = ffi.new("char[?]", payload_size) + local packets = conf.packets + if not packets then + packets = {} + for i, size in ipairs(conf.sizes) do + local payload_size = size - ethernet:sizeof() + assert(payload_size >= 0 and payload_size <= 1536, + "Invalid payload size: "..payload_size) + local data + if conf.random_payload then + data = lib.random_bytes(payload_size) + else + data = ffi.new("char[?]", payload_size) + end + local dgram = datagram:new(packet.from_pointer(data, payload_size)) + local ether = ethernet:new({ src = ethernet:pton(conf.src), + dst = ethernet:pton(conf.dst), + type = payload_size }) + dgram:push(ether) + packets[i] = dgram:packet() end - local dgram = datagram:new(packet.from_pointer(data, payload_size)) - local ether = ethernet:new({ src = ethernet:pton(conf.src), - dst = ethernet:pton(conf.dst), - type = payload_size }) - dgram:push(ether) - packets[i] = dgram:packet() end return 
setmetatable({packets=packets}, {__index=Synth}) end From f53552561acdbb4e5150f0c60ecffdee7d7bc891 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 8 Mar 2019 12:34:50 +0100 Subject: [PATCH 048/209] apps.test.synth: emit all given packets --- src/apps/test/synth.lua | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/apps/test/synth.lua b/src/apps/test/synth.lua index 6bf18ed1c9..d4558dc14d 100644 --- a/src/apps/test/synth.lua +++ b/src/apps/test/synth.lua @@ -42,25 +42,29 @@ function Synth:new (conf) packets[i] = dgram:packet() end end - return setmetatable({packets=packets}, {__index=Synth}) + return setmetatable( + {cursor=0, pktid=(conf.packet_id and 0), packets=packets}, + {__index=Synth} + ) end function Synth:pull () + local burst = engine.pull_npackets + local packets, npackets = self.packets, #self.packets for _, o in ipairs(self.output) do - local n = 0 - while n < engine.pull_npackets do - for _, p in ipairs(self.packets) do - local c = packet.clone(p) - if self.packet_id then - -- 14 == sizeof(dstmac srcmac type) - ffi.cast("uint32_t *", clone.data+14)[0] = lib.htonl(self.pktid) - self.pktid = self.pktid + 1 - end - transmit(o, c) - n = n + 1 - end + local cursor = self.cursor + for _ = 1, burst do + local p = packet.clone(packets[1+cursor]) + if self.packet_id then + -- 14 == sizeof(dstmac srcmac type) + ffi.cast("uint32_t *", p.data+14)[0] = lib.htonl(self.pktid) + self.pktid = self.pktid + 1 + end + transmit(o, p) + cursor = (cursor + 1) % npackets end end + self.cursor = (self.cursor + burst) % npackets end function Synth:stop () From 61745637fb2bc8b7c4e5fce88ac3be9c8f577156 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 14 Nov 2019 16:03:13 +0000 Subject: [PATCH 049/209] core.packet: expose constants for packet alignment and headroom --- src/core/packet.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/core/packet.lua b/src/core/packet.lua index 
1495483403..bc7a14cd5f 100644 --- a/src/core/packet.lua +++ b/src/core/packet.lua @@ -24,11 +24,11 @@ max_payload = tonumber(C.PACKET_PAYLOAD_SIZE) -- For operations that add or remove headers from the beginning of a -- packet, instead of copying around the payload we just move the -- packet structure as a whole around. -local packet_alignment = 512 -local default_headroom = 256 +packet_alignment = 512 +default_headroom = 256 -- The Intel82599 driver requires even-byte alignment, so let's keep -- things aligned at least this much. -local minimum_alignment = 2 +minimum_alignment = 2 local function get_alignment (addr, alignment) -- Precondition: alignment is a power of 2. From 984d4a41c28c606ab137358759bbfc1a857bdadb Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 13 Nov 2019 16:31:00 +0000 Subject: [PATCH 050/209] apps.xdp: driver for AF_XDP sockets --- src/apps/xdp/README.md | 58 ++++ src/apps/xdp/bpf.lua | 182 +++++++++++ src/apps/xdp/xdp.lua | 676 +++++++++++++++++++++++++++++++++++++++++ src/xdp.snabb | 407 +++++++++++++++++++++++++ 4 files changed, 1323 insertions(+) create mode 100644 src/apps/xdp/README.md create mode 100644 src/apps/xdp/bpf.lua create mode 100644 src/apps/xdp/xdp.lua create mode 100644 src/xdp.snabb diff --git a/src/apps/xdp/README.md b/src/apps/xdp/README.md new file mode 100644 index 0000000000..81a3f325ab --- /dev/null +++ b/src/apps/xdp/README.md @@ -0,0 +1,58 @@ +# XDP socket app (apps.xdp.xdp) + +The `XDP` app implements a driver for Linux `AF_XDP` sockets. + +Its links are named `input` and `output`. + + DIAGRAM: XDP + +-----------+ + | | + input ---->* XDP *----> output + | | + +-----------+ + +**Important:** To use the _XDP_ app, “Snabb XDP mode“ must be enabled by +calling `xdp.snabb_enable_xdp()`. Calling this function replaces Snabb's native +memory allocator with the _UMEM_ allocator. The caller must ensure that no +packets have been allocated via `packet.allocate()` prior to calling this +function. 
+ +## _Caveats_ + + * Memory allocated by the UMEM allocator can not be used with _DMA_ + drivers: using the XDP app precludes the use of Snabb’s native + hardware drivers such as `apps.intel_mp.intel_mp`. + + * Memory allocated by the UMEM allocator can not be shared with + other Snabb processes in the same process group: using + snabb_enable_xdp precludes the use of Interlink apps + (`apps.interlink`). + +## Maximum MTU + +Due to a combination of how Snabb uses packet buffers and a limitation of +`AF_XDP` the effective maximum MTU of the XDP app is 3,582. + +## Configuration + +— Key **ifname** + +*Required*. The name of the interface as shown in `ip link`. + +— Key **queue** + +*Optional*. Queue to bind to (zero based). The default is queue 0. + +## Module functions + +— Function **snabb_enable_xdp** + +Enables “Snabb XDP mode”. See _Caveats_! + +## Setting up XDP capable devices under Linux + +``` +$ echo 0000:01:00.0 > /sys/bus/pci/drivers/ixgbe/bind +$ ip link set ens1f0 addr 02:00:00:00:00:00 +$ ethtool --set-channels ens1f0 combined 1 +``` diff --git a/src/apps/xdp/bpf.lua b/src/apps/xdp/bpf.lua new file mode 100644 index 0000000000..44b4d68059 --- /dev/null +++ b/src/apps/xdp/bpf.lua @@ -0,0 +1,182 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +local ffi = require("ffi") +local bor = bit.bor + +-- BPF: just enough eBPF to assemble trivial XDP programs. 
+-- +-- See "BPF Architecture": +-- https://docs.cilium.io/en/v1.6/bpf/#bpf-architecture +-- +-- See Linux v4.19: +-- include/uapi/linux/bpf_common.h +-- include/uapi/linux/bpf.h +-- tools/include/linux/filter.h + +ins = ffi.typeof[[ + struct { + uint8_t op; /* opcode */ + uint8_t dst:4; /* dest register */ + uint8_t src:4; /* source register */ + int16_t off; /* signed offset */ + int32_t imm; /* signed immediate constant */ + } __attribute__((packed)) +]] + +c = { -- Op class + LD = 0x00, + LDX = 0x01, + ST = 0x02, + STX = 0x03, + ALU = 0x04, + JMP = 0x05, + RET = 0x06, + MISC = 0x07, + ALU64 = 0x07 -- alu mode in double word width +} + +f = { -- Load/store width + W = 0x00, -- 32-bit + H = 0x08, -- 16-bit + B = 0x10, -- 8-bit + DW = 0x18 -- 64-bit +} + +m = { -- Op mode + IMM = 0x00, + ABS = 0x20, + IND = 0x40, + MEM = 0x60, + LEN = 0x80, + MSH = 0xa0, + XADD = 0xc0 -- exclusive add +} + +a = { -- ALU mode + ADD = 0x00, + SUB = 0x10, + MUL = 0x20, + DIV = 0x30, + OR = 0x40, + AND = 0x50, + LSH = 0x60, + RSH = 0x70, + NEG = 0x80, + MOD = 0x90, + XOR = 0xa0, + MOV = 0xb0 +} + +s = { -- Src mode + K = 0x00, + X = 0x08, + MAP_FD = 0x01 +} + +j = { -- JMP mode + JA = 0x00, + JEQ = 0x10, + JGT = 0x20, + JGE = 0x30, + JSET = 0x40, + JNE = 0x50, + JLT = 0xa0, + JLE = 0xb0, + JSGT = 0x60, + JSGE = 0x70, + JSLT = 0xc0, + JSLE = 0xd0, + CALL = 0x80, + EXIT = 0x90 +} + +fn = { -- Built-in helpers + unspec = 0, + map_lookup_elem = 1, + map_update_elem = 2, + map_delete_elem = 3, + probe_read = 4, + ktime_get_ns = 5, + trace_printk = 6, + get_prandom_u32 = 7, + get_smp_processor_id = 8, + skb_store_bytes = 9, + l3_csum_replace = 10, + l4_csum_replace = 11, + tail_call = 12, + clone_redirect = 13, + get_current_pid_tgid = 14, + get_current_uid_gid = 15, + get_current_comm = 16, + get_cgroup_classid = 17, + skb_vlan_push = 18, + skb_vlan_pop = 19, + skb_get_tunnel_key = 20, + skb_set_tunnel_key = 21, + perf_event_read = 22, + redirect = 23, + get_route_realm = 24, + 
perf_event_output = 25, + skb_load_bytes = 26, + get_stackid = 27, + csum_diff = 28, + skb_get_tunnel_opt = 29, + skb_set_tunnel_opt = 30, + skb_change_proto = 31, + skb_change_type = 32, + skb_under_cgroup = 33, + get_hash_recalc = 34, + get_current_task = 35, + probe_write_user = 36, + current_task_under_cgroup = 37, + skb_change_tail = 38, + skb_pull_data = 39, + csum_update = 40, + set_hash_invalid = 41, + get_numa_node_id = 42, + skb_change_head = 43, + xdp_adjust_head = 44, + probe_read_str = 45, + get_socket_cookie = 46, + get_socket_uid = 47, + set_hash = 48, + setsockopt = 49, + skb_adjust_room = 50, + redirect_map = 51, + sk_redirect_map = 52, + sock_map_update = 53, + xdp_adjust_meta = 54, + perf_event_read_value = 55, + perf_prog_read_value = 56, + getsockopt = 57, + override_return = 58, + sock_ops_cb_flags_set = 59, + msg_redirect_map = 60, + msg_apply_bytes = 61, + msg_cork_bytes = 62, + msg_pull_data = 63, + bind = 64, + xdp_adjust_tail = 65, + skb_get_xfrm_state = 66, + get_stack = 67, + skb_load_bytes_relative = 68, + fib_lookup = 69, + sock_hash_update = 70, + msg_redirect_hash = 71, + sk_redirect_hash = 72, + lwt_push_encap = 73, + lwt_seg6_store_bytes = 74, + lwt_seg6_adjust_srh = 75, + lwt_seg6_action = 76, + rc_repeat = 77, + rc_keydown = 78, + skb_cgroup_id = 79, + get_current_cgroup_id = 80, + get_local_storage = 81, + sk_select_reuseport = 82, + skb_ancestor_cgroup_id = 83, +} + +function asm (insn) return ffi.typeof("$[?]", ins)(#insn, insn) end diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua new file mode 100644 index 0000000000..9e0e99dd2e --- /dev/null +++ b/src/apps/xdp/xdp.lua @@ -0,0 +1,676 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. 
+ +module(...,package.seeall) + +local S = require("syscall") +local ffi = require("ffi") +local bpf = require("apps.xdp.bpf") +local lib = require("core.lib") +local bits = lib.bits +local band, bor, rshift, tobit = bit.band, bit.bor, bit.rshift, bit.tobit + +-- ---- XDP driver for Snabb -------------------------------------------- + +-- This is a Snabb driver for Linux AF_XDP[1][2] sockets. The XDP kernel +-- interface presents an ABI/API combination similar to what a hardware NIC +-- usually provides: a way to attach to hardware queues, and a set of +-- descriptor rings for each queue used to enqueue and dequeue packet memory +-- buffers. +-- +-- Like with hardware NICs, XDP imposes us with constraints on the kind of +-- memory buffers we can enqueue onto its descriptor rings. Instead of DMA +-- memory required to drive hardware NICs, XDP requires us to register a +-- special kind of memory called UMEM to use with an AF_XDP socket. Only +-- buffers in the UMEM registered with a given socket can be used for I/O with +-- that socket! +-- +-- To consolidate this and other constraints (see "UMEM allocation" below) with +-- Snabb's packet memory architecture this driver allocates a single contiguous +-- memory region used as UMEM for all of the process' AF_XDP sockets, and +-- replaces the memory allocation routine dma_alloc in core.memory with its own +-- UMEM allocator. Hence, the packet freelist will be filled with UMEM memory +-- buffers used for all packet allocations. +-- +-- snabb_enable_xdp() +-- +-- To use the XDP app, "Snabb XDP mode" must be enabled by calling this +-- function. Calling this function replaces Snabb's native memory +-- allocator with the UMEM allocator. +-- +-- The caller must ensure that no packets have been allocated via +-- packet.allocate() prior to calling this function. 
+-- +-- CAVEATS: +-- +-- * Memory allocated by the UMEM allocator can not be used with DMA +-- drivers: using the XDP app precludes the use of Snabb's native +-- hardware drivers. +-- +-- * Memory allocated by the UMEM allocator can not be shared with +-- other Snabb processes in the same process group: using +-- snabb_enable_xdp precludes the use of Interlink apps +-- (apps.interlink). +-- +-- * UMEM chunks can not be larger than the page size (4096 bytes). +-- This AF_XDP limitation plus the way Snabb implements packet +-- buffer shifting operations limits the effective MTU: the MTU of +-- the XDP app is limited to 3,582 bytes. See XDP:create_xsk(). +-- +-- The only means by which an AF_XDP socket can receive packets from a device +-- is by attaching an eBPF XDP program to the Linux interface. The XDP app +-- assembles a minimal BPF program to route packets from device queues to XDP +-- sockets. See XDP:initialize_xdp. +-- +-- References: +-- [1] https://www.kernel.org/doc/html/v5.3/networking/af_xdp.html +-- [2] The Linux kernel source repository + + +-- ---- UMEM allocation ------------------------------------------------- + +-- Must maintain invariants: chunk size must be <= page size and UMEM must be +-- aligned to page size. + +local page_size = S.getpagesize() +local chunk_size = page_size +local num_chunks = 200000 +local umem_backing, umem, umem_size, umem_used + +-- UMEM allocator: multiple UMEM chunks must be allocated to fit a full packet. +-- However, AF_XDP sockets will only ever see the first of the chunks that make +-- up a packet. The extra (two) UMEM chunks are effectively unused by the +-- socket (but used by Snabb to ensure that packets can actually use +-- packet.max_payload bytes of payload). +-- See core.packet, "XDP rings", XDP:create_xsk(). +local function umem_alloc (size, align) + -- NB: align parameter ignored as we align to chunk_size + assert(align <= chunk_size) + assert(umem_used + size <= umem_size, + "Out of packet buffer memory.
Increase num_chunks?") + local chunk = umem + umem_used + umem_used = lib.align(umem_used + size, chunk_size) + return chunk +end + +-- Convert from pointer to relative UMEM offset. +local function to_umem (ptr) + return ffi.cast("uintptr_t", ptr) - ffi.cast("uintptr_t", umem) +end + +-- Convert relative UMEM offset to pointer. +local function from_umem (offset) + return umem + offset +end + +local snabb_xdp_enabled = false +function snabb_enable_xdp () + -- Allocate UMEM + umem_size = chunk_size * num_chunks + umem_backing = ffi.new("char[?]", umem_size + page_size) + umem = ffi.cast("char*", lib.align(ffi.cast("uintptr_t", umem_backing), page_size)) + umem_used = 0 + -- Hot-swap core.memory.dma_alloc + require("core.memory").dma_alloc = umem_alloc + snabb_xdp_enabled = true +end + + +-- ---- FFI types ------------------------------------------------------- + +local xdp_umem_reg_t = ffi.typeof[[ + struct { + void * addr; /* Start of packet data area */ + uint64_t len; /* Length of packet data area */ + uint32_t chunk_size; + uint32_t headroom; + uint32_t flags; /* Not available in 4.19 */ + } __attribute__((packed))]] + +local sockaddr_xdp_t = ffi.typeof[[ + struct { + uint16_t family; + uint16_t flags; + uint32_t ifindex; + uint32_t queue_id; + uint32_t shared_umem_fd; + } __attribute__((packed))]] + +local xdp_ring_offset_t = ffi.typeof[[ + struct { + uint64_t producer; + uint64_t consumer; + uint64_t desc; + uint64_t flags; /* Not available in 4.19 */ + } __attribute__((packed))]] + +local xdp_ring_offset_noflags_t = ffi.typeof[[ + struct { + uint64_t producer; + uint64_t consumer; + uint64_t desc; + } __attribute__((packed))]] + +local xdp_mmap_offsets_templ = [[ + struct { + $ rx, + tx, + fr, /* Fill */ + cr; /* Completion */ + } __attribute__((packed))]] +local xdp_mmap_offsets_noflags_t = + ffi.typeof(xdp_mmap_offsets_templ, xdp_ring_offset_noflags_t) +local xdp_mmap_offsets_t = + ffi.typeof(xdp_mmap_offsets_templ, xdp_ring_offset_t) + +local xdp_ring_t = 
ffi.typeof[[ + struct { + uint32_t *producer, *consumer, *flags; + void *desc; + uint32_t write, read; + }]] + +local xdp_desc_t = ffi.typeof[[ + struct { + uint64_t addr; + uint32_t len; + uint32_t options; + } __attribute__((packed))]] +local xdp_desc_ptr_t = ffi.typeof("$ *", xdp_desc_t) + +local netlink_set_link_xdp_request_t = ffi.typeof[[ + struct { + struct { /* nlmsghdr */ + uint32_t nlmsg_len; /* Length of message including header */ + uint16_t nlmsg_type; /* Message content */ + uint16_t nlmsg_flags; /* Additional flags */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ + } nh; + struct { /* ifinfomsg */ + unsigned char ifi_family; + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Link index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ + } ifinfo; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + } xdp; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + int32_t fd; + } xdp_fd; + }__attribute__((packed))]] + + +-- ---- XDP rings ------------------------------------------------------- + +-- Ring operations for the single-producer single-consumer rings used for I/O +-- with AF_XDP sockets (xdp_ring_t). This is a blend between an +-- "Array + two unmasked indices"[1] and MCRingBuffer[2] implementation. +-- +-- Only the "Array + two unmasked indices" half of the implementation is +-- actually exposed by the kernel via the pointers to shared consumer/producer +-- fields (see xdp_ring_t, XDP:xdp_map_ring()). The MCRingBuffer portion is +-- added by userspace (us) to optimize our CPU cache footprint. +-- +-- Each AF_XDP socket has two rings (rx, tx) and each UMEM has two rings +-- (fr - fill ring, cr - completion ring). This XDP driver registers a new UMEM +-- for each socket so that each socket effectively has four rings +-- (rx, tx, fr, cr).
+-- +-- For the Linux kernel to be able to fill the rx ring we need to provide it +-- UMEM chunks via the fill ring (fr). Superfluous UMEM chunks held by the +-- kernel are fed back to the userspace application via the +-- completion ring (cr). +-- +-- It is important to note that XDP rings operate on chunks: the addr field +-- of xdp_desc_t points *into* a chunk, and its len field is, from the kernel’s +-- perspective, bounded to the end of that chunk. See "UMEM allocation" and +-- XDP:create_xsk() for how this affects Snabb. +-- +-- NB: Snabb packet payloads are preceded by a two byte length field, so we +-- have to account for this overhead when retrieving packets from XDP +-- descriptor rings. See receive(r) below and XDP:create_xsk(). +-- +-- References: +-- [1] https://www.snellman.net/blog/archive/2016-12-13-ring-buffers/ +-- [2] https://www.cse.cuhk.edu.hk/~pclee/www/pubs/ancs09poster.pdf + +local xdp_ring_ndesc = 2048 -- Number of descriptors in ring. + +local function mask (i) return band(i, xdp_ring_ndesc - 1) end +local function inc (i) return tobit(i + 1) end +local function full1 (r, w) return tobit(w - r) == xdp_ring_ndesc end + +function full (r) + if full1(r.read, r.write) then + if full1(r.consumer[0], r.write) then + return true + end + r.read = r.consumer[0] + end +end + +function transmit (r, p) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.write) + desc[idx].addr = to_umem(p.data) + desc[idx].len = p.length + r.write = inc(r.write) +end + +function fill (r, p) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.write) + desc[idx] = to_umem(p) + r.write = inc(r.write) +end + +function push (r) + -- NB: no need for memory barrier on x86 because of TSO. 
+ r.producer[0] = r.write +end + +function empty (r) + if r.read == r.write then + if r.read == r.producer[0] then + return true + end + r.write = r.producer[0] + end +end + +local packet_overhead = 2 -- leading struct packet length field (uint16_t) +function receive (r) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", + -- packet struct begins at payload - packet_overhead + from_umem(desc[idx].addr) - packet_overhead) + p.length = desc[idx].len + r.read = inc(r.read) + return p +end + +function reclaim (r) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", from_umem(desc[idx])) + p.length = 0 + r.read = inc(r.read) + return p +end + +function pull (r) + -- NB: no need for memory barrier on x86 (see push.) + r.consumer[0] = r.read +end + +function needs_wakeup (r) + -- NB: Unavailable when kernel does not support ring flags. + -- See: XDP.kernel_has_ring_flags, XDP:create_xsk(), XDP:kick() + return band(r.flags[0], bits{XDP_RING_NEED_WAKEUP=1}) +end + + +-- ---- XDP App --------------------------------------------------------- + +XDP = { + config = { + ifname = {required=true}, -- interface name + queue = {default=0} -- interface queue (zero based) + }, + -- Class variables: + queues = {}, -- queue-to-socket maps for each interface + kernel_has_ring_flags = true -- feature detection status for descriptor ring flags +} + +-- Class methods + +function XDP:new (conf) + assert(snabb_xdp_enabled, "Snabb XDP mode must be enabled.") + -- Ensure interface is initialized for XDP usage. + if not self.queues[conf.ifname] then + self.queues[conf.ifname] = self:create_xskmap() + self:initialize_xdp(conf.ifname, self.queues[conf.ifname]) + end + -- Create XDP socket (xsk) for queue. + local xsk = self:create_xsk(conf.ifname, conf.queue) + -- Attach the socket to queue in the BPF map. 
+ self:set_queue_socket(self.queues[conf.ifname], conf.queue, xsk) + -- Finish initialization. + return setmetatable(xsk, {__index=XDP}) +end + +function XDP:create_xskmap () + local klen, vlen = ffi.sizeof("int"), ffi.sizeof("int") + local nentries = 128 + local map, err + for _ = 1,7 do + -- Try to create BPF map. + map, err = S.bpf_map_create('xskmap', klen, vlen, nentries) + -- Return map on success. + if map then return map end + -- Failed to create map, increase MEMLOCK limit and retry. + -- See https://github.com/xdp-project/xdp-tutorial/issues/63 + local lim = assert(S.getrlimit('memlock')) + assert(S.setrlimit('memlock', {cur=lim.cur*2, max=lim.max*2})) + end + -- Exceeded retries, bail. + error("Failed to create BPF map: "..tostring(err)) +end + +function XDP:initialize_xdp (ifname, xskmap) + self:set_link_xdp(ifname, self:xdp_prog(xskmap)) +end + +function XDP:xdp_prog (xskmap) + -- Assemble and load XDP BPF program. + local c, f, m, a, s, j, fn = + bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j, bpf.fn + -- The program below looks up the incoming packet's queue index in xskmap to + -- find the corresponding XDP socket (xsk) to deliver the packet to. + local insns = bpf.asm{ + -- r3 = XDP_ABORTED + { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, + -- r2 = ((struct xdp_md *)ctx)->rx_queue_index + { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, + -- r1 = xskmap + { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, + imm=band(xskmap:getfd(), 2^32-1) }, + { imm=rshift(xskmap:getfd(), 32) }, + -- r0 = redirect_map(r1, r2, r3) + { op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, + -- EXIT: + { op=bor(c.JMP, j.EXIT) } + } + local prog, err, log = S.bpf_prog_load( + 'xdp', insns, ffi.sizeof(insns) / ffi.sizeof(bpf.ins), "Apache 2.0" + ) + if prog then + return prog + else + error(tostring(err).."\n"..log) + end +end + +function XDP:set_link_xdp(ifname, prog) + -- Open a NETLINK socket, and transmit command that attaches XDP program + -- prog to link by ifname. 
+ local netlink = assert(S.socket('netlink', 'raw', 'route')) + local SOL_NETLINK = 270 + local NETLINK_EXT_ACK = 11 + local ext_ack_on = ffi.new("int[1]", 1) + assert(S.setsockopt(netlink, SOL_NETLINK, NETLINK_EXT_ACK, + ext_ack_on, ffi.sizeof(ext_ack_on))) + local IFLA_XDP = 43 + local IFLA_XDP_FD = 1 + local IFLA_XDP_FLAGS = 3 + local request = ffi.new( + netlink_set_link_xdp_request_t, + { nh = { nlmsg_flags = bor(S.c.NLM_F.REQUEST, S.c.NLM_F.ACK), + nlmsg_type = S.c.RTM.SETLINK }, + ifinfo = { ifi_family = S.c.AF.UNSPEC, + ifi_index = S.util.if_nametoindex(ifname) }, + xdp = { nla_type = bor(bits{ NLA_F_NESTED=15 }, IFLA_XDP) }, + xdp_fd = { nla_type = IFLA_XDP_FD, + fd = prog:getfd() } } + ) + request.nh.nlmsg_len = ffi.sizeof(request) + request.xdp.nla_len = ffi.sizeof(request.xdp) + ffi.sizeof(request.xdp_fd) + request.xdp_fd.nla_len = ffi.sizeof(request.xdp_fd) + assert(netlink:send(request, ffi.sizeof(request))) + local response = assert(S.nl.read(netlink, nil, nil, true)) + if response.error then + error("NETLINK responded with error: "..tostring(response.error)) + end + netlink:close() +end + +function XDP:create_xsk (ifname, queue) + local xsk = { sock = assert(S.socket('xdp', 'raw')) } + -- Register UMEM. + local umem_reg = ffi.new( + xdp_umem_reg_t, + { addr = umem, + len = umem_size, + -- The chunk size is equal to the page size (4096 bytes, see + -- "UMEM allocation"), and XDP packet descriptors point to individual + -- chunks (see "XDP rings"). Hence, the MTU of AF_XDP sockets is + -- limited to the page size, and the effective MTU of the XDP app is + -- further limited by the way core.packet implements packet shifting + -- operations (see headroom below). 
The effective MTU is calculated as + -- 4096 - packet.packet_alignment (512) - packet_overhead (2) = 3582 + chunk_size = chunk_size, + -- By configuring the headroom according to core.packet we make sure + -- that XDP leaves enough headroom for the preceeding length field of + -- Snabb's struct packet as well as headroom for packet shifting + -- operations. + headroom = packet.default_headroom + packet_overhead, + -- flags = bits{ XDP_UMEM_UNALIGNED_CHUNK_FLAG=1 } + } + ) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_reg', umem_reg, ffi.sizeof(umem_reg))) + -- Configure XDP rings and map them into this process’ memory. + local ndesc = ffi.new("int[1]", xdp_ring_ndesc) + assert(xsk.sock:setsockopt('xdp', 'xdp_rx_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_tx_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_fill_ring', ndesc, ffi.sizeof(ndesc))) + assert(xsk.sock:setsockopt('xdp', 'xdp_umem_completion_ring', ndesc, ffi.sizeof(ndesc))) + local layouts = ffi.new(xdp_mmap_offsets_t) + if not pcall(S.getsockopt, xsk.sock, 'xdp', 'xdp_mmap_offsets', layouts, ffi.sizeof(layouts)) then + -- Kernel appears not to support XDP ring flags field. Disable feature, + -- and retry with xdp_mmap_offsets_noflags_t. + self.kernel_has_ring_flags = false + layouts = ffi.new(xdp_mmap_offsets_noflags_t) + assert(xsk.sock:getsockopt('xdp', 'xdp_mmap_offsets', layouts, ffi.sizeof(layouts))) + end + xsk.rx = self:xdp_map_ring(xsk.sock, layouts.rx, xdp_desc_t, 0x000000000ULL) -- XDP_PGOFF_RX_RING + xsk.tx = self:xdp_map_ring(xsk.sock, layouts.tx, xdp_desc_t, 0x080000000ULL) -- XDP_PGOFF_TX_RING + -- NB: fill and completion rings do not carry full descriptors, only + -- relative UMEM offsets (addr). 
+ xsk.fr = self:xdp_map_ring(xsk.sock, layouts.fr, "uint64_t", 0x100000000ULL) -- XDP_UMEM_PGOFF_FILL_RING + xsk.cr = self:xdp_map_ring(xsk.sock, layouts.cr, "uint64_t", 0x180000000ULL) -- XDP_UMEM_PGOFF_COMPLETION_RING + -- Bind socket to interface + local sa = ffi.new( + sockaddr_xdp_t, + { family = S.c.AF.XDP, + ifindex = S.util.if_nametoindex(ifname), + queue_id = queue, + -- flags = bits{ XDP_ZEROCOPY=2 } + } + ) + assert(xsk.sock:bind(sa, ffi.sizeof(sa))) + return xsk +end + +-- Map an XDP socket ring into this process’ memory. +function XDP:xdp_map_ring (socket, layout, desc_t, offset) + local prot = "read, write" + local flags = "shared, populate" + local length = layout.desc + xdp_ring_ndesc * ffi.sizeof(desc_t) + local map = ffi.cast("char*", assert(S.mmap(nil, length, prot, flags, socket, offset))) + local r = ffi.new(xdp_ring_t) + r.producer = ffi.cast("uint32_t *", map + layout.producer) + r.consumer = ffi.cast("uint32_t *", map + layout.consumer) + if self.kernel_has_ring_flags then + r.flags = ffi.cast("uint32_t *", map + layout.flags) + end + r.desc = map + layout.desc + return r +end + +function XDP:set_queue_socket(xskmap, queue, xsk) + assert(S.bpf_map_op('map_update_elem', xskmap, + ffi.new("int[1]", queue), + ffi.new("int[1]", xsk.sock:getfd()))) +end + +-- Instance methods + +function XDP:stop () + self.sock:close() +end + +function XDP:pull () + local output = self.output.output + local rx, fr = self.rx, self.fr + if not output then return end + while not full(fr) do + fill(fr, packet.allocate()) + end + push(fr) + for _ = 1, engine.pull_npackets do + if empty(rx) then break end + link.transmit(output, receive(rx)) + end + pull(rx) +end + +function XDP:push () + local input = self.input.input + local tx, cr = self.tx, self.cr + if not input then return end + while not empty(cr) do + packet.free(reclaim(cr)) + end + pull(cr) + while not link.empty(input) and not full(tx) do + local p = link.receive(input) + packet.account_free(p) + 
transmit(tx, p) + end + push(tx) + if self.kernel_has_ring_flags then + if needs_wakeup(tx) then self:kick() end + else + if not empty(tx) then self:kick() end + end +end + +function XDP:kick () + -- Wake up Linux kernel to process tx ring packets. + self.sock:sendto(nil, 0, 'dontwait', nil, 0) +end + + +-- ---- Tests ----------------------------------------------------------- + +-- Useful setup commands: +-- $ echo 0000:01:00.0 > /sys/bus/pci/drivers/ixgbe/bind +-- $ ip link set ens1f0 addr 02:00:00:00:00:00 +-- $ ethtool --set-channels ens1f0 combined 1 + +function selftest () + print("selftest: apps.xdp.xdp") + local xdpdeva = lib.getenv("SNABB_XDP0") + local xdpmaca = lib.getenv("SNABB_XDP_MAC0") + local xdpdevb = lib.getenv("SNABB_XDP1") + local xdpmacb = lib.getenv("SNABB_XDP_MAC1") + local nqueues = lib.getenv("SNABB_XDP_NQUEUES") or 1 + if not (xdpdeva and xdpmaca and xdpdevb and xdpmacb) then + print("SNABB_XDP0 and SNABB_XDP1 must be set. Skipping selftest.") + os.exit(engine.test_skipped_code) + end + snabb_enable_xdp() + print("test: rxtx") + selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + print("test: rxtx_match") + selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + print("selftest ok") +end + +local function random_v4_packets (conf) + local ethernet = require("lib.protocol.ethernet") + local ipv4 = require("lib.protocol.ipv4") + local eth = ethernet:new{src = ethernet:pton(conf.src), + dst = ethernet:pton(conf.dst), + type = 0x0800} + local packets = {} + for _, size in ipairs(conf.sizes) do + for _=1,100 do + local ip = ipv4:new{src=lib.random_bytes(4), + dst=lib.random_bytes(4), + total_length=size-eth:sizeof()} + local payload_length = ip:total_length() - ip:sizeof() + local p = packet.allocate() + packet.append(p, eth:header(), eth:sizeof()) + packet.append(p, ip:header(), eth:sizeof()) + packet.append(p, lib.random_bytes(payload_length), payload_length) + table.insert(packets, p) + end + end + return packets +end + +function 
selftest_rxtx (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + local c = config.new() + local basic = require("apps.basic.basic_apps") + local synth = require("apps.test.synth") + config.app(c, "source", synth.Synth, { + packets = random_v4_packets{ + sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,2001}, + src = xdpmaca, + dst = xdpmacb + }}) + config.app(c, "sink", basic.Sink) + for queue = 0, nqueues-1 do + local queue_a = xdpdeva.."_q"..queue + local queue_b = xdpdevb.."_q"..queue + config.app(c, queue_a, XDP, { + ifname = xdpdeva, + queue = queue + }) + config.app(c, queue_b, XDP, { + ifname = xdpdevb, + queue = queue + }) + config.link(c, "source.output"..queue.." -> "..queue_a..".input") + config.link(c, queue_b..".output -> sink.input"..queue) + end + engine.configure(c) + print("kernel_has_ring_flags", XDP.kernel_has_ring_flags) + engine.main{ duration = 1 } + engine.report_links() + local txtotal, rxtotal = 0, 0 + for queue = 0, nqueues-1 do + local tx = link.stats(engine.app_table.source.output["output"..queue]) + local rx = link.stats(engine.app_table.sink.input["input"..queue]) + assert(tx.rxpackets > 0, "No packets sent on queue: "..queue) + assert(rx.rxpackets > 0, "No packets received on queue: "..queue) + txtotal = txtotal + tx.rxpackets + rxtotal = rxtotal + rx.rxpackets + end + assert(math.abs(txtotal - rxtotal) <= txtotal*.10, -- 10% tolerance + "Too little packets received") +end + +function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb) + local c = config.new() + local synth = require("apps.test.synth") + local npackets = require("apps.test.npackets") + local match = require("apps.test.match") + config.app(c, "source", synth.Synth, { + sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,1501}, + src = xdpmaca, + dst = xdpmacb, + random_payload = true + }) + config.app(c, "npackets", npackets.Npackets, {npackets=1000}) + config.app(c, "match", match.Match) + config.app(c, xdpdeva, XDP, {ifname=xdpdeva}) + config.app(c, 
xdpdevb, XDP, {ifname=xdpdevb}) + config.link(c, "source.output -> "..xdpdeva..".input") + config.link(c, xdpdevb..".output -> match.rx") + config.link(c, "source.copy -> npackets.input") + config.link(c, "npackets.output -> match.comparator") + engine.configure(c) + engine.main{ duration=.1 } + engine.report_links() + engine.report_apps() + assert(#engine.app_table.match:errors() == 0, "Match errors.") +end diff --git a/src/xdp.snabb b/src/xdp.snabb new file mode 100644 index 0000000000..9ef1f1d424 --- /dev/null +++ b/src/xdp.snabb @@ -0,0 +1,407 @@ +#!snabb snsh + +local S = require("syscall") +local ffi = require("ffi") +local bpf = require("apps.xdp.bpf") +local lib = require("core.lib") +local bits = lib.bits +local band, bor, rshift, tobit = bit.band, bit.bor, bit.rshift, bit.tobit + +-- BPF boilerplate. +function bpf_attach (ifname, queue, xsk) + assert(S.setrlimit('memlock', {cur=0x7fffffffffffffffULL, max=0x7fffffffffffffffULL})) + -- Create queue->xsk map + local map = assert(S.bpf_map_create('xskmap', 4, 4, 128)) + -- Assemble and load BPF mapper program + local c, f, m, a, s, j, fn = + bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j, bpf.fn + local insns = bpf.asm{ + -- r3 = XDP_ABORTED + { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, + -- r2 = ((struct xdp_md *)ctx)->rx_queue_index + { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, + -- r1 = map + { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, + imm=band(map:getfd(), 2^32-1) }, + { imm=rshift(map:getfd(), 32) }, + -- r0 = redirect_map(r1, r2, r3) + { op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, + -- EXIT: + { op=bor(c.JMP, j.EXIT) } + } + local prog, err, log = S.bpf_prog_load( + 'xdp', insns, ffi.sizeof(insns) / ffi.sizeof(bpf.ins), "Apache 2.0" + ) + if not prog then + print(log) + error(err) + end + -- Attach BPF program to interface + local netlink = assert(S.socket('netlink', 'raw', 'route')) + -- SOL_NETLINK = 270, NETLINK_EXT_ACK = 11 + assert(S.setsockopt(netlink, 270, 11, ffi.new("int32_t[1]", 
1), 4)) + assert(S.bind(netlink, S.t.sockaddr_nl())) + local req = ffi.new[[ + struct { + struct { /* nlmsghdr */ + uint32_t nlmsg_len; /* Length of message including header */ + uint16_t nlmsg_type; /* Message content */ + uint16_t nlmsg_flags; /* Additional flags */ + uint32_t nlmsg_seq; /* Sequence number */ + uint32_t nlmsg_pid; /* Sending process port ID */ + } nh; + struct { /* ifinfomsg */ + unsigned char ifi_family; + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Link index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ + } ifinfo; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + } xdp; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + int32_t fd; + } xdp_fd; + struct { /* nlattr */ + uint16_t nla_len; + uint16_t nla_type; + uint32_t flags; + } xdp_flags; + }__attribute__((packed))]] + + req.nh.nlmsg_flags = bor(S.c.NLM_F.REQUEST, S.c.NLM_F.ACK) + req.nh.nlmsg_type = S.c.RTM.SETLINK + req.nh.nlmsg_pid = 0 + req.nh.nlmsg_seq = 1 + req.nh.nlmsg_len = ffi.sizeof(req) + req.ifinfo.ifi_family = S.c.AF.UNSPEC + req.ifinfo.ifi_index = S.util.if_nametoindex(ifname) + req.xdp.nla_type = bor(bits{ NLA_F_NESTED=15 }, 43) -- IFLA_XDP + req.xdp.nla_len = ffi.sizeof(req.xdp) + + ffi.sizeof(req.xdp_fd) + + ffi.sizeof(req.xdp_flags) + req.xdp_fd.nla_type = 1 -- IFLA_XDP_FD + req.xdp_fd.fd = prog:getfd() + req.xdp_fd.nla_len = ffi.sizeof(req.xdp_fd) + req.xdp_flags.nla_type = 3 -- IFLA_XDP_FLAGS + req.xdp_flags.flags = bits{ XDP_FLAGS_DRV_MODE=2 } + req.xdp_flags.nla_len = ffi.sizeof(req.xdp_flags) + assert(netlink:send(req, ffi.sizeof(req))) + local res = assert(S.nl.read(netlink, nil, nil, true)) + if res.error then + error("NETLINK responded with error: "..res.error) + end + netlink:close() + -- Insert queue:xsk into map + local qno = ffi.new("uint32_t[1]", queue) + local sfd = ffi.new("uint32_t[1]", xsk:getfd()) + assert(S.bpf_map_op('map_update_elem', 
map, qno, sfd)) +end + +-- Types +ffi.cdef[[ + struct sockaddr_xdp { + uint16_t sxdp_family; + uint16_t sxdp_flags; + uint32_t sxdp_ifindex; + uint32_t sxdp_queue_id; + uint32_t sxdp_shared_umem_fd; + } __attribute__((packed)); + + struct xdp_umem_reg { + uint8_t *addr; /* Start of packet data area */ + uint64_t len; /* Length of packet data area */ + uint32_t chunk_size; + uint32_t headroom; + uint32_t flags; /* Not available in 4.19 */ + } __attribute__((packed)); + + struct xdp_ring_offset { + uint64_t producer; + uint64_t consumer; + uint64_t desc; + //uint64_t flags; /* Not available in 4.19 */ + } __attribute__((packed)); + + struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ + } __attribute__((packed)); + + struct xdp_desc { + uint64_t addr; + uint32_t len; + uint32_t options; + } __attribute__((packed)); +]] + + +-- Create XDP socket + +local xsk = assert(S.socket('xdp', 'raw')) + +-- Socket operations + +function xsk_kick (xsk) + return S.sendto(xsk, nil, 0, 'dontwait', nil, 0) +end + +function xsk_bind (xsk, ifname, queue) + local sxdp = ffi.new("struct sockaddr_xdp") + sxdp.sxdp_family = S.c.AF.XDP + sxdp.sxdp_ifindex = S.util.if_nametoindex(ifname) + sxdp.sxdp_queue_id = queue or 0 + --sxdp.sxdp_flags = bits{XDP_ZEROCOPY=2} + assert(S.bind(xsk, sxdp, ffi.sizeof(sxdp))) +end + +function xsk_poll (xsk) + local pfds = S.types.t.pollfds{{ fd=xsk, events='in'}} + assert(S.poll(pfds, 1000)) +end + +-- Allocate UMEM (overload dma_alloc to trick Snabb into allocating from here) + +local packet_overhead = 2 -- leading struct packet length field (uint16_t) +local default_headroom = 256 -- See core/packet + +local page_size = S.getpagesize() +-- Chunk size must be <= page size and UMEM must be aligned to page size. 
+local num_chunks = 10000 +local chunk_size = page_size +local umem_size = chunk_size * num_chunks +local umem_backing = ffi.new("uint8_t[?]", umem_size + page_size) +local umem = ffi.cast("uint8_t*", lib.align(ffi.cast("uintptr_t", umem_backing), page_size)) + +local umem_used = 0 +require("core.memory").dma_alloc = function (_, align) + -- Hack: we ignore the requested size and return short memory regions. + -- User has to ensure + -- packet.length <= chunk_size-(default_headroom+packet_overhead) + assert(umem_used + chunk_size <= umem_size) + local chunk = umem + umem_used + umem_used = umem_used + chunk_size + return chunk +end + +local function to_umem (p) + local rel = ffi.cast("uint64_t", p) - ffi.cast("uint64_t", umem) + return rel - band(rel, chunk_size - 1) -- realign +end + +local function from_umem (u) + return umem + u +end + +-- Register UMEM + +local opt = ffi.new("struct xdp_umem_reg") +opt.addr = umem +opt.len = umem_size +opt.chunk_size = chunk_size +opt.headroom = default_headroom + packet_overhead +--opt.flags = bits{XDP_UMEM_UNALIGNED_CHUNK_FLAG=1} +assert(xsk:setsockopt('xdp', 'xdp_umem_reg', opt, ffi.sizeof(opt))) + +-- Map rings + +local ndesc = 2048 +local opt, optsize = ffi.new("uint32_t[1]", ndesc), 4 +assert(xsk:setsockopt('xdp', 'xdp_rx_ring', opt, optsize)) +assert(xsk:setsockopt('xdp', 'xdp_tx_ring', opt, optsize)) +assert(xsk:setsockopt('xdp', 'xdp_umem_fill_ring', opt, optsize)) +assert(xsk:setsockopt('xdp', 'xdp_umem_completion_ring', opt, optsize)) + +local offsets = ffi.new("struct xdp_mmap_offsets") +assert(xsk:getsockopt('xdp', 'xdp_mmap_offsets', offsets, ffi.sizeof(offsets))) + +local ring_t = ffi.typeof[[ + struct { + uint32_t *producer, *consumer, *flags; + void *desc; + uint32_t write, read; + } +]] + +local function map_ring (xsk, length, offset) + local prot = "read, write" + local flags = "shared, populate" + local map = assert(S.mmap(nil, length, prot, flags, xsk, offset)) + return ffi.cast("char *", map) +end + +local 
ringmaps = { + rx = map_ring( + xsk, + offsets.rx.desc + ndesc*ffi.sizeof("struct xdp_desc"), + 0x000000000ULL -- XDP_PGOFF_RX_RING + ), + tx = map_ring( + xsk, + offsets.tx.desc + ndesc*ffi.sizeof("struct xdp_desc"), + 0x080000000ULL -- XDP_PGOFF_TX_RING + ), + fr = map_ring( + xsk, + offsets.fr.desc + ndesc*ffi.sizeof("uintptr_t"), + 0x100000000ULL -- XDP_UMEM_PGOFF_FILL_RING + ), + cr = map_ring( + xsk, + offsets.cr.desc + ndesc*ffi.sizeof("uintptr_t"), + 0x180000000ULL -- XDP_UMEM_PGOFF_COMPLETION_RING + ) +} + +local function make_ring (map, offsets) + local r = ffi.new(ring_t) + r.producer = ffi.cast("uint32_t *", map + offsets.producer) + r.consumer = ffi.cast("uint32_t *", map + offsets.consumer) + --r.flags = ffi.cast("uint32_t *", map + offsets.flags) + r.desc = map + offsets.desc + return r +end + +local rx = make_ring(ringmaps.rx, offsets.rx) +local tx = make_ring(ringmaps.tx, offsets.tx) +local fr = make_ring(ringmaps.fr, offsets.fr) +local cr = make_ring(ringmaps.cr, offsets.cr) + +-- Ring operations + +local function mask (i) return band(i, ndesc - 1) end +local function inc (i) return tobit(i + 1) end +local function full1 (r, w) return tobit(w - r) == ndesc end + +function full (r) + if full1(r.read, r.write) then + if full1(r.consumer[0], r.write) then + return true + end + r.read = r.consumer[0] + end +end + +function transmit (r, p) + local desc = ffi.cast("struct xdp_desc *", r.desc) + local idx = mask(r.write) + desc[idx].addr = to_umem(p.data) + desc[idx].len = p.length + r.write = inc(r.write) +end + +function fill (r, p) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.write) + desc[idx] = to_umem(p) + r.write = inc(r.write) +end + +function push (r) + -- NB: no need for memory barrier on x86 because of TSO. 
+ r.producer[0] = r.write +end + +function empty (r) + if r.read == r.write then + if r.read == r.producer[0] then + return true + end + r.write = r.producer[0] + end +end + +function receive (r) + local desc = ffi.cast("struct xdp_desc *", r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", + -- packet struct begins at payload - packet_overhead + from_umem(desc[idx].addr) - packet_overhead) + p.length = desc[idx].len + r.read = inc(r.read) + return p +end + +function reclaim (r) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.read) + local p = ffi.cast("struct packet *", from_umem(desc[idx])) + p.length = 0 + r.read = inc(r.read) + return p +end + +function pull (r) + -- NB: no need for memory barrier on x86 (see push.) + r.consumer[0] = r.read +end + +function needs_wakeup (r) + return band(r.flags[0], bits{XDP_RING_NEED_WAKEUP=1}) +end + +-- ethtool --config-ntuple ens1f1 flow-type ip4 src-ip 172.16.172.3 action 0 +xsk_bind(xsk, "ens1f1", 0) +bpf_attach("ens1f1", 0, xsk) + +local throttle = lib.throttle(1) +local filled, reclaimed, received, sent, dropped = 0, 0, 0, 0, 0 +local last_sent, last_received, last_dropped = 0, 0, 0 + +local eth = require("lib.protocol.ethernet"):new{} + +while true do + --xsk_poll(xsk) + + if throttle() then + print("fill", filled, "comp", reclaimed) + print("recv", received, "sent", sent, "drop", dropped) + print(("RX %.6f Mpps"):format((received-last_received)/1e6)) + print(("TX %.6f Mpps"):format((sent-last_sent)/1e6)) + print(("DROP %.6f Mpps"):format((dropped-last_dropped)/1e6)) + last_received, last_sent, last_dropped = received, sent, dropped + end + + for _ = 1, 100 do + if not empty(cr) then + packet.free(reclaim(cr)) + reclaimed = reclaimed + 1 + end + if not full(fr) then + fill(fr, packet.allocate()) + filled = filled + 1 + end + if not empty(rx) then + local p = receive(rx) + received = received + 1 + if not full(tx) then + assert(eth:new_from_mem(p.data, p.length)) + eth:swap() + 
transmit(tx, p) + sent = sent + 1 + else + packet.free(p) + dropped = dropped + 1 + end + end + end + pull(cr) + push(fr) + pull(rx) + push(tx) + + if not empty(tx) then + xsk_kick(xsk) + end + + -- if needs_wakeup(tx) then + -- xsk_kick(xsk) + -- end +end From e056fa1ca69e7953e62fba6f02bcac30e72c37d1 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 15 Nov 2019 10:58:11 +0000 Subject: [PATCH 051/209] Remove src/xdp.snabb (accidentally checked in xdp proof of concept) --- src/xdp.snabb | 407 -------------------------------------------------- 1 file changed, 407 deletions(-) delete mode 100644 src/xdp.snabb diff --git a/src/xdp.snabb b/src/xdp.snabb deleted file mode 100644 index 9ef1f1d424..0000000000 --- a/src/xdp.snabb +++ /dev/null @@ -1,407 +0,0 @@ -#!snabb snsh - -local S = require("syscall") -local ffi = require("ffi") -local bpf = require("apps.xdp.bpf") -local lib = require("core.lib") -local bits = lib.bits -local band, bor, rshift, tobit = bit.band, bit.bor, bit.rshift, bit.tobit - --- BPF boilerplate. 
-function bpf_attach (ifname, queue, xsk) - assert(S.setrlimit('memlock', {cur=0x7fffffffffffffffULL, max=0x7fffffffffffffffULL})) - -- Create queue->xsk map - local map = assert(S.bpf_map_create('xskmap', 4, 4, 128)) - -- Assemble and load BPF mapper program - local c, f, m, a, s, j, fn = - bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j, bpf.fn - local insns = bpf.asm{ - -- r3 = XDP_ABORTED - { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, - -- r2 = ((struct xdp_md *)ctx)->rx_queue_index - { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, - -- r1 = map - { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, - imm=band(map:getfd(), 2^32-1) }, - { imm=rshift(map:getfd(), 32) }, - -- r0 = redirect_map(r1, r2, r3) - { op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, - -- EXIT: - { op=bor(c.JMP, j.EXIT) } - } - local prog, err, log = S.bpf_prog_load( - 'xdp', insns, ffi.sizeof(insns) / ffi.sizeof(bpf.ins), "Apache 2.0" - ) - if not prog then - print(log) - error(err) - end - -- Attach BPF program to interface - local netlink = assert(S.socket('netlink', 'raw', 'route')) - -- SOL_NETLINK = 270, NETLINK_EXT_ACK = 11 - assert(S.setsockopt(netlink, 270, 11, ffi.new("int32_t[1]", 1), 4)) - assert(S.bind(netlink, S.t.sockaddr_nl())) - local req = ffi.new[[ - struct { - struct { /* nlmsghdr */ - uint32_t nlmsg_len; /* Length of message including header */ - uint16_t nlmsg_type; /* Message content */ - uint16_t nlmsg_flags; /* Additional flags */ - uint32_t nlmsg_seq; /* Sequence number */ - uint32_t nlmsg_pid; /* Sending process port ID */ - } nh; - struct { /* ifinfomsg */ - unsigned char ifi_family; - unsigned char __ifi_pad; - unsigned short ifi_type; /* ARPHRD_* */ - int ifi_index; /* Link index */ - unsigned ifi_flags; /* IFF_* flags */ - unsigned ifi_change; /* IFF_* change mask */ - } ifinfo; - struct { /* nlattr */ - uint16_t nla_len; - uint16_t nla_type; - } xdp; - struct { /* nlattr */ - uint16_t nla_len; - uint16_t nla_type; - int32_t fd; - } xdp_fd; - struct { /* nlattr */ - 
uint16_t nla_len; - uint16_t nla_type; - uint32_t flags; - } xdp_flags; - }__attribute__((packed))]] - - req.nh.nlmsg_flags = bor(S.c.NLM_F.REQUEST, S.c.NLM_F.ACK) - req.nh.nlmsg_type = S.c.RTM.SETLINK - req.nh.nlmsg_pid = 0 - req.nh.nlmsg_seq = 1 - req.nh.nlmsg_len = ffi.sizeof(req) - req.ifinfo.ifi_family = S.c.AF.UNSPEC - req.ifinfo.ifi_index = S.util.if_nametoindex(ifname) - req.xdp.nla_type = bor(bits{ NLA_F_NESTED=15 }, 43) -- IFLA_XDP - req.xdp.nla_len = ffi.sizeof(req.xdp) - + ffi.sizeof(req.xdp_fd) - + ffi.sizeof(req.xdp_flags) - req.xdp_fd.nla_type = 1 -- IFLA_XDP_FD - req.xdp_fd.fd = prog:getfd() - req.xdp_fd.nla_len = ffi.sizeof(req.xdp_fd) - req.xdp_flags.nla_type = 3 -- IFLA_XDP_FLAGS - req.xdp_flags.flags = bits{ XDP_FLAGS_DRV_MODE=2 } - req.xdp_flags.nla_len = ffi.sizeof(req.xdp_flags) - assert(netlink:send(req, ffi.sizeof(req))) - local res = assert(S.nl.read(netlink, nil, nil, true)) - if res.error then - error("NETLINK responded with error: "..res.error) - end - netlink:close() - -- Insert queue:xsk into map - local qno = ffi.new("uint32_t[1]", queue) - local sfd = ffi.new("uint32_t[1]", xsk:getfd()) - assert(S.bpf_map_op('map_update_elem', map, qno, sfd)) -end - --- Types -ffi.cdef[[ - struct sockaddr_xdp { - uint16_t sxdp_family; - uint16_t sxdp_flags; - uint32_t sxdp_ifindex; - uint32_t sxdp_queue_id; - uint32_t sxdp_shared_umem_fd; - } __attribute__((packed)); - - struct xdp_umem_reg { - uint8_t *addr; /* Start of packet data area */ - uint64_t len; /* Length of packet data area */ - uint32_t chunk_size; - uint32_t headroom; - uint32_t flags; /* Not available in 4.19 */ - } __attribute__((packed)); - - struct xdp_ring_offset { - uint64_t producer; - uint64_t consumer; - uint64_t desc; - //uint64_t flags; /* Not available in 4.19 */ - } __attribute__((packed)); - - struct xdp_mmap_offsets { - struct xdp_ring_offset rx; - struct xdp_ring_offset tx; - struct xdp_ring_offset fr; /* Fill */ - struct xdp_ring_offset cr; /* Completion */ - } 
__attribute__((packed)); - - struct xdp_desc { - uint64_t addr; - uint32_t len; - uint32_t options; - } __attribute__((packed)); -]] - - --- Create XDP socket - -local xsk = assert(S.socket('xdp', 'raw')) - --- Socket operations - -function xsk_kick (xsk) - return S.sendto(xsk, nil, 0, 'dontwait', nil, 0) -end - -function xsk_bind (xsk, ifname, queue) - local sxdp = ffi.new("struct sockaddr_xdp") - sxdp.sxdp_family = S.c.AF.XDP - sxdp.sxdp_ifindex = S.util.if_nametoindex(ifname) - sxdp.sxdp_queue_id = queue or 0 - --sxdp.sxdp_flags = bits{XDP_ZEROCOPY=2} - assert(S.bind(xsk, sxdp, ffi.sizeof(sxdp))) -end - -function xsk_poll (xsk) - local pfds = S.types.t.pollfds{{ fd=xsk, events='in'}} - assert(S.poll(pfds, 1000)) -end - --- Allocate UMEM (overload dma_alloc to trick Snabb into allocating from here) - -local packet_overhead = 2 -- leading struct packet length field (uint16_t) -local default_headroom = 256 -- See core/packet - -local page_size = S.getpagesize() --- Chunk size must be <= page size and UMEM must be aligned to page size. -local num_chunks = 10000 -local chunk_size = page_size -local umem_size = chunk_size * num_chunks -local umem_backing = ffi.new("uint8_t[?]", umem_size + page_size) -local umem = ffi.cast("uint8_t*", lib.align(ffi.cast("uintptr_t", umem_backing), page_size)) - -local umem_used = 0 -require("core.memory").dma_alloc = function (_, align) - -- Hack: we ignore the requested size and return short memory regions. 
- -- User has to ensure - -- packet.length <= chunk_size-(default_headroom+packet_overhead) - assert(umem_used + chunk_size <= umem_size) - local chunk = umem + umem_used - umem_used = umem_used + chunk_size - return chunk -end - -local function to_umem (p) - local rel = ffi.cast("uint64_t", p) - ffi.cast("uint64_t", umem) - return rel - band(rel, chunk_size - 1) -- realign -end - -local function from_umem (u) - return umem + u -end - --- Register UMEM - -local opt = ffi.new("struct xdp_umem_reg") -opt.addr = umem -opt.len = umem_size -opt.chunk_size = chunk_size -opt.headroom = default_headroom + packet_overhead ---opt.flags = bits{XDP_UMEM_UNALIGNED_CHUNK_FLAG=1} -assert(xsk:setsockopt('xdp', 'xdp_umem_reg', opt, ffi.sizeof(opt))) - --- Map rings - -local ndesc = 2048 -local opt, optsize = ffi.new("uint32_t[1]", ndesc), 4 -assert(xsk:setsockopt('xdp', 'xdp_rx_ring', opt, optsize)) -assert(xsk:setsockopt('xdp', 'xdp_tx_ring', opt, optsize)) -assert(xsk:setsockopt('xdp', 'xdp_umem_fill_ring', opt, optsize)) -assert(xsk:setsockopt('xdp', 'xdp_umem_completion_ring', opt, optsize)) - -local offsets = ffi.new("struct xdp_mmap_offsets") -assert(xsk:getsockopt('xdp', 'xdp_mmap_offsets', offsets, ffi.sizeof(offsets))) - -local ring_t = ffi.typeof[[ - struct { - uint32_t *producer, *consumer, *flags; - void *desc; - uint32_t write, read; - } -]] - -local function map_ring (xsk, length, offset) - local prot = "read, write" - local flags = "shared, populate" - local map = assert(S.mmap(nil, length, prot, flags, xsk, offset)) - return ffi.cast("char *", map) -end - -local ringmaps = { - rx = map_ring( - xsk, - offsets.rx.desc + ndesc*ffi.sizeof("struct xdp_desc"), - 0x000000000ULL -- XDP_PGOFF_RX_RING - ), - tx = map_ring( - xsk, - offsets.tx.desc + ndesc*ffi.sizeof("struct xdp_desc"), - 0x080000000ULL -- XDP_PGOFF_TX_RING - ), - fr = map_ring( - xsk, - offsets.fr.desc + ndesc*ffi.sizeof("uintptr_t"), - 0x100000000ULL -- XDP_UMEM_PGOFF_FILL_RING - ), - cr = map_ring( - xsk, - 
offsets.cr.desc + ndesc*ffi.sizeof("uintptr_t"), - 0x180000000ULL -- XDP_UMEM_PGOFF_COMPLETION_RING - ) -} - -local function make_ring (map, offsets) - local r = ffi.new(ring_t) - r.producer = ffi.cast("uint32_t *", map + offsets.producer) - r.consumer = ffi.cast("uint32_t *", map + offsets.consumer) - --r.flags = ffi.cast("uint32_t *", map + offsets.flags) - r.desc = map + offsets.desc - return r -end - -local rx = make_ring(ringmaps.rx, offsets.rx) -local tx = make_ring(ringmaps.tx, offsets.tx) -local fr = make_ring(ringmaps.fr, offsets.fr) -local cr = make_ring(ringmaps.cr, offsets.cr) - --- Ring operations - -local function mask (i) return band(i, ndesc - 1) end -local function inc (i) return tobit(i + 1) end -local function full1 (r, w) return tobit(w - r) == ndesc end - -function full (r) - if full1(r.read, r.write) then - if full1(r.consumer[0], r.write) then - return true - end - r.read = r.consumer[0] - end -end - -function transmit (r, p) - local desc = ffi.cast("struct xdp_desc *", r.desc) - local idx = mask(r.write) - desc[idx].addr = to_umem(p.data) - desc[idx].len = p.length - r.write = inc(r.write) -end - -function fill (r, p) - local desc = ffi.cast("uint64_t *", r.desc) - local idx = mask(r.write) - desc[idx] = to_umem(p) - r.write = inc(r.write) -end - -function push (r) - -- NB: no need for memory barrier on x86 because of TSO. 
- r.producer[0] = r.write -end - -function empty (r) - if r.read == r.write then - if r.read == r.producer[0] then - return true - end - r.write = r.producer[0] - end -end - -function receive (r) - local desc = ffi.cast("struct xdp_desc *", r.desc) - local idx = mask(r.read) - local p = ffi.cast("struct packet *", - -- packet struct begins at payload - packet_overhead - from_umem(desc[idx].addr) - packet_overhead) - p.length = desc[idx].len - r.read = inc(r.read) - return p -end - -function reclaim (r) - local desc = ffi.cast("uint64_t *", r.desc) - local idx = mask(r.read) - local p = ffi.cast("struct packet *", from_umem(desc[idx])) - p.length = 0 - r.read = inc(r.read) - return p -end - -function pull (r) - -- NB: no need for memory barrier on x86 (see push.) - r.consumer[0] = r.read -end - -function needs_wakeup (r) - return band(r.flags[0], bits{XDP_RING_NEED_WAKEUP=1}) -end - --- ethtool --config-ntuple ens1f1 flow-type ip4 src-ip 172.16.172.3 action 0 -xsk_bind(xsk, "ens1f1", 0) -bpf_attach("ens1f1", 0, xsk) - -local throttle = lib.throttle(1) -local filled, reclaimed, received, sent, dropped = 0, 0, 0, 0, 0 -local last_sent, last_received, last_dropped = 0, 0, 0 - -local eth = require("lib.protocol.ethernet"):new{} - -while true do - --xsk_poll(xsk) - - if throttle() then - print("fill", filled, "comp", reclaimed) - print("recv", received, "sent", sent, "drop", dropped) - print(("RX %.6f Mpps"):format((received-last_received)/1e6)) - print(("TX %.6f Mpps"):format((sent-last_sent)/1e6)) - print(("DROP %.6f Mpps"):format((dropped-last_dropped)/1e6)) - last_received, last_sent, last_dropped = received, sent, dropped - end - - for _ = 1, 100 do - if not empty(cr) then - packet.free(reclaim(cr)) - reclaimed = reclaimed + 1 - end - if not full(fr) then - fill(fr, packet.allocate()) - filled = filled + 1 - end - if not empty(rx) then - local p = receive(rx) - received = received + 1 - if not full(tx) then - assert(eth:new_from_mem(p.data, p.length)) - eth:swap() - 
transmit(tx, p) - sent = sent + 1 - else - packet.free(p) - dropped = dropped + 1 - end - end - end - pull(cr) - push(fr) - pull(rx) - push(tx) - - if not empty(tx) then - xsk_kick(xsk) - end - - -- if needs_wakeup(tx) then - -- xsk_kick(xsk) - -- end -end From ee4e42d23fcdf60c38c31e3a6a53ce19da8c5c70 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 18 Nov 2019 11:21:28 +0000 Subject: [PATCH 052/209] core.packet: expose free_internal --- src/README.md | 17 ++++++++++++++--- src/core/packet.lua | 2 +- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/README.md b/src/README.md index 3f1f34ff7d..e02bddb988 100644 --- a/src/README.md +++ b/src/README.md @@ -422,10 +422,21 @@ Allocate packet and fill it with *length* bytes from *pointer*. Allocate packet and fill it with the contents of *string*. -— Function **packet.clone_to_memory* *pointer* *packet* +— Function **packet.account_free** *packet* + +Increment internal engine statistics (*frees*, *freebytes*, *freebits*) as if +*packet* were freed, but do not actually put it back onto the freelist. + +This function is intended to be used by I/O apps in special cases that need +more finegrained control over packet freeing. + +— Function **packet.free_internal** *packet* + +Free *packet* and put it back onto the freelist, but do not increment internal +engine statistics (*frees*, *freebytes*, *freebits*). + +See **packet.account_free**, **packet.free**. -Creates an exact copy of at memory pointed to by *pointer*. *Pointer* must -point to a `packet.packet_t`. ## Memory (core.memory) diff --git a/src/core/packet.lua b/src/core/packet.lua index bc7a14cd5f..c24e3e2f35 100644 --- a/src/core/packet.lua +++ b/src/core/packet.lua @@ -257,7 +257,7 @@ end function from_string (d) return from_pointer(d, #d) end -- Free a packet that is no longer in use. 
-local function free_internal (p) +function free_internal (p) local ptr = ffi.cast("char*", p) p = ffi.cast(packet_ptr_t, ptr - get_headroom(ptr) + default_headroom) p.length = 0 From b67729262a4f2148ac9a5b1cffe9bb8628e78aab Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 18 Nov 2019 12:13:57 +0000 Subject: [PATCH 053/209] apps.xdp: fix free stats accounting, cleanup fill/completion mngmnt --- src/apps/xdp/xdp.lua | 62 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 9e0e99dd2e..2b2c967d2e 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -223,9 +223,9 @@ local netlink_set_link_xdp_request_t = ffi.typeof[[ -- (rx, tx, fr, cr). -- -- For the Linux kernel to be able to fill the rx ring we need to provide it --- UMEM chunks via the fill ring (fr). Superfluous UMEM chunks held by the --- kernel are fed back to the userspace application via the --- completion ring (cr). +-- UMEM chunks via the fill ring (fr). Chunks used by us to send packets via +-- the tx ring are returned by the kernel back to the userspace application via +-- the completion ring (cr). -- -- It is important to note that XDP rings operate on chunks: the addr field -- of xdp_desc_t points *into* a chunk, and its len field is, from the kernel’s @@ -297,10 +297,11 @@ function receive (r) end function reclaim (r) + -- NB: reclaim does not (re)set the payload length field. + -- Reclaimed packets do *not* have known payload lengths! 
local desc = ffi.cast("uint64_t *", r.desc) local idx = mask(r.read) local p = ffi.cast("struct packet *", from_umem(desc[idx])) - p.length = 0 r.read = inc(r.read) return p end @@ -519,11 +520,8 @@ end function XDP:pull () local output = self.output.output local rx, fr = self.rx, self.fr + self:refill() if not output then return end - while not full(fr) do - fill(fr, packet.allocate()) - end - push(fr) for _ = 1, engine.pull_npackets do if empty(rx) then break end link.transmit(output, receive(rx)) @@ -535,21 +533,57 @@ function XDP:push () local input = self.input.input local tx, cr = self.tx, self.cr if not input then return end - while not empty(cr) do - packet.free(reclaim(cr)) - end - pull(cr) while not link.empty(input) and not full(tx) do local p = link.receive(input) - packet.account_free(p) transmit(tx, p) + -- Stimulate breathing: after the kernel is done with the packet buffer + -- it will either fed back from the completion ring onto the free ring, + -- or put back onto the freelist via packet.free_internal; hence, account + -- statistics for freed packet here. + packet.account_free(p) end push(tx) if self.kernel_has_ring_flags then if needs_wakeup(tx) then self:kick() end else - if not empty(tx) then self:kick() end + if full(tx) then self:kick() end + end +end + +function XDP:refill () + local input, output = self.input.input, self.output.output + local fr, cr = self.fr, self.cr + -- If the queue operates in duplex mode (i.e., has both input and output + -- links attached) we feed packet buffers from the completion ring back onto + -- the fill ring. + if input and output then + while not (empty(cr) or full(fr)) do + fill(fr, reclaim(cr)) + end + end + -- If the queue has its output attached we make sure that the kernel does + -- not run out of packet buffers to fill the rx ring with by keeping the + -- fill ring topped up with fresh packets. 
+ -- (If no input is attached, the completion ring is not used, and + -- all packet buffers for rx will be allocated here.) + if output then + while not full(fr) do + fill(fr, packet.allocate()) + end end + -- If the queue has its input attached we release any packet buffers + -- remaining in the completion ring back to the packet freelist. + -- (If not output is attached, the fill ring is not used, and + -- all packet buffers used for tx will be reclaimed here.) + if input then + while not empty(cr) do + -- NB: mandatory free_internal since we do not know the payload length + -- of reclaimed packets. + packet.free_internal(reclaim(cr)) + end + end + push(fr) + pull(cr) end function XDP:kick () From 1b3593f47d3a3d91cf576a71cd6b593aa4ccdc70 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 18 Nov 2019 12:15:49 +0000 Subject: [PATCH 054/209] apps.xdp: add duplex test case, fix random_v4_packets (random_v4_packets generated wrong sized packets) --- src/apps/xdp/xdp.lua | 68 ++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 2b2c967d2e..4ad0282389 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -611,8 +611,11 @@ function selftest () os.exit(engine.test_skipped_code) end snabb_enable_xdp() + engine.report_load() print("test: rxtx") selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + print("test: duplex") + selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) print("test: rxtx_match") selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) print("selftest ok") @@ -628,12 +631,12 @@ local function random_v4_packets (conf) for _, size in ipairs(conf.sizes) do for _=1,100 do local ip = ipv4:new{src=lib.random_bytes(4), - dst=lib.random_bytes(4), - total_length=size-eth:sizeof()} + dst=lib.random_bytes(4)} + ip:total_length(size - eth:sizeof()) local payload_length = ip:total_length() - ip:sizeof() local p = packet.allocate() 
packet.append(p, eth:header(), eth:sizeof()) - packet.append(p, ip:header(), eth:sizeof()) + packet.append(p, ip:header(), ip:sizeof()) packet.append(p, lib.random_bytes(payload_length), payload_length) table.insert(packets, p) end @@ -647,7 +650,7 @@ function selftest_rxtx (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) local synth = require("apps.test.synth") config.app(c, "source", synth.Synth, { packets = random_v4_packets{ - sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,2001}, + sizes = {60}, src = xdpmaca, dst = xdpmacb }}) @@ -668,7 +671,7 @@ function selftest_rxtx (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) end engine.configure(c) print("kernel_has_ring_flags", XDP.kernel_has_ring_flags) - engine.main{ duration = 1 } + engine.main{ duration=1 } engine.report_links() local txtotal, rxtotal = 0, 0 for queue = 0, nqueues-1 do @@ -683,6 +686,61 @@ function selftest_rxtx (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) "Too little packets received") end +function selftest_duplex (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + local c = config.new() + local basic = require("apps.basic.basic_apps") + local synth = require("apps.test.synth") + config.app(c, "source_a", synth.Synth, { + packets = random_v4_packets{ + sizes = {60}, + src = xdpmaca, + dst = xdpmacb + }}) + config.app(c, "source_b", synth.Synth, { + packets = random_v4_packets{ + sizes = {60}, + src = xdpmacb, + dst = xdpmaca + }}) + config.app(c, "sink", basic.Sink) + for queue = 0, nqueues-1 do + local queue_a = xdpdeva.."_q"..queue + local queue_b = xdpdevb.."_q"..queue + config.app(c, queue_a, XDP, { + ifname = xdpdeva, + queue = queue + }) + config.app(c, queue_b, XDP, { + ifname = xdpdevb, + queue = queue + }) + config.link(c, "source_a.output"..queue.." -> "..queue_a..".input") + config.link(c, "source_b.output"..queue.." 
-> "..queue_b..".input") + config.link(c, queue_a..".output -> sink.input_a"..queue) + config.link(c, queue_b..".output -> sink.input_b"..queue) + end + engine.configure(c) + print("kernel_has_ring_flags", XDP.kernel_has_ring_flags) + engine.main{ duration=1 } + engine.report_links() + for label, stream in ipairs{ + ['a->b'] = {'a','b'}, + ['b->a'] = {'b','a'} + } do + local txtotal, rxtotal = 0, 0 + for queue = 0, nqueues-1 do + local tx = link.stats(engine.app_table["source_"..stream[0]].output["output_"..queue]) + local rx = link.stats(engine.app_table.sink.input["input_"..stream[1]..queue]) + assert(tx.rxpackets > 0, "["..label"..] No packets sent on queue: "..queue) + assert(rx.rxpackets > 0, "["..label"..] No packets received on queue: "..queue) + txtotal = txtotal + tx.rxpackets + rxtotal = rxtotal + rx.rxpackets + end + assert(math.abs(txtotal - rxtotal) <= txtotal*.10, -- 10% tolerance + "["..label"..] Too little packets received") + end +end + function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb) local c = config.new() local synth = require("apps.test.synth") From 22d754c14a3e297fdb13b837cdedaf664e54b4bb Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 18 Nov 2019 12:17:19 +0000 Subject: [PATCH 055/209] apps.xdp: cleanup bpf_ld_imm64/LD_MAP_FD construction --- src/apps/xdp/xdp.lua | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 4ad0282389..fa3c733046 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -381,9 +381,8 @@ function XDP:xdp_prog (xskmap) -- r2 = ((struct xdp_md *)ctx)->rx_queue_index { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, -- r1 = xskmap - { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, - imm=band(xskmap:getfd(), 2^32-1) }, - { imm=rshift(xskmap:getfd(), 32) }, + { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, imm=xskmap:getfd() }, + { imm=0 }, -- nb: upper 32 bits of 64-bit (DW) immediate -- r0 = redirect_map(r1, r2, r3) 
{ op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, -- EXIT: From 4e318568e33a5bbd1c54c3c62c72171e0b1b56c7 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 18 Nov 2019 12:19:29 +0000 Subject: [PATCH 056/209] apps.xdp: generate more descriptive errors on bind(2) failure --- src/apps/xdp/xdp.lua | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index fa3c733046..06fbe6e4f3 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -484,7 +484,11 @@ function XDP:create_xsk (ifname, queue) -- flags = bits{ XDP_ZEROCOPY=2 } } ) - assert(xsk.sock:bind(sa, ffi.sizeof(sa))) + local ok, err = xsk.sock:bind(sa, ffi.sizeof(sa)) + if not ok then + error(("Unable to bind AF_XDP socket to %s queue %d (%s)") + :format(ifname, queue, err)) + end return xsk end From e53f314b0a8ba27144c0872783df43970125e9c9 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 18 Nov 2019 14:35:36 +0000 Subject: [PATCH 057/209] apps.xdp: fixup comment about engine breath stimulation --- src/apps/xdp/xdp.lua | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 06fbe6e4f3..49823f3f90 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -540,9 +540,10 @@ function XDP:push () local p = link.receive(input) transmit(tx, p) -- Stimulate breathing: after the kernel is done with the packet buffer - -- it will either fed back from the completion ring onto the free ring, - -- or put back onto the freelist via packet.free_internal; hence, account - -- statistics for freed packet here. + -- it will either be fed back from the completion ring onto the free + -- ring, or put back onto the freelist via packet.free_internal; hence, + -- account statistics for freed packet here in order to signal to the + -- engine that throughput is happening. 
packet.account_free(p) end push(tx) From d225e75b58fb741a827ba79a6b0b9d2b6ca469a2 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 20 Nov 2019 13:57:52 +0000 Subject: [PATCH 058/209] apps.xdp: make num_chunks configurable in snabb_enable_xdp(opt) --- src/apps/xdp/README.md | 13 ++++++++++++- src/apps/xdp/xdp.lua | 7 ++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/src/apps/xdp/README.md b/src/apps/xdp/README.md index 81a3f325ab..e8335e18ae 100644 --- a/src/apps/xdp/README.md +++ b/src/apps/xdp/README.md @@ -45,10 +45,21 @@ Due to a combination of how Snabb uses packet buffers and a limitation of ## Module functions -— Function **snabb_enable_xdp** +— Function **snabb_enable_xdp** *options* Enables “Snabb XDP mode”. See _Caveats_! +### *Options* + +*Options* is a table of configuration options. The following parameters are +supported: + + - `num_chunks`—number of UMEM chunks to allocate. The default is 200,000 which + might not be enough depending on the number of XDP sockets used by the + process. Each instance of the XDP app uses up to around 25,000 chunks at any + time. However, generous over-provisioning (at least double of the expected + residency) is recommended due to buffering in the Snabb engine. 
+ ## Setting up XDP capable devices under Linux ``` diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 49823f3f90..25b3bdf522 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -103,7 +103,12 @@ local function from_umem (offset) end local snabb_xdp_enabled = false -function snabb_enable_xdp () +function snabb_enable_xdp (opt) + opt = opt or {} + if opt.num_chunks then + num_chunks = math.ceil(assert(tonumber(opt.num_chunks), + "num_chunks must be a number")) + end -- Allocate UMEM umem_size = chunk_size * num_chunks umem_backing = ffi.new("char[?]", umem_size + page_size) From d44fb1d5601af6cfb28865cc3d700541cfdc93bd Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 20 Nov 2019 14:02:29 +0000 Subject: [PATCH 059/209] apps.xdp: do not leak packets in XDP:stop() --- src/apps/xdp/xdp.lua | 77 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 25b3bdf522..cb54cdbad2 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -322,6 +322,25 @@ function needs_wakeup (r) return band(r.flags[0], bits{XDP_RING_NEED_WAKEUP=1}) end +-- Rewind routines for transmit/fill. These are used by XDP:stop() to reclaim +-- packet buffers left in-fight after shutdown. + +function rewind_transmit (r) + r.write = tobit(r.write - 1) + local desc = ffi.cast(xdp_desc_ptr_t, r.desc) + local idx = mask(r.write) + return ffi.cast("struct packet *", + -- packet struct begins at payload - packet_overhead + from_umem(desc[idx].addr) - packet_overhead) +end + +function rewind_fill (r) + r.write = tobit(r.write - 1) + local desc = ffi.cast("uint64_t *", r.desc) + local idx = mask(r.write) + return ffi.cast("struct packet *", from_umem(desc[idx])) +end + -- ---- XDP App --------------------------------------------------------- @@ -480,6 +499,19 @@ function XDP:create_xsk (ifname, queue) -- relative UMEM offsets (addr). 
xsk.fr = self:xdp_map_ring(xsk.sock, layouts.fr, "uint64_t", 0x100000000ULL) -- XDP_UMEM_PGOFF_FILL_RING xsk.cr = self:xdp_map_ring(xsk.sock, layouts.cr, "uint64_t", 0x180000000ULL) -- XDP_UMEM_PGOFF_COMPLETION_RING + -- Counters to track packets in-flight through kernel. + -- - rxq is incremented when a packet buffer is enqueued onto the + -- fill ring and decremented when a packet buffer is dequeued from the + -- tx ring. I.e., it tracks the number of unused buffers currently left + -- on the fill ring. + -- - txq is incremented when a packet buffer is enqueued onto the tx ring + -- and decremented then a packet buffer is dequeued from the + -- completion ring. I.e, it tracks number of unused buffers currently + -- left on the tx ring. + -- The rxq and txq tallies are used by XDP:stop() to perform a clean + -- socket shutdown without leaking packet buffers. + xsk.rxq = 0 + xsk.txq = 0 -- Bind socket to interface local sa = ffi.new( sockaddr_xdp_t, @@ -522,28 +554,65 @@ end -- Instance methods function XDP:stop () + -- Close socket. self.sock:close() + -- Reclaim packet buffers left on rings. + -- + -- Problem: we need a way to tell apart which packets buffers on the + -- (write-only) tx and fill rings need to be freed, and which packet buffers + -- were already enqueued to the (read-only) rx and completions rings. + -- Otherwise, we might cause memory corruption by double-freeing packets. + -- + -- We can not however reliably inspect the kernel's internal read cursors + -- for the tx and fill rings. Instead we solve this with a *hack* based on + -- the assumptions that 1) the kernel does not modify the rings after + -- closing the XDP socket; 2) the kernel moves packets from fill to rx rings + -- and tx to completion rings *in-order*; 3) the kernel does not clobber + -- descriptors that have not yet moved to an rx or completion ring. 
+ -- + -- First we flush the rx and completion rings, freeing any dequeued packets, + -- while updating the rxq and txq tallies (see XDP:create_xsk()). + while not empty(self.rx) do + packet.free_internal(receive(self.rx)) + self.rxq = self.rxq - 1 + end + while not empty(self.cr) do + packet.free_internal(reclaim(self.cr)) + self.txq = self.txq - 1 + end + -- Then, we use the final rxq/txq tallies to infer how many packets on the + -- transmit and fill rings are left dangling, and free those amounts of + -- packets (starting from the most recently enqueued, going backwards) from + -- each ring individually. + for _ = 1, self.txq do + packet.free_internal(rewind_transmit(self.tx)) + end + for _ = 1, self.rxq do + packet.free_internal(rewind_fill(self.fr)) + end end function XDP:pull () local output = self.output.output - local rx, fr = self.rx, self.fr + local rx = self.rx self:refill() if not output then return end for _ = 1, engine.pull_npackets do if empty(rx) then break end link.transmit(output, receive(rx)) + self.rxq = self.rxq - 1 end pull(rx) end function XDP:push () local input = self.input.input - local tx, cr = self.tx, self.cr + local tx = self.tx if not input then return end while not link.empty(input) and not full(tx) do local p = link.receive(input) transmit(tx, p) + self.txq = self.txq + 1 -- Stimulate breathing: after the kernel is done with the packet buffer -- it will either be fed back from the completion ring onto the free -- ring, or put back onto the freelist via packet.free_internal; hence, @@ -568,6 +637,8 @@ function XDP:refill () if input and output then while not (empty(cr) or full(fr)) do fill(fr, reclaim(cr)) + self.txq = self.txq - 1 + self.rxq = self.rxq + 1 end end -- If the queue has its output attached we make sure that the kernel does @@ -578,6 +649,7 @@ function XDP:refill () if output then while not full(fr) do fill(fr, packet.allocate()) + self.rxq = self.rxq + 1 end end -- If the queue has its input attached we release any 
packet buffers @@ -589,6 +661,7 @@ function XDP:refill () -- NB: mandatory free_internal since we do not know the payload length -- of reclaimed packets. packet.free_internal(reclaim(cr)) + self.txq = self.txq - 1 end end push(fr) From f8cb80c8cd7c308f8720c0e91630ff2334bcda61 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 20 Nov 2019 14:03:36 +0000 Subject: [PATCH 060/209] apps.xdp: munmap(2) rings in XDP:stop() (do not leak rings mappings) --- src/apps/xdp/xdp.lua | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index cb54cdbad2..fb15c19290 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -169,6 +169,8 @@ local xdp_mmap_offsets_t = local xdp_ring_t = ffi.typeof[[ struct { + char *map; + size_t maplen; uint32_t *producer, *consumer, *flags; void *desc; uint32_t write, read; @@ -533,15 +535,15 @@ end function XDP:xdp_map_ring (socket, layout, desc_t, offset) local prot = "read, write" local flags = "shared, populate" - local length = layout.desc + xdp_ring_ndesc * ffi.sizeof(desc_t) - local map = ffi.cast("char*", assert(S.mmap(nil, length, prot, flags, socket, offset))) local r = ffi.new(xdp_ring_t) - r.producer = ffi.cast("uint32_t *", map + layout.producer) - r.consumer = ffi.cast("uint32_t *", map + layout.consumer) + r.maplen = layout.desc + xdp_ring_ndesc * ffi.sizeof(desc_t) + r.map = assert(S.mmap(nil, r.maplen, prot, flags, socket, offset)) + r.producer = ffi.cast("uint32_t *", r.map + layout.producer) + r.consumer = ffi.cast("uint32_t *", r.map + layout.consumer) if self.kernel_has_ring_flags then - r.flags = ffi.cast("uint32_t *", map + layout.flags) + r.flags = ffi.cast("uint32_t *", r.map + layout.flags) end - r.desc = map + layout.desc + r.desc = r.map + layout.desc return r end @@ -590,6 +592,11 @@ function XDP:stop () for _ = 1, self.rxq do packet.free_internal(rewind_fill(self.fr)) end + -- Unmap rings. 
+ assert(S.munmap(self.rx.map, self.rx.maplen)) + assert(S.munmap(self.tx.map, self.tx.maplen)) + assert(S.munmap(self.fr.map, self.fr.maplen)) + assert(S.munmap(self.cr.map, self.cr.maplen)) end function XDP:pull () From e06c82fab0e81938d44a13066922453e53da3366 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 20 Nov 2019 15:18:09 +0000 Subject: [PATCH 061/209] apps.intel_avf: use new PCI acccess API (fix integration breakage) This updates the AVF driver to use the new PCI resource acquisition API from https://github.com/snabbco/snabb/pull/1436 Fixes the currently broken :new() routine. The breakage happened when the avf branch was integrated with next, and I failed to test the merge. ^^' --- src/apps/intel_avf/intel_avf.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index cef2993d2e..ed684a0cef 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -730,11 +730,11 @@ function Intel_avf:new(conf) self = setmetatable(self, { __index = Intel_avf }) self:supported_hardware() - self.base, self.fd = pci.map_pci_memory_unlocked(self.pciaddress, 0) - self:load_registers() + self.fd = pci.open_pci_resource_unlocked(self.pciaddress, 0) pci.unbind_device_from_linux(self.pciaddress) pci.set_bus_master(self.pciaddress, true) - pci.disable_bus_master_cleanup(self.pciaddress) + self.base = pci.map_pci_memory(self.fd) + self:load_registers() -- wait for the nic to be ready, setup the mailbox and then reset it -- that way it doesn't matter what state you where given the card From 7722f0d8bf8a5de88de3853a967a14161a35b285 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 20 Nov 2019 15:25:49 +0000 Subject: [PATCH 062/209] apps.intel_avf: free resources on shutdown (fix leak) Free PCI resources and remaining packets on rx/tx queues on shutdown. This fixes a resource leak. 
--- src/apps/intel_avf/intel_avf.lua | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index ed684a0cef..52716a282f 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -740,13 +740,13 @@ function Intel_avf:new(conf) -- that way it doesn't matter what state you where given the card self:wait_for_vfgen_rstat() self:mbox_setup() - self:stop() + self:reset() -- FIXME -- I haven't worked out why the sleep is required but without it -- self_mbox_set_version hangs indefinitely --C.sleep(1) - -- See elaboration in Intel_avf:stop() + -- See elaboration in Intel_avf:reset() -- setup the nic for real self:mbox_setup() @@ -771,7 +771,7 @@ function Intel_avf:link() end end -function Intel_avf:stop() +function Intel_avf:reset() -- From "Appendix A Virtual Channel Protocol": -- VF sends this request to PF with no parameters PF does NOT respond! VF -- driver must delay then poll VFGEN_RSTAT register until reset completion @@ -783,6 +783,21 @@ function Intel_avf:stop() -- enough in some cases, two seconds has always worked so far. C.usleep(2e6) self:wait_for_vfgen_rstat() +end + +function Intel_avf:stop() + self:reset() + pci.set_bus_master(self.pciaddress, false) + pci.close_pci_resource(self.fd, self.base) + -- Free packets remaining in TX/RX queues. + for i = 0, self.ring_buffer_size-1 do + if self.txqueue[i] ~= nil then + packet.free(self.txqueue[i]) + end + end + for i = 0, self.ring_buffer_size-1 do + packet.free(self.rxqueue[i]) + end -- Unlink SHM alias. 
shm.unlink("pci/"..self.pciaddress) end From 6c330ecf5211fc2cbc80ce4bafb8c5e63285bd45 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 21 Nov 2019 21:54:15 +0000 Subject: [PATCH 063/209] lj_syscall: add BPF_OBJ_* syscall wrappers, extend bpf_attr union --- lib/ljsyscall/syscall/linux/ffi.lua | 1 + lib/ljsyscall/syscall/linux/syscalls.lua | 25 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/lib/ljsyscall/syscall/linux/ffi.lua b/lib/ljsyscall/syscall/linux/ffi.lua index 86026c9141..2df6267494 100644 --- a/lib/ljsyscall/syscall/linux/ffi.lua +++ b/lib/ljsyscall/syscall/linux/ffi.lua @@ -538,6 +538,7 @@ union bpf_attr { struct { uint64_t pathname __attribute__((aligned(8))); uint32_t bpf_fd; + uint32_t file_flags; }; } __attribute__((aligned(8))); struct perf_event_attr { diff --git a/lib/ljsyscall/syscall/linux/syscalls.lua b/lib/ljsyscall/syscall/linux/syscalls.lua index 17e8911e57..8766481376 100644 --- a/lib/ljsyscall/syscall/linux/syscalls.lua +++ b/lib/ljsyscall/syscall/linux/syscalls.lua @@ -887,6 +887,31 @@ if C.bpf then end return ret end + function S.bpf_obj_pin(path, fd, flags) + local attr = t.bpf_attr1() + local pathname = ffi.new("char[?]", #path+1) + ffi.copy(pathname, path) + attr[0].pathname = ptr_to_u64(pathname) + attr[0].bpf_fd = getfd(fd) + attr[0].file_flags = flags or 0 + local ret = S.bpf(c.BPF_CMD.OBJ_PIN, attr) + if ret ~= 0 then + return nil, t.error(errno()) + end + return ret + end + function S.bpf_obj_get(path, flags) + local attr = t.bpf_attr1() + local pathname = ffi.new("char[?]", #path+1) + ffi.copy(pathname, path) + attr[0].pathname = ptr_to_u64(pathname) + attr[0].file_flags = flags or 0 + local ret = S.bpf(c.BPF_CMD.OBJ_GET, attr) + if ret < 0 then + return nil, t.error(errno()) + end + return retfd(ret) + end end -- Linux performance monitoring From ea955a8bca79e255f1aab24a85b859677963e4fa Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 21 Nov 2019 21:57:06 +0000 Subject: [PATCH 064/209] 
apps.xdp: allow sharing interfaces between Snabb processes This commit uses BPF object pinning to expose per-interface xskmaps to other Snabb processes so they can attach to free queues on interfaces initialized by other Snabb processes. --- src/apps/xdp/xdp.lua | 127 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 113 insertions(+), 14 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index fb15c19290..7ce28c8dc9 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -352,7 +352,6 @@ XDP = { queue = {default=0} -- interface queue (zero based) }, -- Class variables: - queues = {}, -- queue-to-socket maps for each interface kernel_has_ring_flags = true -- feature detection status for descriptor ring flags } @@ -361,18 +360,58 @@ XDP = { function XDP:new (conf) assert(snabb_xdp_enabled, "Snabb XDP mode must be enabled.") -- Ensure interface is initialized for XDP usage. - if not self.queues[conf.ifname] then - self.queues[conf.ifname] = self:create_xskmap() - self:initialize_xdp(conf.ifname, self.queues[conf.ifname]) - end + local lockfd, mapfd = self:open_interface(conf.ifname) -- Create XDP socket (xsk) for queue. - local xsk = self:create_xsk(conf.ifname, conf.queue) + local xsk = self:create_xsk(conf.ifname, lockfd, conf.queue) -- Attach the socket to queue in the BPF map. - self:set_queue_socket(self.queues[conf.ifname], conf.queue, xsk) + self:set_queue_socket(mapfd, conf.queue, xsk) + mapfd:close() -- not longer needed -- Finish initialization. return setmetatable(xsk, {__index=XDP}) end +function XDP:open_interface (ifname) + -- Open an interface-dependent file we know should exist to use as a + -- Snabb-wide lock. The contents of the file are really irrelevant here. + -- However, we depend on the file not being locked by other applications in + -- general. 
:-) + local lockfd = S.open("/sys/class/net/"..ifname.."/operstate", "rdonly") + local mapfd, progfd + local xskmap_path = "/sys/fs/bpf/snabb/"..ifname.."/xskmap" + local prog_path = "/sys/fs/bpf/snabb/"..ifname.."/xdp" + -- If the open above failed we assume that no device by ifname exists. + assert(lockfd, "Could not open interface: "..ifname.." (does it exist?)") + if lockfd:flock("ex, nb") then + -- If we get an exclusive lock we know that no other Snabb processes are + -- using the interface so its safe to setup the interface and replace any + -- existsing BPF XDP program/maps attached to it. + S.mkdir("/sys/fs/bpf/snabb", "rwxu, rgrp, xgrp, roth, xoth") + S.util.rm("/sys/fs/bpf/snabb/"..ifname) + S.mkdir("/sys/fs/bpf/snabb/"..ifname, "rwxu, rgrp, xgrp, roth, xoth") + -- Create xskmap and XDP program to run on the NIC. + mapfd = self:create_xskmap() + progfd = self:xdp_prog(mapfd) + self:set_link_xdp(ifname, progfd) + -- Pin xskmap so it can be accessed by other Snabb processes to attach to + -- the interface. Also pin the XDP program, just 'cause. + assert(S.bpf_obj_pin(xskmap_path, mapfd)) + assert(S.bpf_obj_pin(prog_path, progfd)) + progfd:close() -- no longer needed + lockfd:flock("sh") -- share lock + else + lockfd:flock("sh") + -- Wait for the lock to be shared: once it is no longer held exclusively + -- we know that the interface is setup and ready to use. + -- Get the currently pinned xskmap to insert our XDP socket into. + mapfd = assert(S.bpf_obj_get(xskmap_path)) + end + -- lockfd: holds a shared lock for as long as we do not close it, signaling + -- other Snabb processes that the interface is in use. + -- mapfd: the xskmap for the interface used to + -- attach XDP sockets to queues. 
+ return lockfd, mapfd +end + function XDP:create_xskmap () local klen, vlen = ffi.sizeof("int"), ffi.sizeof("int") local nentries = 128 @@ -391,10 +430,6 @@ function XDP:create_xskmap () error("Failed to create BPF map: "..tostring(err)) end -function XDP:initialize_xdp (ifname, xskmap) - self:set_link_xdp(ifname, self:xdp_prog(xskmap)) -end - function XDP:xdp_prog (xskmap) -- Assemble and load XDP BPF program. local c, f, m, a, s, j, fn = @@ -457,8 +492,8 @@ function XDP:set_link_xdp(ifname, prog) netlink:close() end -function XDP:create_xsk (ifname, queue) - local xsk = { sock = assert(S.socket('xdp', 'raw')) } +function XDP:create_xsk (ifname, lockfd, queue) + local xsk = { sock = assert(S.socket('xdp', 'raw')), lockfd = lockfd } -- Register UMEM. local umem_reg = ffi.new( xdp_umem_reg_t, @@ -597,6 +632,8 @@ function XDP:stop () assert(S.munmap(self.tx.map, self.tx.maplen)) assert(S.munmap(self.fr.map, self.fr.maplen)) assert(S.munmap(self.cr.map, self.cr.maplen)) + -- Close interface lockfd. See XDP:open_interface(). + self.lockfd:close() end function XDP:pull () @@ -694,7 +731,7 @@ function selftest () local xdpmaca = lib.getenv("SNABB_XDP_MAC0") local xdpdevb = lib.getenv("SNABB_XDP1") local xdpmacb = lib.getenv("SNABB_XDP_MAC1") - local nqueues = lib.getenv("SNABB_XDP_NQUEUES") or 1 + local nqueues = tonumber(lib.getenv("SNABB_XDP_NQUEUES")) or 1 if not (xdpdeva and xdpmaca and xdpdevb and xdpmacb) then print("SNABB_XDP0 and SNABB_XDP1 must be set. 
Skipping selftest.") os.exit(engine.test_skipped_code) @@ -707,6 +744,10 @@ function selftest () selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) print("test: rxtx_match") selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + if nqueues > 1 then + print("test: share_interface") + selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + end print("selftest ok") end @@ -855,3 +896,61 @@ function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb) engine.report_apps() assert(#engine.app_table.match:errors() == 0, "Match errors.") end + +function selftest_share_interface_worker (xdpdev, queue) + snabb_enable_xdp() + local c = config.new() + local basic = require("apps.basic.basic_apps") + local recv = xdpdev.."_q"..queue + config.app(c, recv, XDP, { + ifname = xdpdev, + queue = queue + }) + config.app(c, "sink", basic.Sink) + config.link(c, recv..".output -> sink.input") + engine.configure(c) + engine.main{ duration=.1, no_report = true } + print("[worker links]") + engine.report_links() + assert(link.stats(engine.app_table.sink.input.input).rxpackets > 0, + "No packets received on "..recv.." in worker.") +end + +function selftest_share_interface (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + local c = config.new() + local worker = require("core.worker") + local basic = require("apps.basic.basic_apps") + local synth = require("apps.test.synth") + config.app(c, "source", synth.Synth, { + packets = random_v4_packets{ + sizes = {60}, + src = xdpmaca, + dst = xdpmacb + }}) + config.app(c, "sink", basic.Sink) + for queue = 0, nqueues-2 do + local queue_a = xdpdeva.."_q"..queue + local queue_b = xdpdevb.."_q"..queue + config.app(c, queue_a, XDP, { + ifname = xdpdeva, + queue = queue + }) + config.app(c, queue_b, XDP, { + ifname = xdpdevb, + queue = queue + }) + config.link(c, "source.output"..queue.." 
-> "..queue_a..".input") + config.link(c, queue_b..".output -> sink.input"..queue) + end + engine.configure(c) + worker.start('worker', ("require('apps.xdp.xdp').selftest_share_interface_worker('%s', %d)") + :format(xdpdevb, nqueues-1)) + engine.main{ done=function () return not worker.status().worker.alive end, + no_report = true } + local worker_status = worker.status().worker.status + print("[parent links]") + engine.report_links() + if worker_status ~= 0 then + os.exit(worker_status) + end +end From 01f11e9705e2adcea0423eb53dbca04952caf224 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 25 Nov 2019 18:18:28 +0100 Subject: [PATCH 065/209] pflua: do not cdef struct bpf_insn (collides with ljsyscall) --- lib/pflua/src/pf/types.lua | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/lib/pflua/src/pf/types.lua b/lib/pflua/src/pf/types.lua index 4ad1758864..5a727b14f8 100644 --- a/lib/pflua/src/pf/types.lua +++ b/lib/pflua/src/pf/types.lua @@ -38,10 +38,6 @@ struct pcap_pkthdr { -- with the high-bit set as negative int32_t values, so we do the same -- for all of our 32-bit values including the "k" field in BPF -- instructions.
-ffi.cdef[[ -struct bpf_insn { uint16_t code; uint8_t jt, jf; int32_t k; }; -struct bpf_program { uint32_t bf_len; struct bpf_insn *bf_insns; }; -]] local bpf_program_mt = { __len = function (program) return program.bf_len end, __index = function (program, idx) @@ -50,8 +46,8 @@ local bpf_program_mt = { end } -bpf_insn = ffi.typeof("struct bpf_insn") -bpf_program = ffi.metatype("struct bpf_program", bpf_program_mt) +bpf_insn = ffi.typeof("struct { uint16_t code; uint8_t jt, jf; int32_t k; }") +bpf_program = ffi.metatype("struct { uint32_t bf_len; struct bpf_insn *bf_insns; }", bpf_program_mt) pcap_record = ffi.typeof("struct pcap_record") pcap_pkthdr = ffi.typeof("struct pcap_pkthdr") From ae35674a1aecde34fb89430f20d2a547805039a7 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 3 Dec 2019 13:57:09 +0100 Subject: [PATCH 066/209] core.app: add clearvmprofiles() function --- src/core/app.lua | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/core/app.lua b/src/core/app.lua index 44dd724a59..80a4a9a29c 100644 --- a/src/core/app.lua +++ b/src/core/app.lua @@ -82,7 +82,7 @@ int vmprofile_get_profile_size(); void vmprofile_set_profile(void *counters); ]] -local vmprofile_t = ffi.new("uint8_t["..C.vmprofile_get_profile_size().."]") +local vmprofile_t = ffi.typeof("uint8_t["..C.vmprofile_get_profile_size().."]") local vmprofiles = {} local function getvmprofile (name) @@ -96,6 +96,16 @@ function setvmprofile (name) C.vmprofile_set_profile(getvmprofile(name)) end +function clearvmprofiles () + jit.vmprofile.stop() + for name, profile in pairs(vmprofiles) do + shm.unmap(profile) + shm.unlink("vmprofile/"..name..".vmprofile") + vmprofiles[name] = nil + end + jit.vmprofile.start() +end + -- True when the engine is running the breathe loop. 
local running = false From 1124b80f702ce46e3a7818b15ff7f0da3a7f3780 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 3 Dec 2019 13:57:54 +0100 Subject: [PATCH 067/209] lib.timers.ingress_drop_monitor: clear vmprofiles on flush --- src/lib/timers/ingress_drop_monitor.lua | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lib/timers/ingress_drop_monitor.lua b/src/lib/timers/ingress_drop_monitor.lua index c1268fc04a..b476111070 100644 --- a/src/lib/timers/ingress_drop_monitor.lua +++ b/src/lib/timers/ingress_drop_monitor.lua @@ -88,7 +88,10 @@ function IngressDropMonitor:jit_flush_if_needed () print(msg) self.ingress_packet_drop_alarm:raise({alarm_text=msg}) - if self.action == 'flush' then jit.flush() end + if self.action == 'flush' then + jit.flush() + engine.clearvmprofiles() + end end function IngressDropMonitor:timer(interval) From 8f2a2161da091205a1ec3f4350fec75eab7b993d Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 3 Dec 2019 16:13:52 +0100 Subject: [PATCH 068/209] lib.timers.ingress_drop_monitor: reset last_value on drop-free period --- src/lib/timers/ingress_drop_monitor.lua | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/lib/timers/ingress_drop_monitor.lua b/src/lib/timers/ingress_drop_monitor.lua index b476111070..b8b82e6e53 100644 --- a/src/lib/timers/ingress_drop_monitor.lua +++ b/src/lib/timers/ingress_drop_monitor.lua @@ -21,11 +21,13 @@ local IngressDropMonitor = {} function new(args) local ret = { threshold = args.threshold or 100000, + threshold_timeout = args.threshold_timeout or 10, wait = args.wait or 30, grace_period = args.grace_period or 10, action = args.action or 'flush', tips_url = args.tips_url or default_tips_url, last_flush = now(), -- Start in the grace period. 
+ last_drop = now(), last_value = ffi.new('uint64_t[1]'), current_value = ffi.new('uint64_t[1]'), } @@ -71,6 +73,12 @@ function IngressDropMonitor:jit_flush_if_needed () self.last_value[0] = self.current_value[0] return end + if self.last_value[0] < self.current_value[0] then + self.last_drop = now() + elseif now() - self.last_drop > self.threshold_timeout then + -- Reset last_value if no drops occurred within threshold_timeout. + self.last_value[0] = self.current_value[0] + end if self.current_value[0] - self.last_value[0] < self.threshold then self.ingress_packet_drop_alarm:clear() return From 63968f69c0d3c730d2a7d173c4fc83066795f5da Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 8 Oct 2019 15:13:00 +0200 Subject: [PATCH 069/209] raptorjit: clear all hotcounts every second We reset all hotcounts every second. This is a rough way to establish a relation with elapsed time so that hotcounts provide a measure of frequency. The concrete goal is to ensure that the JIT will trace code that becomes hot over a short duration, but not code that becomes hot over, say, the course of an hour. --- lib/luajit/src/lj_trace.c | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/lib/luajit/src/lj_trace.c b/lib/luajit/src/lj_trace.c index ccb7629841..7de3a4eaf4 100644 --- a/lib/luajit/src/lj_trace.c +++ b/lib/luajit/src/lj_trace.c @@ -6,6 +6,8 @@ #define lj_trace_c #define LUA_CORE +#include + #include "lj_obj.h" @@ -47,6 +49,42 @@ void lj_trace_err_info(jit_State *J, TraceError e) lj_err_throw(J->L, LUA_ERRRUN); } +/* -- Hotcount decay ------------------------------------------------------ */ + +/* We reset all hotcounts every second. This is a rough way to establish a +** relation with elapsed time so that hotcounts provide a measure of frequency. +** +** The concrete goal is to ensure that the JIT will trace code that becomes hot +** over a short duration, but not code that becomes hot over, say, the course +** of an hour. 
+** +** The "one second" constant is certainly tunable. +** */ + +static inline uint64_t gettime_ns (void) +{ + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +/* Timestamp (ns) of last hotcount reset. */ +static uint64_t hotcount_decay_ts; + +/* Decay hotcounts every second. */ +int hotcount_decay (jit_State *J) +{ + uint64_t ts = gettime_ns(); + int decay = (ts - hotcount_decay_ts) > 1000000000LL; /* 1s elapsed? */ + if (decay) { + /* Reset hotcounts. */ + lj_dispatch_init_hotcount(J2G(J)); + hotcount_decay_ts = ts; + } + return decay; +} + + /* -- Trace management ---------------------------------------------------- */ /* The current trace is first assembled in J->cur. The variable length @@ -277,6 +315,8 @@ int lj_trace_flushall(lua_State *L) memset(J->penalty, 0, sizeof(J->penalty)); /* Reset hotcounts. */ lj_dispatch_init_hotcount(J2G(J)); + /* Initialize hotcount decay timestamp. */ + hotcount_decay_ts = gettime_ns(); /* Free the whole machine code and invalidate all exit stub groups. */ lj_mcode_free(J); memset(J->exitstubgroup, 0, sizeof(J->exitstubgroup)); @@ -655,6 +695,9 @@ void lj_trace_ins(jit_State *J, const BCIns *pc) void lj_trace_hot(jit_State *J, const BCIns *pc) { /* Note: pc is the interpreter bytecode PC here. It's offset by 1. */ + if (hotcount_decay(J)) + /* Check for hotcount decay, do nothing if hotcounts have decayed. */ + return; ERRNO_SAVE /* Reset hotcount. */ hotcount_set(J2GG(J), pc, J->param[JIT_P_hotloop]*HOTCOUNT_LOOP); @@ -671,6 +714,9 @@ void lj_trace_hot(jit_State *J, const BCIns *pc) /* Check for a hot side exit. If yes, start recording a side trace. */ static void trace_hotside(jit_State *J, const BCIns *pc) { + if (hotcount_decay(J)) + /* Check for hotcount decay, do nothing if hotcounts have decayed. 
*/ + return; SnapShot *snap = &traceref(J, J->parent)->snap[J->exitno]; if (!(J2G(J)->hookmask & HOOK_GC) && isluafunc(curr_func(J->L)) && From 9a787b544bb87d0ed1e86dc7cf93845d4371ecad Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 4 Dec 2019 12:45:44 +0000 Subject: [PATCH 070/209] Fix snabb top interface view for multiple queues This change makes it so that statistics are correctly core-specific. --- src/program/top/top.lua | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/program/top/top.lua b/src/program/top/top.lua index 38c6d2ea20..4ae37c5286 100644 --- a/src/program/top/top.lua +++ b/src/program/top/top.lua @@ -606,9 +606,24 @@ function compute_display_tree.interface(tree, prev, dt, t) -- \- pci device, macaddr, mtu, speed -- RX: PPS, bps, %, [drops/s] -- TX: PPS, bps, %, [drops/s] + function queue_local_key(key, counters) + local queue_key + local stem = ({rxdrop='rxdrops'})[key] or key + for i=0,15 do + local k = 'q'..i..'_'..stem + if counters[k] then + if queue_key then + return key + end + queue_key = k + end + end + return queue_key or key + end local function rate(key, counters, prev) if not counters then return 0/0 end if not counters[key] then return 0/0 end + key = queue_local_key(key, counters) local v, rrd = counters[key], nil prev = prev and prev[key] if is_leaf(v) then From 7a0c6c80c6e6377b7a7f80aa096b2757c2bb1f70 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 4 Dec 2019 13:32:53 +0000 Subject: [PATCH 071/209] Disable vmprofile on lwAFTR by default Add a --profile option to control whether vmprofile is enabled or not. 
--- src/core/app.lua | 20 +++++++++++++------- src/lib/ptree/worker.lua | 8 +++----- src/lib/scheduling.lua | 7 +++++++ src/program/lwaftr/run/README | 5 ++--- src/program/lwaftr/run/run.lua | 8 ++++---- 5 files changed, 29 insertions(+), 19 deletions(-) diff --git a/src/core/app.lua b/src/core/app.lua index 80a4a9a29c..b997307586 100644 --- a/src/core/app.lua +++ b/src/core/app.lua @@ -76,6 +76,8 @@ busywait = false -- Profiling with vmprofile -------------------------------- +vmprofile_enabled = true + -- Low-level FFI ffi.cdef[[ int vmprofile_get_profile_size(); @@ -93,17 +95,21 @@ local function getvmprofile (name) end function setvmprofile (name) - C.vmprofile_set_profile(getvmprofile(name)) + if vmprofile_enabled then + C.vmprofile_set_profile(getvmprofile(name)) + end end function clearvmprofiles () - jit.vmprofile.stop() - for name, profile in pairs(vmprofiles) do - shm.unmap(profile) - shm.unlink("vmprofile/"..name..".vmprofile") - vmprofiles[name] = nil + if vmprofile_enabled then + jit.vmprofile.stop() + for name, profile in pairs(vmprofiles) do + shm.unmap(profile) + shm.unlink("vmprofile/"..name..".vmprofile") + vmprofiles[name] = nil + end + jit.vmprofile.start() end - jit.vmprofile.start() end -- True when the engine is running the breathe loop. 
diff --git a/src/lib/ptree/worker.lua b/src/lib/ptree/worker.lua index 971213f230..3925fc642d 100644 --- a/src/lib/ptree/worker.lua +++ b/src/lib/ptree/worker.lua @@ -21,6 +21,7 @@ local worker_config_spec = { duration = {}, measure_latency = {default=true}, measure_memory = {default=true}, + profile = {default=true}, no_report = {default=false}, report = {default={showapps=true,showlinks=true}}, Hz = {default=1000}, @@ -46,6 +47,7 @@ function new_worker (conf) if conf.measure_memory then timer.activate(memory_info.HeapSizeMonitor.new():timer()) end + engine.vmprofile_enabled = conf.profile return ret end @@ -100,16 +102,12 @@ function Worker:handle_actions_from_manager() end function Worker:main () - local vmprofile = require("jit.vmprofile") local stop = engine.now() + self.duration local next_time = engine.now() - -- Setup vmprofile. - engine.setvmprofile("engine") - vmprofile.start() - if not engine.auditlog_enabled then engine.enable_auditlog() end + engine.setvmprofile("engine") repeat self.breathe() if next_time < engine.now() then diff --git a/src/lib/scheduling.lua b/src/lib/scheduling.lua index ab3ac2f0d8..2ddb9f3811 100644 --- a/src/lib/scheduling.lua +++ b/src/lib/scheduling.lua @@ -16,6 +16,7 @@ local scheduling_opts = { cpu = {}, -- CPU index (integer). real_time = {}, -- Boolean. ingress_drop_monitor = {}, -- Action string: one of 'flush' or 'warn'. + profile = {default=true}, -- Boolean. busywait = {default=true}, -- Boolean. eval = {} -- String. 
} @@ -42,6 +43,12 @@ function sched_apply.busywait (busywait) engine.busywait = busywait end +function sched_apply.profile (profile) + engine.vmprofile_enabled = profile + local vmprofile = require('jit.vmprofile') + if profile then vmprofile.start() else vmprofile.stop() end +end + function sched_apply.eval (str) loadstring(str)() end diff --git a/src/program/lwaftr/run/README b/src/program/lwaftr/run/README index 62d0eef32e..534b20e581 100644 --- a/src/program/lwaftr/run/README +++ b/src/program/lwaftr/run/README @@ -49,11 +49,10 @@ Optional arguments: Optional arguments for debugging and profiling: -v Verbose (repeat for more verbosity). + --profile Enable the low-overhead sampling + profiler. -t FILE, --trace FILE Record a trace of any run-time "snabb config" commands to FILE. - -jv, -jv=FILE Print out when traces are recorded. - -jp, -jp=MODE,FILE Profile the system by method. - -jtprof Profile the system by trace. -b FILENAME, --bench-file FILENAME Write any benchmarking data to FILENAME. -D SECONDS Stop after SECONDS, for debugging diff --git a/src/program/lwaftr/run/run.lua b/src/program/lwaftr/run/run.lua index cdf11b9f72..5690aac2b1 100644 --- a/src/program/lwaftr/run/run.lua +++ b/src/program/lwaftr/run/run.lua @@ -50,7 +50,7 @@ function parse_args(args) local conf_file, v4, v6 local ring_buffer_size local opts = { verbosity = 0 } - local scheduling = { ingress_drop_monitor = 'flush' } + local scheduling = { ingress_drop_monitor = 'flush', profile = false } local handlers = {} function handlers.n (arg) opts.name = assert(arg) end function handlers.v () opts.verbosity = opts.verbosity + 1 end @@ -100,13 +100,13 @@ function parse_args(args) .." 
(valid values: flush, warn, off)") end end - function handlers.j(arg) scheduling.j = arg end + function handlers.profile() scheduling.profile = true end function handlers.h() show_usage(0) end - lib.dogetopt(args, handlers, "b:c:vD:yhir:n:j:t:", + lib.dogetopt(args, handlers, "b:c:vD:yhir:n:t:", { conf = "c", name = "n", cpu = 1, v4 = 1, v6 = 1, ["on-a-stick"] = 1, virtio = "i", ["ring-buffer-size"] = "r", ["real-time"] = 0, mirror = 1, ["ingress-drop-monitor"] = 1, - verbose = "v", trace = "t", ["bench-file"] = "b", + verbose = "v", trace = "t", ["bench-file"] = "b", ["profile"] = 0, duration = "D", hydra = "y", help = "h" }) if ring_buffer_size ~= nil then if opts.virtio_net then From dfadac832fcbd730fc90c24a8d5641b29ac1916f Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 4 Dec 2019 15:19:38 +0100 Subject: [PATCH 072/209] raptorjit: amend prev. commit, also clear trace exit hotcounts --- lib/luajit/src/lj_trace.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/lib/luajit/src/lj_trace.c b/lib/luajit/src/lj_trace.c index 7de3a4eaf4..8829296862 100644 --- a/lib/luajit/src/lj_trace.c +++ b/lib/luajit/src/lj_trace.c @@ -61,6 +61,8 @@ void lj_trace_err_info(jit_State *J, TraceError e) ** The "one second" constant is certainly tunable. ** */ +static void trace_clearsnapcounts(jit_State *J); /* Forward decl. */ + static inline uint64_t gettime_ns (void) { struct timespec ts; @@ -79,6 +81,7 @@ int hotcount_decay (jit_State *J) if (decay) { /* Reset hotcounts. */ lj_dispatch_init_hotcount(J2G(J)); + trace_clearsnapcounts(J); hotcount_decay_ts = ts; } return decay; @@ -358,6 +361,20 @@ void lj_trace_freestate(global_State *g) lj_mcode_free(J); } +/* Clear all trace snap counts (side-exit hot counters). */ +static void trace_clearsnapcounts(jit_State *J) +{ + int i, s; + GCtrace *t; + /* Clear hotcounts for all snapshots of all traces. 
*/ + for (i = 1; i < TRACE_MAX; i++) { + t = traceref(J, i); + if (t != NULL) + for (s = 0; s < t->nsnap; s++) + t->snap[s].count = 0; + } +} + /* -- Penalties and blacklisting ------------------------------------------ */ /* Blacklist a bytecode instruction. */ From 648c67a5c78b4806befa1097cf12f31ae3caede0 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 4 Dec 2019 15:18:29 +0000 Subject: [PATCH 073/209] Add 2019.06.02 changelog entry --- src/program/lwaftr/doc/CHANGELOG.md | 45 +++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/program/lwaftr/doc/CHANGELOG.md b/src/program/lwaftr/doc/CHANGELOG.md index 6635173ce4..1256a7ca0e 100644 --- a/src/program/lwaftr/doc/CHANGELOG.md +++ b/src/program/lwaftr/doc/CHANGELOG.md @@ -1,5 +1,50 @@ # Change Log +## [2019.06.02] + +### Notable changes + + * Fix `snabb top` to correctly display per-worker statistics for + instances of the lwAFTR running with receive-side scaling (RSS). + See https://github.com/Igalia/snabb/pull/1237. + + * Fix a problem related to an interaction between late trace + compilation and the ingress drop monitor. + + For context, Snabb uses LuaJIT, which is a just-in-time compiler. + LuaJIT compiles program segments called traces. Traces can jump to + each other, and thereby form a graph. The shape of the trace graph + can have important performance impacts on a network function, but + building the optimal graph shape is fundamentally hard. Usually + LuaJIT does a good job, but if a network function is dropping + packets, Snabb's "ingress drop monitor" will and ask LuaJIT to + re-learn the graph of traces, in the hopes that this self-healing + process will fix the packet loss situation. + + Unfortunately, the self-healing process has some poor interactions + with so-called "long tail" traces -- traces that aren't taking an + important amount of time, but which LuaJIT might decide to compile a + few seconds into the running of a network function. 
Compiling a + trace can cause a latency spike and dropped packets, so the work of + compiling these long-tail traces can in fact be interpreted as a + packet loss situation, thereby triggering the self-healing process, + leading to a pathologically repeating large packet loss situation. + + The right answer is for LuaJIT to avoid the latency cost for + long-tail trace compilation. While this might make long-tail traces + run not as fast as they would if they were compiled, these traces + take so little time anyway that it doesn't matter enough to pay the + cost of trace compilation. + + See https://github.com/Igalia/snabb/pull/1236 and + https://github.com/Igalia/snabb/pull/1239 for full details. + + * Disable profiling by default. The version of LuaJIT that Snabb uses + includes a facility for online profiling of network functions. This + facility is low-overhead but not no-overhead. We have disabled it by + default on the lwAFTR; it can be enabled by passing the --profile + option. See https://github.com/Igalia/snabb/pull/1238. + ## [2019.06.01] ### Notable changes From 6524dec902671467e1434cfd679401323ec081fc Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Wed, 4 Dec 2019 15:22:50 +0000 Subject: [PATCH 074/209] fix typo --- src/program/lwaftr/doc/CHANGELOG.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/program/lwaftr/doc/CHANGELOG.md b/src/program/lwaftr/doc/CHANGELOG.md index 1256a7ca0e..1bc6c6a115 100644 --- a/src/program/lwaftr/doc/CHANGELOG.md +++ b/src/program/lwaftr/doc/CHANGELOG.md @@ -17,9 +17,9 @@ can have important performance impacts on a network function, but building the optimal graph shape is fundamentally hard. Usually LuaJIT does a good job, but if a network function is dropping - packets, Snabb's "ingress drop monitor" will and ask LuaJIT to - re-learn the graph of traces, in the hopes that this self-healing - process will fix the packet loss situation. 
+ packets, Snabb's "ingress drop monitor" will ask LuaJIT to re-learn + the graph of traces, in the hopes that this self-healing process will + fix the packet loss situation. Unfortunately, the self-healing process has some poor interactions with so-called "long tail" traces -- traces that aren't taking an From b7c50cbba861afde71b5d20e278c34f23ae1ebdb Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 4 Dec 2019 19:25:16 +0100 Subject: [PATCH 075/209] raptorjit: amend prev. commit, do not clear SNAPCOUNT_DONE --- lib/luajit/src/lj_trace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/luajit/src/lj_trace.c b/lib/luajit/src/lj_trace.c index 8829296862..3f32710c20 100644 --- a/lib/luajit/src/lj_trace.c +++ b/lib/luajit/src/lj_trace.c @@ -371,7 +371,8 @@ static void trace_clearsnapcounts(jit_State *J) t = traceref(J, i); if (t != NULL) for (s = 0; s < t->nsnap; s++) - t->snap[s].count = 0; + if (t->snap[s].count != SNAPCOUNT_DONE) + t->snap[s].count = 0; } } From 828fa4f4fe35a18579e35aabf52f1cf4ed42d30b Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Thu, 5 Dec 2019 08:40:59 +0000 Subject: [PATCH 076/209] lwAFTR version 2019.06.02 --- .version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.version b/.version index 16ff8e1a24..73ea2eb7dc 100644 --- a/.version +++ b/.version @@ -1 +1 @@ -2019.06.01 +2019.06.02 From bd84224c215de356a284ad0109fcdd6eaef17184 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 11 Dec 2019 12:39:58 +0100 Subject: [PATCH 077/209] apps.xdp: expose driver reference --- src/apps/xdp/xdp.lua | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 7ce28c8dc9..d33f0d0662 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -355,6 +355,10 @@ XDP = { kernel_has_ring_flags = true -- feature detection status for descriptor ring flags } +-- The `driver' variable is used as a reference to the driver class in +-- order to interchangeably 
use NIC drivers. +driver = XDP + -- Class methods function XDP:new (conf) From 1a9eba5237c806e27ce8f908642e146fb308fc92 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 11 Dec 2019 12:40:23 +0100 Subject: [PATCH 078/209] lib.scheduling: add enable_xdp option --- src/lib/scheduling.lua | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/lib/scheduling.lua b/src/lib/scheduling.lua index ab3ac2f0d8..33f05eb5e6 100644 --- a/src/lib/scheduling.lua +++ b/src/lib/scheduling.lua @@ -17,6 +17,7 @@ local scheduling_opts = { real_time = {}, -- Boolean. ingress_drop_monitor = {}, -- Action string: one of 'flush' or 'warn'. busywait = {default=true}, -- Boolean. + enable_xdp = {}, -- Enable Snabb XDP mode (see apps.xdp.xdp). eval = {} -- String. } @@ -42,6 +43,10 @@ function sched_apply.busywait (busywait) engine.busywait = busywait end +function sched_apply.enable_xdp (opt) + if opt then require('apps.xdp.xdp').snabb_enable_xdp(opt) end +end + function sched_apply.eval (str) loadstring(str)() end From 18cc3658f427760c00745c893943b785b85cbe12 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 11 Dec 2019 16:18:17 +0000 Subject: [PATCH 079/209] apps.xdp: kick tx queue if not empty This fixes a bug where packets would not be transmitted unless the tx queue was full. 
--- src/apps/xdp/xdp.lua | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index d33f0d0662..52c9a5606c 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -672,7 +672,7 @@ function XDP:push () if self.kernel_has_ring_flags then if needs_wakeup(tx) then self:kick() end else - if full(tx) then self:kick() end + if not empty(tx) then self:kick() end end end From 19072b4949741782b032f5cf072fee316eb8ba80 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 11 Dec 2019 15:33:29 +0000 Subject: [PATCH 080/209] lib.yang.data: reorder translated members before printing This fixes a bug in which data_printer_from_grammar would not print choice case members when an order is given to body_printer. XXX: it is not clear to me why some invocations of body_printer take orderings, since it seems that the pre-ordering always matches the internal ordering logic of body_printer. \o/ --- src/lib/yang/data.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/lib/yang/data.lua b/src/lib/yang/data.lua index 2d172eb0a7..fbc0d76b5b 100644 --- a/src/lib/yang/data.lua +++ b/src/lib/yang/data.lua @@ -880,6 +880,7 @@ function xpath_printer_from_grammar(production, print_default, root) if translator ~= nil then local statements = translator(keyword, production) for k,v in pairs(statements) do translated[k] = v end + order = nil else translated[keyword] = production end @@ -1130,6 +1131,7 @@ function influxdb_printer_from_grammar(production, print_default, root) if translator ~= nil then local statements = translator(keyword, production) for k,v in pairs(statements) do translated[k] = v end + order = nil else translated[keyword] = production end @@ -1371,6 +1373,7 @@ function data_printer_from_grammar(production, print_default) if translator ~= nil then local statements = translator(keyword, production) for k,v in pairs(statements) do translated[k] = v end + order = nil else translated[keyword] = production end 
From ae4f734fe717cc5ea56208e98e783327bc8c851b Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 12 Dec 2019 14:54:10 +0000 Subject: [PATCH 081/209] snabb config: use consistency checker in command parsing Used to be that configurations read from the command line or stdin bypassed the consistency checker. --- src/program/config/common.lua | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/program/config/common.lua b/src/program/config/common.lua index 067608adf8..9b6b7d513b 100644 --- a/src/program/config/common.lua +++ b/src/program/config/common.lua @@ -10,6 +10,7 @@ local file = require("lib.stream.file") local rpc = require("lib.yang.rpc") local yang = require("lib.yang.yang") local data = require("lib.yang.data") +local path_data = require("lib.yang.path_data") local path_resolver = require("lib.yang.path_data").resolver function show_usage(command, status, err_msg) @@ -38,7 +39,14 @@ end function data_parser(schema_name, path, is_config) local grammar = path_grammar(schema_name, path, is_config) - return data.data_parser_from_grammar(grammar) + local parser = data.data_parser_from_grammar(grammar) + local validator = path_data.consistency_checker_from_grammar(grammar) + return function (data) + local config = parser(data) + validator(config) + print("validated") + return config + end end function config_parser(schema_name, path) From 05b495ea6d33f95e9deae4bcec8e0418d584eb60 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 12 Dec 2019 14:55:31 +0000 Subject: [PATCH 082/209] lib.yang.path_data: expand choices in consistency checker This updates the consistency checkers to expand choices and visit the bodies of all cases. 
--- src/lib/yang/path_data.lua | 95 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 91 insertions(+), 4 deletions(-) diff --git a/src/lib/yang/path_data.lua b/src/lib/yang/path_data.lua index c85bd352f4..6ed6cde7bd 100644 --- a/src/lib/yang/path_data.lua +++ b/src/lib/yang/path_data.lua @@ -562,6 +562,25 @@ local function pairs_from_grammar(grammar) end end +local function expanded_pairs(values) + -- Return an iterator for each non-choice pair in values and each pair of + -- all choice bodies recursively. + local expanded = {} + local function expand(values) + for name, value in pairs(values) do + if value.type == 'choice' then + for _, body in pairs(value.choices) do + expand(body) + end + else + expanded[name] = value + end + end + end + expand(values) + return pairs(expanded) +end + function uniqueness_checker_from_grammar(grammar) -- Generate checker for table local function unique_assertion(leaves, grammar) @@ -593,7 +612,7 @@ function uniqueness_checker_from_grammar(grammar) elseif grammar.type == 'table' then local pairs = pairs_from_grammar(grammar) -- visit values - for name, value in _G.pairs(grammar.values) do + for name, value in expanded_pairs(grammar.values) do for k, datum in pairs(data) do visit_unique_and_check(value, datum[normalize_id(name)]) end @@ -604,7 +623,7 @@ function uniqueness_checker_from_grammar(grammar) end elseif grammar.type == 'struct' then -- visit members - for name, member in pairs(grammar.members) do + for name, member in expanded_pairs(grammar.members) do visit_unique_and_check(member, data[normalize_id(name)]) end end @@ -648,7 +667,7 @@ function minmax_elements_checker_from_grammar(grammar) elseif grammar.type == 'table' then -- visit values local pairs = pairs_from_grammar(grammar) - for name, value in _G.pairs(grammar.values) do + for name, value in expanded_pairs(grammar.values) do for k, datum in pairs(data) do visit_minmax_and_check(value, datum[normalize_id(name)], name) end @@ -657,7 +676,7 @@ function 
minmax_elements_checker_from_grammar(grammar) minmax_assertion(grammar, name)(data) elseif grammar.type == 'struct' then -- visit members - for name, member in pairs(grammar.members) do + for name, member in expanded_pairs(grammar.members) do visit_minmax_and_check(member, data[normalize_id(name)], name) end end @@ -1002,5 +1021,73 @@ function selftest() assert(not success) print(result) + -- Test unique restrictions in choice body: + local choice_unique_schema = schema.load_schema([[module choice-unique-schema { + namespace "urn:ietf:params:xml:ns:yang:choice-unique-schema"; + prefix "test"; + + choice ab { + list unique_test { + key "testkey"; unique "testleaf testleaf2"; + leaf testkey { type string; mandatory true; } + leaf testleaf { type string; mandatory true; } + leaf testleaf2 { type string; mandatory true; } + } + list duplicate_test { + key "testkey"; + leaf testkey { type string; mandatory true; } + leaf testleaf { type string;} + leaf testleaf2 { type string;} + } + } + }]]) + local checker = consistency_checker_from_schema(choice_unique_schema, true) + + -- Test unique validation in choice body (should fail) + local success, result = pcall( + checker, + data.load_config_for_schema(choice_unique_schema, + mem.open_input_string [[ + unique_test { + testkey "foo"; + testleaf "bar"; + testleaf2 "baz"; + } + unique_test { + testkey "foo2"; + testleaf "bar"; + testleaf2 "baz"; + } + ]])) + assert(not success) + + -- Test unique validation in choice body (should succeed) + checker(data.load_config_for_schema(choice_unique_schema, + mem.open_input_string [[ + unique_test { + testkey "foo"; + testleaf "bar"; + testleaf2 "baz"; + } + unique_test { + testkey "foo2"; + testleaf "bar2"; + testleaf2 "baz"; + } + ]])) + + -- Test unique validation in choice body (should succeed) + checker(data.load_config_for_schema(choice_unique_schema, + mem.open_input_string [[ + duplicate_test { + testkey "foo"; + testleaf "bar"; + } + duplicate_test { + testkey "foo2"; + 
testleaf "bar"; + } + ]])) + print("selftest: ok") end From e96ae1a71bd5dadd74534929d23dc5fdbea98855 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 12 Dec 2019 15:25:15 +0000 Subject: [PATCH 083/209] lib.yang.data: rework 19072b494 (remove order parameter) add tests This removes the order parameter in the affected functions altogether. Test suite passes so I am assuming this was dead code from the past. --- src/lib/yang/data.lua | 82 +++++++++++++++++++------------------------ 1 file changed, 36 insertions(+), 46 deletions(-) diff --git a/src/lib/yang/data.lua b/src/lib/yang/data.lua index fbc0d76b5b..735174d6c0 100644 --- a/src/lib/yang/data.lua +++ b/src/lib/yang/data.lua @@ -870,7 +870,7 @@ function xpath_printer_from_grammar(production, print_default, root) print_yang_string(k, file) file:write(' ') end - local function body_printer(productions, order) + local function body_printer(productions) -- Iterate over productions trying to translate to other statements. This -- is used for example in choice statements raising the lower statements -- in case blocks up to the level of the choice, in place of the choice. 
@@ -880,17 +880,14 @@ function xpath_printer_from_grammar(production, print_default, root) if translator ~= nil then local statements = translator(keyword, production) for k,v in pairs(statements) do translated[k] = v end - order = nil else translated[keyword] = production end end productions = translated - if not order then - order = {} - for k,_ in pairs(productions) do table.insert(order, k) end - table.sort(order) - end + local order = {} + for k,_ in pairs(productions) do table.insert(order, k) end + table.sort(order) local printers = {} for keyword,production in pairs(productions) do local printer = printer(keyword, production, printers) @@ -905,8 +902,8 @@ function xpath_printer_from_grammar(production, print_default, root) end end end - local function key_composer (productions, order) - local printer = body_printer(productions, order) + local function key_composer (productions) + local printer = body_printer(productions) local file = {t={}} function file:write (str) str = str:match("([^%s]+)") @@ -959,13 +956,8 @@ function xpath_printer_from_grammar(production, print_default, root) -- As a special case, the table handler allows the keyword to be nil, -- for printing tables at the top level without keywords. 
function handlers.table(keyword, production) - local key_order, value_order = {}, {} - for k,_ in pairs(production.keys) do table.insert(key_order, k) end - for k,_ in pairs(production.values) do table.insert(value_order, k) end - table.sort(key_order) - table.sort(value_order) - local compose_key = key_composer(production.keys, key_order) - local print_value = body_printer(production.values, value_order) + local compose_key = key_composer(production.keys) + local print_value = body_printer(production.values) if production.key_ctype and production.value_ctype then return function(data, file, path) path = path or '' @@ -1121,7 +1113,7 @@ function influxdb_printer_from_grammar(production, print_default, root) file:write(file.is_tag and value or ' value='..value) file:write('\n') end - local function body_printer(productions, order) + local function body_printer(productions) -- Iterate over productions trying to translate to other statements. This -- is used for example in choice statements raising the lower statements -- in case blocks up to the level of the choice, in place of the choice. 
@@ -1131,17 +1123,14 @@ function influxdb_printer_from_grammar(production, print_default, root) if translator ~= nil then local statements = translator(keyword, production) for k,v in pairs(statements) do translated[k] = v end - order = nil else translated[keyword] = production end end productions = translated - if not order then - order = {} - for k,_ in pairs(productions) do table.insert(order, k) end - table.sort(order) - end + local order = {} + for k,_ in pairs(productions) do table.insert(order, k) end + table.sort(order) local printers = {} for keyword,production in pairs(productions) do local printer = printer(keyword, production, printers) @@ -1161,8 +1150,8 @@ function influxdb_printer_from_grammar(production, print_default, root) :gsub(',', '\\,') :gsub(' ', '\\ ') end - local function key_composer (productions, order) - local printer = body_printer(productions, order) + local function key_composer (productions) + local printer = body_printer(productions) local file = {t={}, is_tag=true} function file:write (str) str = str:match("([^%s]+)") @@ -1226,14 +1215,9 @@ function influxdb_printer_from_grammar(production, print_default, root) -- As a special case, the table handler allows the keyword to be nil, -- for printing tables at the top level without keywords. 
function handlers.table(keyword, production) - local key_order, value_order = {}, {} - for k,_ in pairs(production.keys) do table.insert(key_order, k) end - for k,_ in pairs(production.values) do table.insert(value_order, k) end - table.sort(key_order) - table.sort(value_order) local is_key_unique = is_key_unique(production) - local compose_key = key_composer(production.keys, key_order) - local print_value = body_printer(production.values, value_order) + local compose_key = key_composer(production.keys) + local print_value = body_printer(production.values) if production.key_ctype and production.value_ctype then return function(data, file, path) path = path or '' @@ -1363,7 +1347,7 @@ function data_printer_from_grammar(production, print_default) print_yang_string(k, file) file:write(' ') end - local function body_printer(productions, order) + local function body_printer(productions) -- Iterate over productions trying to translate to other statements. This -- is used for example in choice statements raising the lower statements -- in case blocks up to the level of the choice, in place of the choice. @@ -1373,17 +1357,14 @@ function data_printer_from_grammar(production, print_default) if translator ~= nil then local statements = translator(keyword, production) for k,v in pairs(statements) do translated[k] = v end - order = nil else translated[keyword] = production end end productions = translated - if not order then - order = {} - for k,_ in pairs(productions) do table.insert(order, k) end - table.sort(order) - end + local order = {} + for k,_ in pairs(productions) do table.insert(order, k) end + table.sort(order) local printers = {} for keyword,production in pairs(productions) do local printer = printer(keyword, production, printers) @@ -1429,13 +1410,8 @@ function data_printer_from_grammar(production, print_default) -- As a special case, the table handler allows the keyword to be nil, -- for printing tables at the top level without keywords. 
function handlers.table(keyword, production) - local key_order, value_order = {}, {} - for k,_ in pairs(production.keys) do table.insert(key_order, k) end - for k,_ in pairs(production.values) do table.insert(value_order, k) end - table.sort(key_order) - table.sort(value_order) - local print_key = body_printer(production.keys, key_order) - local print_value = body_printer(production.values, value_order) + local print_key = body_printer(production.keys) + local print_value = body_printer(production.values) if production.key_ctype and production.value_ctype then return function(data, file, indent) for entry in data:iterate() do @@ -1764,6 +1740,15 @@ function selftest() description "Address prefixes bound to this interface."; } + + list choices { + key id; + leaf id { type string; } + choice choice { + leaf red { type string; } + leaf blue { type string; } + } + } }]]) local data = load_config_for_schema(test_schema, @@ -1777,6 +1762,9 @@ function selftest() } addr 1.2.3.4; address 1.2.3.4/24; + choices { id "one"; blue "hey"; } + choices { id "two"; red "bye"; } + ]]) for i =1,2 do assert(data.fruit_bowl.description == 'ohai') @@ -1789,6 +1777,8 @@ function selftest() assert(contents.baz.score == 9) assert(contents.baz.tree_grown == true) assert(data.addr == util.ipv4_pton('1.2.3.4')) + assert(data.choices.one.blue == "hey") + assert(data.choices.two.red == "bye") local stream = mem.tmpfile() print_config_for_schema(test_schema, data, stream) From 95d3a741ecf2595cb0ed29112ab90771fc5f0350 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 30 Jan 2020 15:49:47 +0100 Subject: [PATCH 084/209] lib.yang: remove leftover debug print --- src/program/config/common.lua | 1 - 1 file changed, 1 deletion(-) diff --git a/src/program/config/common.lua b/src/program/config/common.lua index 9b6b7d513b..689c7d6359 100644 --- a/src/program/config/common.lua +++ b/src/program/config/common.lua @@ -44,7 +44,6 @@ function data_parser(schema_name, path, is_config) return function 
(data) local config = parser(data) validator(config) - print("validated") return config end end From 3fdf926dbd1fb972440b30c5ac9a85ff9b2154d9 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 30 Jan 2020 16:44:19 +0100 Subject: [PATCH 085/209] pflua: fix typo bug in native codegen (xor/or emission) --- lib/pflua/src/pf/codegen.dasl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/pflua/src/pf/codegen.dasl b/lib/pflua/src/pf/codegen.dasl index 7359d8ed9b..8a2b2f194d 100644 --- a/lib/pflua/src/pf/codegen.dasl +++ b/lib/pflua/src/pf/codegen.dasl @@ -299,7 +299,7 @@ local function compile(instructions, alloc, dump) local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] | or Rq(reg1), Rq(reg2) - elseif itype == "or" then + elseif itype == "xor" then local reg1, reg2 = alloc[instr[2]], alloc[instr[3]] | xor Rq(reg1), Rq(reg2) From 07acce271f3c66e328f5aa8e858f4b358a4b5dab Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 30 Jan 2020 16:45:18 +0100 Subject: [PATCH 086/209] pflua: make regalloc architecture independent Default to x86_64 registers, but allow passing of alternative register set specifications. 
--- lib/pflua/src/pf/regalloc.lua | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/lib/pflua/src/pf/regalloc.lua b/lib/pflua/src/pf/regalloc.lua index 8696b7a5df..52edfcdf17 100644 --- a/lib/pflua/src/pf/regalloc.lua +++ b/lib/pflua/src/pf/regalloc.lua @@ -129,11 +129,6 @@ local function live_intervals(instrs) return order end --- All available registers, tied to unix x64 ABI -local caller_regs = {11, 10, 9, 8, 6, 2, 1, 0} -local callee_regs = {15, 14, 13, 12, 3} -local num_regs = #caller_regs + #callee_regs - -- Check if a register is free in the freelist local function is_free(seq, reg) for _, val in ipairs(seq) do @@ -182,23 +177,31 @@ local function delete_useless_movs(ir, alloc) end end +-- All available registers, tied to unix x64 ABI +x86_regs = { + caller_regs = {11, 10, 9, 8, 6, 2, 1, 0}, + callee_regs = {15, 14, 13, 12, 3}, + len = 6 -- %rsi +} + -- Do register allocation with the given IR -- Returns a register allocation and potentially mutates -- the ir for optimizations -function allocate(ir) +function allocate(ir, regs) + regs = regs or x86_regs local intervals = live_intervals(ir) local active = {} local next_spill = 0 -- caller-save registers, use these first - local free_caller = utils.dup(caller_regs) + local free_caller = utils.dup(regs.caller_regs) -- callee-save registers, if we have to - local free_callee = utils.dup(callee_regs) + local free_callee = utils.dup(regs.callee_regs) - local allocation = { len = 6, -- %rsi + local allocation = { len = regs.len, callee_saves = {}, spills = {} } - remove_free(free_caller, 6) + remove_free(free_caller, allocation.len) local function expire_old(interval) local to_expire = {} @@ -213,9 +216,9 @@ function allocate(ir) table.insert(to_expire, idx) -- figure out which free list this register is supposed to be on - if is_free(caller_regs, reg) then + if is_free(regs.caller_regs, reg) then table.insert(free_caller, reg) - elseif is_free(callee_regs, reg) then + 
elseif is_free(regs.callee_regs, reg) then table.insert(free_callee, reg) else error("unknown register") From cd4d773816efeac383feedb483f06a181b70c207 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 30 Jan 2020 16:46:40 +0100 Subject: [PATCH 087/209] apps.xdp.bpf: add basic disassembler, endiannes opcodes --- src/apps/xdp/bpf.lua | 134 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 126 insertions(+), 8 deletions(-) diff --git a/src/apps/xdp/bpf.lua b/src/apps/xdp/bpf.lua index 44b4d68059..34107a9dda 100644 --- a/src/apps/xdp/bpf.lua +++ b/src/apps/xdp/bpf.lua @@ -3,7 +3,7 @@ module(...,package.seeall) local ffi = require("ffi") -local bor = bit.bor +local band, bor = bit.band, bit.bor -- BPF: just enough eBPF to assemble trivial XDP programs. -- @@ -33,15 +33,16 @@ c = { -- Op class ALU = 0x04, JMP = 0x05, RET = 0x06, - MISC = 0x07, - ALU64 = 0x07 -- alu mode in double word width + ALU64 = 0x07, -- alu mode in double word width + mask = 0x07 } f = { -- Load/store width W = 0x00, -- 32-bit H = 0x08, -- 16-bit B = 0x10, -- 8-bit - DW = 0x18 -- 64-bit + DW = 0x18, -- 64-bit + mask = 0x18 } m = { -- Op mode @@ -51,7 +52,8 @@ m = { -- Op mode MEM = 0x60, LEN = 0x80, MSH = 0xa0, - XADD = 0xc0 -- exclusive add + XADD = 0xc0, -- exclusive add + mask = 0xe0 } a = { -- ALU mode @@ -66,13 +68,18 @@ a = { -- ALU mode NEG = 0x80, MOD = 0x90, XOR = 0xa0, - MOV = 0xb0 + MOV = 0xb0, + END = 0xd0, -- Endianness conversion: + LE = 0x00, -- * to little endian + BE = 0x08, -- * to big endian + mask = 0xf0 } s = { -- Src mode K = 0x00, X = 0x08, - MAP_FD = 0x01 + MAP_FD = 0x01, + mask = 0x08 } j = { -- JMP mode @@ -89,7 +96,8 @@ j = { -- JMP mode JSLT = 0xc0, JSLE = 0xd0, CALL = 0x80, - EXIT = 0x90 + EXIT = 0x90, + mask = 0xf0 } fn = { -- Built-in helpers @@ -180,3 +188,113 @@ fn = { -- Built-in helpers } function asm (insn) return ffi.typeof("$[?]", ins)(#insn, insn) end + +function dis (insn) + local pc = 0 + local function which (v, typ) + return band(v, 
typ.mask) + end + local function name (x, typ) + for k, v in pairs(typ) do + if k ~= "mask" and x == v then + return k + end + end + end + local function dis_ins (ins) + local str = "" + -- Class + local class = which(ins.op, c) + str = str..name(class, c) + if class <= c.STX then + -- Load/store + local width = which(ins.op, f) + str = str.." "..name(width, f) + local mode = which(ins.op, m) + --str = str.." "..name(mode, m) + str = str..("\tr%d"):format(ins.dst) + if class > c.LDX then + -- Store offset. + str = str..("+%d"):format(ins.off) + end + if mode == m.IMM then + str = str..(" %d %s"):format(ins.imm, name(ins.src, s)) + else + str = str..(" r%d"):format(ins.src) + if class <= c.LDX then + -- Load offset. + str = str..("+%d"):format(ins.off) + end + end + if mode == m.ABS then + str = str..("+%d"):format(ins.imm) + end + elseif class == c.ALU or class == c.ALU64 then + -- ALU + local alu = which(ins.op, a) + str = str.." "..name(alu, a) + local src = which(ins.op, s) + str = str..("\tr%d"):format(ins.dst) + if src == s.K then + -- Immediate operand + str = str..(" %d"):format(ins.imm) + else + -- Register operand + str = str..(" r%d"):format(ins.src) + end + elseif class == c.JMP then + -- Jump + local jmp = which(ins.op, j) + str = str.." "..name(jmp, j) + if jmp == j.EXIT then + elseif jmp == j.CALL then + -- Call + str = str.."\t"..(name(ins.imm, fn) or ("%x"):format(ins.imm)) + else + -- Relative jump + str = str.."\t" + if jmp > j.JA then + -- Conditional + str = str..("r%d"):format(ins.dst) + if which(ins.op, s) == s.K then + -- Immediate operand + str = str..(" %d"):format(ins.imm) + else + -- Register operand + str = str..(" r%d"):format(ins.src) + end + end + str = str..("\t=> %d"):format(pc + 1 + ins.off) + end + else + -- Return + local mode = which(ins.op, m) + if mode == m.IMM then + str = str.." 
"..name(mode, m) + str = str..("\t%d"):format(ins.imm) + end + end + return str + end + while pc < ffi.sizeof(insn) / ffi.sizeof(ins) do + print(pc, dis_ins(insn[pc])) + pc = pc + 1 + end +end + +function selftest () + local insns = asm{ + -- r3 = XDP_ABORTED + { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, + -- r2 = ((struct xdp_md *)ctx)->rx_queue_index + { op=bor(c.LDX, f.W, m.MEM), dst=2, src=1, off=16 }, + -- r1 = xskmap + { op=bor(c.LD, f.DW, m.IMM), dst=1, src=s.MAP_FD, imm=4 }, + { imm=0 }, -- nb: upper 32 bits of 64-bit (DW) immediate + -- r0 = redirect_map(r1, r2, r3) + { op=bor(c.JMP, j.CALL), imm=fn.redirect_map }, + -- EXIT: + { op=bor(c.JMP, j.EXIT) } + } + dis(insns) +end From 48ff71cd929a9eee365f23f434b25a12c9adc064 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 30 Jan 2020 16:47:59 +0100 Subject: [PATCH 088/209] apps.xdp.pf_ebpf_codegen: add eBPF backend for pflua --- src/apps/xdp/pf_ebpf_codegen.lua | 373 +++++++++++++++++++++++++++++++ 1 file changed, 373 insertions(+) create mode 100644 src/apps/xdp/pf_ebpf_codegen.lua diff --git a/src/apps/xdp/pf_ebpf_codegen.lua b/src/apps/xdp/pf_ebpf_codegen.lua new file mode 100644 index 0000000000..6db10c2465 --- /dev/null +++ b/src/apps/xdp/pf_ebpf_codegen.lua @@ -0,0 +1,373 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +-- This module implements code generation for the XDP/eBPF backend of +-- Pflua. It takes the result of instruction selection (selection.lua) +-- and register allocation (regalloc.lua) and generates a function with +-- eBPF bytecode. 
+ +local parse = require('pf.parse').parse +local expand = require('pf.expand').expand +local optimize = require('pf.optimize').optimize +local anf = require('pf.anf') +local ssa = require('pf.ssa') +local sel = require("pf.selection") +local ra = require("pf.regalloc") +local bpf = require("apps.xdp.bpf") + +local c, f, m, a, s, j = bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j + +local tobit, band, bor, rshift = bit.tobit, bit.band, bit.bor, bit.rshift + +-- eBPF register allocation: +-- * mark r1 callee save: holds the xdp_md context we wish to preserve +-- * omit r0: we will keep a pointer to the packet payload in here +-- * omit r2: we will use this register to perform length checks +-- * use r3 as len: we will store data_end here (used in length checks) +local ebpf_regs = { + caller_regs = { 9, 8, 7, 6, 5, 4, 3 }, + callee_regs = { 1 }, + len = 3 +} + +-- Generate an eBPF XDP program that will return XDP_PASS unless filter expr +-- matches, and otherwise "fall-through" as to allow execution of a further eBPF +-- program that is to be appended. 
+--
+-- ir:    array of selected instructions; instr[1] is the instruction type,
+--        the remaining slots are its operands.
+-- alloc: register allocation; maps variable names to register numbers and
+--        carries the fields callee_saves, spills and len.
+-- Returns an array of instruction tables (fields op, dst, src, off, imm).
+function codegen (ir, alloc)
+   -- push callee-save registers if we use any
+   local to_pop = {}
+   for reg, _ in pairs(alloc.callee_saves) do
+      error("NYI: callee saves")
+      -- we need to record the order in which to pop
+      -- b/c while the push order doesn't matter, the
+      -- pop order must be reverse (and callee_saves
+      -- is an unordered set)
+      table.insert(to_pop, reg)
+   end
+
+   -- in bytes
+   local stack_slot_size = 8
+
+   -- allocate space for all spilled vars
+   local spilled_space = 0
+   for _, _ in pairs(alloc.spills) do
+      spilled_space = spilled_space + stack_slot_size
+   end
+   if spilled_space > 0 then
+      error("NYI: spilled space")
+   end
+
+   -- if the length variable got spilled, we need to explicitly initialize
+   -- the stack slot for it
+   if alloc.spills["len"] then
+      error("NYI: spilled length")
+   end
+
+   -- tr collects the emitted instructions, pc is the next free slot.
+   local pc, tr = 1, {}
+   local function emit (ins)
+      tr[pc] = ins
+      pc = pc+1
+   end
+
+   -- labels maps a label key to the pc it refers to. Keys 0 and 1 are
+   -- reserved for true-label and false-label; IR label n uses key
+   -- label_offset+n. Until the final fixup, jump instructions carry the
+   -- label key (not a relative offset) in their off field.
+   local label_offset, labels = 2, {}
+
+   -- A "cmp" only records its operands here; the following "cjmp" turns
+   -- the pending comparison into a conditional jump instruction.
+   local cmp
+   local function emit_cjmp (cond, target)
+      assert(cmp, "cjmp needs preceeding cmp")
+      local jmp = cmp; cmp = nil
+      jmp.op = bor(c.JMP, cond, jmp.op)
+      if target == "true-label" then
+         jmp.off = 0
+      elseif target == "false-label" then
+         jmp.off = 1
+      else
+         jmp.off = label_offset+target
+      end
+      emit(jmp)
+   end
+
+   -- Setup: move data start and end pointers into r0 and r(alloc.len)
+   -- r0 = ((struct xdp_md *)ctx)->data
+   emit{ op=bor(c.LDX, f.W, m.MEM), dst=0, src=1, off=0 }
+   -- r(alloc.len) = ((struct xdp_md *)ctx)->data_end
+   emit{ op=bor(c.LDX, f.W, m.MEM), dst=alloc.len, src=1, off=4 }
+
+   for idx, instr in ipairs(ir) do
+      local itype = instr[1]
+
+      --- FIXME: handle spills
+
+      -- the core code generation logic starts here
+      if itype == "label" then
+         local lnum = instr[2]
+         labels[label_offset+lnum] = pc
+
+      elseif itype == "cjmp" then
+         local op, target = instr[2], instr[3]
+
+         if op == "=" then
+            emit_cjmp(j.JEQ, target)
+         elseif op == "!=" then
+            emit_cjmp(j.JNE, target)
+         elseif op == ">=" then
+            emit_cjmp(j.JGE, target)
+         elseif op == "<=" then
+            emit_cjmp(j.JLE, target)
+         elseif op == ">" then
+            emit_cjmp(j.JGT, target)
+         elseif op == "<" then
+            emit_cjmp(j.JLT, target)
+         end
+
+      elseif itype == "jmp" then
+         local next_instr = ir[idx+1]
+         -- if the jump target is immediately after this in the instruction
+         -- sequence then don't generate the jump
+         if (type(instr[2]) == "number" and
+             next_instr[1] == "label" and
+             next_instr[2] == instr[2]) then
+            -- don't output anything
+         else
+            if instr[2] == "true-label" then
+               if next_instr[1] ~= "ret-true" then
+                  emit{ op=bor(c.JMP, j.JA), off=0 }
+               end
+            elseif instr[2] == "false-label" then
+               if next_instr[1] ~= "ret-false" then
+                  emit{ op=bor(c.JMP, j.JA), off=1 }
+               end
+            else
+               emit{ op=bor(c.JMP, j.JA), off=label_offset+instr[2] }
+            end
+         end
+
+      elseif itype == "cmp" and instr[2] == "len" then
+         local lhs_reg = alloc.len
+         local rhs = instr[3]
+         assert(rhs ~= "len", "NYI: cmp with rhs len")
+
+         -- Perform eBPF friendly length check.
+         -- mov r2, r0
+         emit{ op=bor(c.ALU64, a.MOV, s.X), dst=2, src=0 }
+         -- add r2, rhs
+         if type(rhs) == "number" then
+            emit{ op=bor(c.ALU64, a.ADD, s.K), dst=2, imm=rhs }
+         else
+            emit{ op=bor(c.ALU64, a.ADD, s.X), dst=2, src=alloc[rhs] }
+         end
+         -- cmp r(alloc.len), r2
+         cmp = { op=s.X, dst=lhs_reg, src=2 }
+
+      elseif itype == "cmp" then
+         -- the lhs should never be an immediate so this should be non-nil
+         local lhs_reg = assert(alloc[instr[2]])
+         local rhs = instr[3]
+         assert(rhs ~= "len", "NYI: cmp with rhs len")
+
+         if type(rhs) == "number" then
+            cmp = { op=s.K, dst=lhs_reg, imm=rhs }
+         else
+            local rhs_reg = alloc[rhs]
+            cmp = { op=s.X, dst=lhs_reg, src=rhs_reg }
+         end
+
+      elseif itype == "load" then
+         local target = alloc[instr[2]]
+         assert(not alloc.spills[instr[2]], "NYI: load spill")
+         local offset = instr[3]
+         local bytes = instr[4]
+
+         -- NOTE(review): src is left unset in the emits below; presumably it
+         -- defaults to 0, i.e. r0 (the packet data pointer set up above) --
+         -- confirm against the bpf.ins definition.
+         if type(offset) == "number" then
+            if bytes == 1 then
+               emit{ op=bor(c.LDX, f.B, m.MEM), dst=target, off=offset }
+            elseif bytes == 2 then
+               emit{ op=bor(c.LDX, f.H, m.MEM), dst=target, off=offset }
+            else
+               emit{ op=bor(c.LDX, f.W, m.MEM), dst=target, off=offset }
+            end
+         else
+            local reg = alloc[offset]
+            assert(not alloc.spills[offset], "NYI: load spill")
+
+            -- Temporarily turn the offset register into an address (add),
+            -- load through it, then restore the offset (sub).
+            emit{ op=bor(c.ALU64, a.ADD, s.X), dst=reg }
+            if bytes == 1 then
+               emit{ op=bor(c.LDX, f.B, m.MEM), dst=target, src=reg }
+            elseif bytes == 2 then
+               emit{ op=bor(c.LDX, f.H, m.MEM), dst=target, src=reg }
+            else
+               emit{ op=bor(c.LDX, f.W, m.MEM), dst=target, src=reg }
+            end
+            emit{ op=bor(c.ALU64, a.SUB, s.X), dst=reg }
+         end
+
+      elseif itype == "mov" then
+         local dst = alloc[instr[2]]
+         assert(not alloc.spills[instr[2]], "NYI: mov spill")
+         local arg = instr[3]
+
+         if type(arg) == "number" then
+            emit{ op=bor(c.ALU, a.MOV, s.K), dst=dst, imm=arg }
+         else
+            assert(not alloc.spills[arg], "NYI: mov spill")
+            emit{ op=bor(c.ALU64, a.MOV, s.X), dst=dst, src=alloc[arg] }
+         end
+
+      elseif itype == "mov64" then
+         local dst = alloc[instr[2]]
+         local imm = instr[3]
+         -- A 64-bit immediate load takes two instruction slots: the first
+         -- carries the lower 32 bits, the second (op-less) slot the upper
+         -- 32 bits. bit.rshift on a Lua number only honors the lower five
+         -- bits of the shift count, so rshift(imm, 32) would return the
+         -- lower word again; divide instead.
+         local upper
+         if type(imm) == "number" then
+            upper = math.floor(imm / 2^32)
+         else
+            upper = rshift(imm, 32)
+         end
+         emit{ op=bor(c.LD, f.DW, m.IMM), dst=dst, src=s.K, imm=tobit(imm) }
+         emit{ imm=upper }
+
+      elseif itype == "add" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.ADD, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "sub" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.SUB, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "mul" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.MUL, s.X), dst=reg1, src=reg2 }
+
+      -- Division maps directly onto the ALU64 DIV instruction (register
+      -- operands); no dedicated register needs to be set up for it.
+      elseif itype == "div" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.DIV, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "and" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.AND, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "or" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.OR, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "xor" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.XOR, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "shl" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.LSH, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "shr" then
+         local reg1, reg2 = alloc[instr[2]], alloc[instr[3]]
+         emit{ op=bor(c.ALU64, a.RSH, s.X), dst=reg1, src=reg2 }
+
+      elseif itype == "add-i" then
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU64, a.ADD, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "sub-i" then
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU64, a.SUB, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "mul-i" then
+         -- (was bound to `r' while `reg' was used, leaving dst unset)
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU64, a.MUL, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "and-i" then
+         local reg = alloc[instr[2]]
+         assert(type(reg) == "number")
+         assert(type(instr[3]) == "number")
+         emit{ op=bor(c.ALU64, a.AND, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "or-i" then
+         local reg = alloc[instr[2]]
+         assert(type(reg) == "number")
+         assert(type(instr[3]) == "number")
+         emit{ op=bor(c.ALU64, a.OR, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "xor-i" then
+         local reg = alloc[instr[2]]
+         assert(type(reg) == "number")
+         assert(type(instr[3]) == "number")
+         emit{ op=bor(c.ALU64, a.XOR, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "shl-i" then
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU64, a.LSH, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "shr-i" then
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU64, a.RSH, s.K), dst=reg, imm=instr[3] }
+
+      elseif itype == "ntohs" then
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU, a.END, a.BE), dst=reg, imm=16 }
+
+      elseif itype == "ntohl" then
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU, a.END, a.BE), dst=reg, imm=32 }
+
+      elseif itype == "uint32" then
+         -- 32-bit AND of the register with itself.
+         -- NOTE(review): presumably relied upon to clear the upper 32 bits
+         -- of the register -- confirm against the eBPF ISA.
+         local reg = alloc[instr[2]]
+         emit{ op=bor(c.ALU, a.AND, s.X), dst=reg, src=reg }
+
+      elseif itype == "ret-true" then
+         labels[0] = pc
+         -- In the end, we will turn this into a jump to the first instruction
+         -- beyond the end of the emitted sequence.
+         emit{ op=bor(c.JMP, j.JA) }
+
+      elseif itype == "ret-false" then
+         labels[1] = pc
+         -- r0 = XDP_PASS
+         emit{ op=bor(c.ALU, a.MOV, s.K), dst=0, imm=2 }
+         -- EXIT:
+         emit{ op=bor(c.JMP, j.EXIT) }
+
+      elseif itype == "nop" then
+         -- don't output anything
+
+      else
+         error(string.format("NYI instruction %s", itype))
+      end
+   end
+
+   -- Fixup true-label
+   local true_label = labels[0]
+   if true_label == #tr then
+      -- True-label is last instruction: remove its target instruction
+      tr[true_label] = nil
+   elseif true_label then
+      -- Set the jump offset to the first ins. beyond the emitted sequence
+      tr[true_label].off = #tr - true_label
+   end
+
+   -- Fixup jump offsets: translate label keys into relative offsets.
+   -- Skipped are
+   --  * the true-label jump, which already holds a relative offset (it
+   --    must not be looked up in labels a second time), and
+   --  * op-less slots (upper half of a 64-bit immediate load).
+   -- The class is extracted with c.mask so that ALU64 (0x07) instructions
+   -- are not mistaken for JMP (0x05).
+   for pc, ins in ipairs(tr) do
+      if pc ~= true_label and ins.op
+         and band(ins.op, c.mask) == c.JMP and ins.off then
+         ins.off = labels[ins.off] - (pc+1)
+      end
+   end
+
+   return tr
+end
+
+-- Compile pcap-filter expression `filter' to eBPF: parse, expand (for link
+-- type "EN10MB"), optimize, convert to ANF and SSA, select instructions,
+-- allocate registers from ebpf_regs and generate code. If `dump' is set,
+-- print the allocation, the IR, the filter and a disassembly of the result.
+-- Returns the instruction array produced by codegen.
+function compile(filter, dump)
+   local expr = optimize(expand(parse(filter), "EN10MB"))
+   local ssa = ssa.convert_ssa(anf.convert_anf(expr))
+   local ir = sel.select(ssa)
+   local alloc = ra.allocate(ir, ebpf_regs)
+   local code = codegen(ir, alloc)
+   if dump then
+      require("core.lib").print_object(alloc)
+      require("core.lib").print_object(ir)
+      print(filter)
+      bpf.dis(bpf.asm(code))
+   end
+   return code
+end
+
+function selftest()
+   compile("ip proto esp or ip proto 99 or arp", "dump")
+   compile("ip6[6] = 50 or ip6[6] = 99 or "..
+ "(ip6[6] = 58 and (ip6[40] = 135 or ip6[40] = 136))", + "dump") + compile("1 = 2", + "dump") +end From c158ac824638be31ab3231a800a08c8ec19b592f Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 30 Jan 2020 16:51:14 +0100 Subject: [PATCH 089/209] apps.xdp: add filter option (offload non-matching packets to kernel) --- src/apps/xdp/README.md | 6 +++ src/apps/xdp/xdp.lua | 91 +++++++++++++++++++++++++++++++++--------- 2 files changed, 79 insertions(+), 18 deletions(-) diff --git a/src/apps/xdp/README.md b/src/apps/xdp/README.md index e8335e18ae..76af18e5d2 100644 --- a/src/apps/xdp/README.md +++ b/src/apps/xdp/README.md @@ -39,6 +39,12 @@ Due to a combination of how Snabb uses packet buffers and a limitation of *Required*. The name of the interface as shown in `ip link`. +— Key **filter** + +*Optional*. A `pcap-filter(7)` expression. If given, packets that do not match +the filter will we passed on to the host networking stack. Must be the same for +all instances of the XDP app on a given interface! + — Key **queue** *Optional*. Queue to bind to (zero based). The default is queue 0. diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 52c9a5606c..7d34a2961d 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -5,6 +5,7 @@ module(...,package.seeall) local S = require("syscall") local ffi = require("ffi") local bpf = require("apps.xdp.bpf") +local pf = require("apps.xdp.pf_ebpf_codegen") local lib = require("core.lib") local bits = lib.bits local band, bor, rshift, tobit = bit.band, bit.bor, bit.rshift, bit.tobit @@ -349,6 +350,7 @@ end XDP = { config = { ifname = {required=true}, -- interface name + filter = {}, -- interface pcap-filter(7) (optional) queue = {default=0} -- interface queue (zero based) }, -- Class variables: @@ -364,7 +366,7 @@ driver = XDP function XDP:new (conf) assert(snabb_xdp_enabled, "Snabb XDP mode must be enabled.") -- Ensure interface is initialized for XDP usage. 
- local lockfd, mapfd = self:open_interface(conf.ifname) + local lockfd, mapfd = self:open_interface(conf.ifname, conf.filter) -- Create XDP socket (xsk) for queue. local xsk = self:create_xsk(conf.ifname, lockfd, conf.queue) -- Attach the socket to queue in the BPF map. @@ -374,7 +376,7 @@ function XDP:new (conf) return setmetatable(xsk, {__index=XDP}) end -function XDP:open_interface (ifname) +function XDP:open_interface (ifname, filter) -- Open an interface-dependent file we know should exist to use as a -- Snabb-wide lock. The contents of the file are really irrelevant here. -- However, we depend on the file not being locked by other applications in @@ -394,7 +396,7 @@ function XDP:open_interface (ifname) S.mkdir("/sys/fs/bpf/snabb/"..ifname, "rwxu, rgrp, xgrp, roth, xoth") -- Create xskmap and XDP program to run on the NIC. mapfd = self:create_xskmap() - progfd = self:xdp_prog(mapfd) + progfd = self:xdp_prog(mapfd, filter) self:set_link_xdp(ifname, progfd) -- Pin xskmap so it can be accessed by other Snabb processes to attach to -- the interface. Also pin the XDP program, just 'cause. @@ -434,13 +436,34 @@ function XDP:create_xskmap () error("Failed to create BPF map: "..tostring(err)) end -function XDP:xdp_prog (xskmap) +function XDP:xdp_prog (xskmap, filter) -- Assemble and load XDP BPF program. + -- If we have a filter argument, compile a filter that passes non-matching + -- packets on to the kernel networking stack (XDP_PASS). Append to it our + -- regular XSK forwarding code (XDP:xdp_forward) so packets that pass + -- the filter are forwarded to attached XDP sockets. + local flt = (filter and pf.compile(filter)) or {} + for _, ins in ipairs(self:xdp_forward(xskmap)) do + -- Append forwarding logic to filter. 
+ table.insert(flt, ins) + end + local asm = bpf.asm(flt) + local prog, err, log = S.bpf_prog_load( + 'xdp', asm, ffi.sizeof(asm) / ffi.sizeof(bpf.ins), "Apache 2.0" + ) + if prog then + return prog + else + error(tostring(err).."\n"..log) + end +end + +function XDP:xdp_forward (xskmap) local c, f, m, a, s, j, fn = bpf.c, bpf.f, bpf.m, bpf.a, bpf.s, bpf.j, bpf.fn -- The program below looks up the incoming packet's queue index in xskmap to -- find the corresponding XDP socket (xsk) to deliver the packet to. - local insns = bpf.asm{ + return { -- r3 = XDP_ABORTED { op=bor(c.ALU, a.MOV, s.K), dst=3, imm=0 }, -- r2 = ((struct xdp_md *)ctx)->rx_queue_index @@ -453,17 +476,9 @@ function XDP:xdp_prog (xskmap) -- EXIT: { op=bor(c.JMP, j.EXIT) } } - local prog, err, log = S.bpf_prog_load( - 'xdp', insns, ffi.sizeof(insns) / ffi.sizeof(bpf.ins), "Apache 2.0" - ) - if prog then - return prog - else - error(tostring(err).."\n"..log) - end end -function XDP:set_link_xdp(ifname, prog) +function XDP:set_link_xdp (ifname, prog) -- Open a NETLINK socket, and transmit command that attaches XDP program -- prog to link by ifname. 
local netlink = assert(S.socket('netlink', 'raw', 'route')) @@ -748,6 +763,8 @@ function selftest () selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) print("test: rxtx_match") selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + print("test: rxtx_match_filter") + selftest_rxtx_match_filter(xdpdeva, xdpmaca, xdpdevb, xdpmacb) if nqueues > 1 then print("test: share_interface") selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) @@ -766,6 +783,7 @@ local function random_v4_packets (conf) for _=1,100 do local ip = ipv4:new{src=lib.random_bytes(4), dst=lib.random_bytes(4)} + if conf.protocol then ip:protocol(conf.protocol) end ip:total_length(size - eth:sizeof()) local payload_length = ip:total_length() - ip:sizeof() local p = packet.allocate() @@ -796,10 +814,10 @@ function selftest_rxtx (xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) ifname = xdpdeva, queue = queue }) - config.app(c, queue_b, XDP, { - ifname = xdpdevb, - queue = queue - }) + config.app(c, queue_b, XDP, { + ifname = xdpdevb, + queue = queue + }) config.link(c, "source.output"..queue.." 
-> "..queue_a..".input") config.link(c, queue_b..".output -> sink.input"..queue) end @@ -901,6 +919,43 @@ function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb) assert(#engine.app_table.match:errors() == 0, "Match errors.") end +function selftest_rxtx_match_filter (xdpdeva, xdpmaca, xdpdevb, xdpmacb) + local c = config.new() + local synth = require("apps.test.synth") + local npackets = require("apps.test.npackets") + local match = require("apps.test.match") + config.app(c, "source", synth.Synth, { + packets = random_v4_packets{ + sizes = {60,64,67,128,133,192,256,384,512,777,1024,1500,1501}, + src = xdpmaca, + dst = xdpmacb, + protocol = 42 + }}) + config.app(c, "npackets", npackets.Npackets, {npackets=1000}) + config.app(c, "match", match.Match) + config.app(c, xdpdeva, XDP, {ifname=xdpdeva}) + config.app(c, xdpdevb, XDP, {ifname=xdpdevb, filter="ip proto 42"}) + config.link(c, "source.output -> "..xdpdeva..".input") + config.link(c, xdpdevb..".output -> match.rx") + config.link(c, "source.copy -> npackets.input") + config.link(c, "npackets.output -> match.comparator") + -- Test redirect + engine.configure(c) + engine.main{ duration=.1 } + engine.report_links() + engine.report_apps() + assert(#engine.app_table.match:errors() == 0, "Match errors.") + -- Test pass + engine.configure(config.new()) + config.app(c, xdpdevb, XDP, {ifname=xdpdevb, filter="ip6 proto 77"}) + engine.configure(c) + engine.main{ duration=.1 } + engine.report_links() + assert(#engine.app_table.match:errors() == 1000, "Matched packets.") + assert(link.stats(engine.app_table[xdpdevb].output.output).rxpackets == 0, + "Too many packets received on "..xdpdevb) +end + function selftest_share_interface_worker (xdpdev, queue) snabb_enable_xdp() local c = config.new() From 6b5851a4c5b18a5ed56e4e90eb5dbf4353d7ba78 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 28 Feb 2020 17:30:56 +0100 Subject: [PATCH 090/209] lib.numa: gracefully handle failure to read CPU performance governor 
--- src/lib/numa.lua | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/lib/numa.lua b/src/lib/numa.lua index 55b510be24..04756c9f6e 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -158,12 +158,20 @@ local function assert_irqbalanced_disabled (warn) end end +local function read_cpu_performance_governor (cpu) + local path = '/sys/devices/system/cpu/cpu'..cpu..'/cpufreq/scaling_governor' + local f = io.open(path) + if not f then return "unknown" end + local gov = f:read() + f:close() + return gov +end + local function check_cpu_performance_tuning (cpu, strict) local warn = warn if strict then warn = die end assert_irqbalanced_disabled(warn) - local path = '/sys/devices/system/cpu/cpu'..cpu..'/cpufreq/scaling_governor' - local gov = assert(io.open(path)):read() + local gov = read_cpu_performance_governor(cpu) if not gov:match('performance') then warn('Expected performance scaling governor for CPU %s, but got "%s"', cpu, gov) From 16c99eda80ebb862d03886fe501f4abcb727a2e2 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 5 Mar 2020 16:53:45 +0100 Subject: [PATCH 091/209] lib.ptree: cleanup obsolete aggregated stats --- src/lib/ptree/ptree.lua | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/lib/ptree/ptree.lua b/src/lib/ptree/ptree.lua index a6aa22232a..1fa509e67b 100644 --- a/src/lib/ptree/ptree.lua +++ b/src/lib/ptree/ptree.lua @@ -374,6 +374,14 @@ function Manager:monitor_worker_stats(id) counters.archived[0] = counters.archived[0] + val counter.delete(qualified_name) S.unlink(strip_suffix(qualified_name, ".counter")..".rrd") + local last_in_set = true + for _ in pairs(counters.active) do + last_in_set = false + break + end + if last_in_set then + self:cleanup_aggregated_stats(name, 'counter') + end end elseif has_suffix(ev.name, '.gauge') then local gauges = self.gauges[name] @@ -390,6 +398,14 @@ function Manager:monitor_worker_stats(id) gauges.active[pid] = nil gauges.rrd[pid] = 
nil S.unlink(strip_suffix(qualified_name, ".gauge")..".rrd") + local last_in_set = true + for _ in pairs(gauges.active) do + last_in_set = false + break + end + if last_in_set then + self:cleanup_aggregated_stats(name, 'gauge') + end end end end @@ -424,6 +440,20 @@ function Manager:sample_active_stats() end end +function Manager:cleanup_aggregated_stats(name, typ) + shm.unlink(name) + shm.unlink(strip_suffix(name, "."..typ)..".rrd") + self:cleanup_parent_directories(name) +end + +function Manager:cleanup_parent_directories(name) + local parent = name:match("(.*)/[^/]+$") + if not parent then return end + for _ in pairs(shm.children(parent)) do return end + shm.unlink(parent) + self:cleanup_parent_directories(parent) +end + function Manager:start_worker_for_graph(id, graph) local scheduling = self:compute_scheduling_for_worker(id, graph) self:info('Starting worker %s.', id) From 76c0c80a7cca05d24d9a83cc71d22a4fb508593a Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Thu, 5 Mar 2020 15:00:45 +0100 Subject: [PATCH 092/209] apps.vhost: suppress -Wstringop-truncation GCC 8 added a warning that affects a legitimate use case. 
--- src/apps/vhost/vhost_user.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/apps/vhost/vhost_user.c b/src/apps/vhost/vhost_user.c index 77752f93a5..3550deff86 100644 --- a/src/apps/vhost/vhost_user.c +++ b/src/apps/vhost/vhost_user.c @@ -33,8 +33,10 @@ int vhost_user_connect(const char *path) } un.sun_family = AF_UNIX; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(un.sun_path, path, sizeof(un.sun_path)); - +#pragma GCC diagnostic pop if (connect(sock, (struct sockaddr *) &un, sizeof(un)) == -1) { close(sock); return -1; @@ -54,7 +56,10 @@ int vhost_user_listen(const char *path) } un.sun_family = AF_UNIX; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(un.sun_path, path, sizeof(un.sun_path)); +#pragma GCC diagnostic pop unlink(un.sun_path); if (bind(sock, (struct sockaddr *) &un, sizeof(un)) == -1) { close(sock); From 56884a7d7aa0fed498ecd5574e8371ac74539914 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 30 Mar 2020 17:08:06 +0200 Subject: [PATCH 093/209] apps.mellanox.connectx4: add MT28800 as supported device Also add identification of ConnectX type to the driver for future version-dependent code. 
--- src/apps/mellanox/connectx4.lua | 13 ++++++++++++- src/lib/hardware/pci.lua | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 4c9cf10728..562f0f052d 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -177,9 +177,18 @@ end ConnectX4 = {} ConnectX4.__index = ConnectX4 +local mlx_types = { + ["0x1013" ] = 4, -- ConnectX4 + ["0x1017" ] = 5, -- ConnectX5 + ["0x1019" ] = 5, -- ConnectX5 +} + function ConnectX4:new (conf) local self = setmetatable({}, self) local pciaddress = pci.qualified(conf.pciaddress) + local device_info = pci.device_info(pciaddress) + self.mlx = assert(mlx_types[device_info.device], + "Unsupported device "..device_info.device) local sendq_size = conf.sendq_size or 1024 local recvq_size = conf.recvq_size or 1024 @@ -210,7 +219,9 @@ function ConnectX4:new (conf) local hca = hca_factory:new() -- Makes enable_hca() hang with ConnectX5 - -- init_seg:reset() + if self.mlx == 4 then + init_seg:reset() + end if debug_trace then init_seg:dump() end while not init_seg:ready() do C.usleep(1000) diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index 7b74ab370f..82c330e411 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -86,6 +86,7 @@ local cards = { ["0x15b3"] = { ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx4'}, ["0x1017" ] = {model = 'MT27800', driver = 'apps.mellanox.connectx4'}, + ["0x1019" ] = {model = 'MT28800', driver = 'apps.mellanox.connectx4'}, }, } From a0e85852a4d516ffb7ddd6c7d2444197852bfc80 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 30 Mar 2020 17:52:48 +0200 Subject: [PATCH 094/209] apps.mellanox.connectx4: add per-queue drop counters for ConnectX5 These counters capture the drops due to ingress buffer overflow. No other per-queue stats are currently implemented. The counters are aggregated to a single ingress drop counter. 
--- src/apps/mellanox/connectx4.lua | 80 ++++++++++++++++++++++++++++++--- 1 file changed, 74 insertions(+), 6 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 562f0f052d..5f33b8f510 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -257,6 +257,9 @@ function ConnectX4:new (conf) local rqlist = {} local rqs = {} + -- List of queue counter IDs (ConnectX5 and up) + local counter_set_ids = {} + local usevlan = false for _, queue in ipairs(conf.queues) do @@ -277,9 +280,15 @@ function ConnectX4:new (conf) cxq.swq = cast(ffi.typeof(cxq.swq), workqueues + 64 * recvq_size) -- Create the queue objects local tis = hca:create_tis(0, tdomain) + local counter_set_id + if self.mlx > 4 then + counter_set_id = hca:alloc_q_counter() + table.insert(counter_set_ids, counter_set_id) + end -- XXX order check cxq.sqn = hca:create_sq(scqn, pd, sendq_size, cxq.doorbell, cxq.swq, uar, tis) - cxq.rqn = hca:create_rq(rcqn, pd, recvq_size, cxq.doorbell, cxq.rwq) + cxq.rqn = hca:create_rq(rcqn, pd, recvq_size, cxq.doorbell, cxq.rwq, + counter_set_id) hca:modify_sq(cxq.sqn, 0, 1) -- RESET -> READY hca:modify_rq(cxq.rqn, 0, 1) -- RESET -> READY @@ -321,6 +330,11 @@ function ConnectX4:new (conf) for _, l4_proto in ipairs(l4_protos) do local tir = hca:create_tir_indirect(rqt, tdomain, l3_proto, l4_proto) + -- NOTE: flow table entries will only match if the packet + -- contains the complete L4 header. Keep this in mind when + -- processing truncated packets (e.g. from a port-mirror). + -- If the header is incomplete, the packet will fall through + -- to the wildcard match and end up in the first queue. 
hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip, index, tir, l3_proto, l4_proto) index = index + 1 @@ -379,7 +393,11 @@ function ConnectX4:new (conf) set(stats.rxpackets, r.rxpackets) set(stats.rxmcast, r.rxmcast) set(stats.rxbcast, r.rxbcast) - set(stats.rxdrop, r.rxdrop) + if self.mlx == 4 then + -- ConnectX 4 doesn't have per-queue drop stats, + -- but this counter appears to always be zero :/ + set(stats.rxdrop, r.rxdrop) + end set(stats.rxerrors, r.rxerrors) set(stats.txbytes, r.txbytes) set(stats.txpackets, r.txpackets) @@ -404,10 +422,27 @@ function ConnectX4:new (conf) end }, } + + -- Empty for ConnectX4 + for _, id in ipairs(counter_set_ids) do + table.insert(self.stats_reqs, + { + start_fn = HCA.query_q_counter_start, + finish_fn = HCA.query_q_counter_finish, + args = { set_id = id }, + process_fn = function(r, stats) + -- Incremental update relies on query_q_counter to + -- clear the counter after read. + counter.set(stats.rxdrop, + counter.read(stats.rxdrop) + r.out_of_buffer) + end + }) + end + for _, req in ipairs(self.stats_reqs) do req.hca = hca_factory:new() -- Post command - req.start_fn(req.hca) + req.start_fn(req.hca, req.args) end self.sync_timer = lib.throttle(1) @@ -868,7 +903,10 @@ function HCA:create_tir_indirect (rqt, transport_domain, l3_proto, l4_proto) self:command("CREATE_TIR", 0x10C, 0x0C) :input("opcode", 0x00, 31, 16, 0x900) :input("disp_type", 0x20 + 0x04, 31, 28, 1) -- indirect - :input("rx_hash_symmetric",0x20 + 0x20, 31, 31, 1) -- hash symmetrically + -- Symmetric hashing would sort src/dst ports prior to hashing to + -- map bi-directional traffic to the same queue. We don't need that + -- since flows are inherently uni-directional. 
+ :input("rx_hash_symmetric",0x20 + 0x20, 31, 31, 0) -- disabled :input("indirect_table", 0x20 + 0x20, 23, 0, rqt) :input("rx_hash_fn", 0x20 + 0x24, 31, 28, 2) -- toeplitz :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) @@ -950,7 +988,7 @@ end -- Create a receive queue and return a receive queue object. -- Return the receive queue number and a pointer to the WQEs. -function HCA:create_rq (cqn, pd, size, doorbell, rwq) +function HCA:create_rq (cqn, pd, size, doorbell, rwq, counter_set_id) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local rwq_phy = memory.virtual_to_physical(rwq) @@ -969,7 +1007,11 @@ function HCA:create_rq (cqn, pd, size, doorbell, rwq) :input("log_wq_size", 0x20 + 0x30 + 0x20, 4 , 0, log_wq_size) :input("pas[0] high", 0x20 + 0x30 + 0xC0, 63, 32, ptrbits(rwq_phy, 63, 32)) :input("pas[0] low", 0x20 + 0x30 + 0xC4, 31, 0, ptrbits(rwq_phy, 31, 0)) - :execute() + if counter_set_id then + -- Only set for ConnectX5 and higher + self:input("counter_set_id",0x20 + 0x0C, 31, 24, counter_set_id) + end + self:execute() return self:output(0x08, 23, 0) end @@ -1622,6 +1664,32 @@ function HCA:get_port_stats_finish () return port_stats end +function HCA:alloc_q_counter() + self:command("ALLOC_Q_COUNTER", 0x18, 0x10C) + :input("opcode", 0x00, 31, 16, 0x771) + :execute() + return self:output(0x08, 7, 0) +end + +local q_stats = { + out_of_buffer = 0ULL +} +function HCA:query_q_counter_start (args) + self:command("QUERY_Q_COUNTER", 0x20, 0x10C) + :input("opcode", 0x00, 31, 16, 0x773) + -- Clear the counter after reading. This allows us to + -- update the rxdrop stat incrementally. 
+ :input("clear", 0x18, 31, 31, 1) + :input("counter_set_id",0x1c, 7, 0, args.set_id) + :execute_async() +end + +local out_of_buffer = 0ULL +function HCA:query_q_counter_finish () + q_stats.out_of_buffer = self:output(0x10 + 0x20, 31, 0) + return q_stats +end + --------------------------------------------------------------- -- Command Interface implementation. -- From 0068df61213d030ac6064f0d5db8705373e7e3c7 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 17 Apr 2020 15:28:24 +0200 Subject: [PATCH 095/209] arch/checksum: fix bug in carry propagation This fixes a bug in our scalar x86 IP checksum implementation, where some carries where missed. Really hard to find with random testing, added a systematic test case for carry propagation. --- src/arch/checksum.dasl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/arch/checksum.dasl b/src/arch/checksum.dasl index a188f75658..38fe29a499 100644 --- a/src/arch/checksum.dasl +++ b/src/arch/checksum.dasl @@ -71,6 +71,7 @@ local function gen_checksum () | jl >5 -- Jump to branch '3'. | mov r9d, dword [rdi + r8] -- Fetch 32-bit from data + r8 into r9d. | add rax, r9 -- Sum acc with r9. Accumulate carry. + | adc rax, 0 -- Sum carry-bit into acc. | sub rcx, 4 -- Decrease index by 4. | add r8, 4 -- Next 32-bit. | 5: @@ -78,6 +79,7 @@ local function gen_checksum () | jl >6 -- Jump to branch '4'. | movzx r9, word [rdi + r8] -- Fetch 16-bit from data + r8 into r9. | add rax, r9 -- Sum acc with r9. Accumulate carry. + | adc rax, 0 -- Sum carry-bit into acc. | sub rcx, 2 -- Decrease index by 2. | add r8, 2 -- Next 16-bit. | 6: @@ -85,6 +87,7 @@ local function gen_checksum () | jl >7 -- Jump to branch '5'. | movzx r9, byte [rdi + r8] -- Fetch 8-bit from data + r8 into r9. | add rax, r9 -- Sum acc with r9. Accumulate carry. + | adc rax, 0 -- Sum carry-bit into acc. -- Fold 64-bit into 16-bit. | 7: | mov r9, rax -- Assign acc to r9. 
@@ -157,4 +160,11 @@ function selftest () assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(ntohs(checksum_lua(pkt.data, pkt.length)))) assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(C.cksum_generic(pkt.data, pkt.length, 0))) end + -- Test carry propagation + for l = 1, 63 do + local pkt = { data = ffi.new("uint8_t[?]", l), length = l } + for i = 0, l-2 do pkt.data[i]=0xff end; pkt.data[l-1] = 0x01 + assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(ntohs(checksum_lua(pkt.data, pkt.length)))) + assert(hex(checksum(pkt.data, pkt.length, 0)) == hex(C.cksum_generic(pkt.data, pkt.length, 0))) + end end From fc37be48e226af2124359bb016f0b8ad4b5ee09d Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Wed, 15 Apr 2020 10:40:37 +0200 Subject: [PATCH 096/209] apps.mellanox.connectx4: unbind device in IO:new() IO:new() may be called before ConnectX4:new(). The device is unbound by whichever method runs first (IO:new() waits for ConnectX4:new() to set up the CXQ in any case). --- src/apps/mellanox/connectx4.lua | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 5f33b8f510..c4346561dd 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -1080,6 +1080,9 @@ function IO:new (conf) local pciaddress = pci.qualified(conf.pciaddress) local queue = conf.queue + -- This is also done in Connectex4:new() but might not have + -- happened yet. + pci.unbind_device_from_linux(pciaddress) local mmio, fd = pci.map_pci_memory(pciaddress, 0, false) local online = false -- True when queue is up and running From adc7d3d697204e8c557cdcc67b7368a2e41da93c Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 1 May 2020 15:43:36 +0200 Subject: [PATCH 097/209] apps.mellanox.connectx4: fix mcast counters The intel_mp driver conflates multicast packets with broadcast packets, which then needs to be reversed by lib.ipc.shmem.iftable_mib. 
We have to do the same here to get consistent MIB counters. It would probably be better to properly count the multicast packets in the Intel driver in the first place. --- src/apps/mellanox/connectx4.lua | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index c4346561dd..ec21bb75b3 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -1649,19 +1649,27 @@ end function HCA:get_port_stats_finish () port_stats.rxbytes = self:output64(0x18 + 0x00) -- includes 4-byte CRC local in_ucast_packets = self:output64(0x18 + 0x08) - port_stats.rxmcast = self:output64(0x18 + 0x48) - port_stats.rxbcast = self:output64(0x18 + 0x50) + local in_mcast_packets = self:output64(0x18 + 0x48) + local in_bcast_packets = self:output64(0x18 + 0x50) + -- This is weird. The intel_mp driver adds broadcast packets to the + -- mcast counter, it is unclear why. Then + -- lib.ipc.shmem.iftable_mib reverses it to get the true mcast + -- counter back. So we do the same here. The proper fix would be + -- to fix the Intel driver and remove the anti-hack from + -- iftable_mib.
+ port_stats.rxmcast = in_mcast_packets + in_bcast_packets + port_stats.rxbcast = in_bcast_packets port_stats.rxpackets = in_ucast_packets + port_stats.rxmcast - + port_stats.rxbcast port_stats.rxdrop = self:output64(0x18 + 0x10) port_stats.rxerrors = self:output64(0x18 + 0x18) port_stats.txbytes = self:output64(0x18 + 0x28) local out_ucast_packets = self:output64(0x18 + 0x30) - port_stats.txmcast = self:output64(0x18 + 0x58) - port_stats.txbcast = self:output64(0x18 + 0x60) + local out_mcast_packets = self:output64(0x18 + 0x58) + local out_bcast_packets = self:output64(0x18 + 0x60) + port_stats.txmcast = out_mcast_packets + out_bcast_packets + port_stats.txbcast = out_bcast_packets port_stats.txpackets = out_ucast_packets + port_stats.txmcast - + port_stats.txbcast port_stats.txdrop = self:output64(0x18 + 0x38) port_stats.txerrors = self:output64(0x18 + 0x40) return port_stats From 22197483947eb9cfb59c6b50eb3b0c32a290d92e Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Tue, 14 Apr 2020 20:36:33 +0200 Subject: [PATCH 098/209] Fix transitions in CXQ state machine transition() did not actually check the current state, which caused IO handlers to open the queue prematurely, leading to crashes in a multi-process setup. There probably still lurks a race condition without an atomic "LOCK CMPXCHG" operation, as commented in the code. The CXQ state machine is called from the push()/pull() methods and could introduce branches that might have a negative impact on performance (side traces). --- src/apps/mellanox/connectx4.lua | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index ec21bb75b3..24a09b0dd7 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -163,8 +163,12 @@ local DEAD = 3 -- Returns true on successful transition, false if oldstate does not match. 
function transition (cxq, oldstate, newstate) -- XXX use atomic x86 "LOCK CMPXCHG" instruction. Have to teach DynASM. - cxq.state = newstate - return true + if cxq.state == oldstate then + cxq.state = newstate + return true + else + return false + end end --------------------------------------------------------------- From 6cdcb40c438d4a83ebb1a2c8d71f5ce99144ff2c Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Tue, 27 Oct 2020 14:14:41 +0100 Subject: [PATCH 099/209] apps.mellanox: free CXQ during process shutdown Also change to core.sync for atmoic state transition. --- src/apps/mellanox/connectx4.lua | 61 ++++++++++++++++++++------------- src/core/main.lua | 1 + 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 24a09b0dd7..27941948c9 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -32,6 +32,7 @@ module(...,package.seeall) local ffi = require "ffi" local C = ffi.C local lib = require("core.lib") +local sync = require("core.sync") local pci = require("lib.hardware.pci") local register = require("lib.hardware.register") local index_set = require("lib.index_set") @@ -76,7 +77,8 @@ local rqt_max_size = 128 -- future that we do not need this much flexibility. Time will tell.) --------------------------------------------------------------- --- CXQs can be in one of four states: +-- CXQs can be in one of five states: +-- INIT: CXQ is being initialized by the control app -- FREE: CXQ is ready and available for use by an IO app. -- IDLE: CXQ is owned by an app, but not actively processing right now. -- BUSY: CXQ is owned by an app and is currently processing (e.g. push/pull). @@ -90,13 +92,13 @@ local rqt_max_size = 128 -- -- App Change Why -- ---- ----------- -------------------------------------------------------- --- CTRL none->BUSY: Control app starts initialization. --- CTRL BUSY->FREE: Control app completes initialization. 
+-- CTRL none->INIT: Control app starts initialization. +-- CTRL INIT->FREE: Control app completes initialization. -- IO FREE->IDLE: IO app starts and becomes owner of the CXQ. -- IO IDLE->FREE: IO app stops and releases the CXQ for future use. -- IO IDLE->BUSY: IO app starts running a pull/push method. -- IO BUSY->IDLE: IO app stops running a pull/push method. --- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) +-- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) NYI -- -- These state transitions are *PROHIBITED* for important reasons: -- @@ -120,7 +122,7 @@ local rqt_max_size = 128 -- format.) local cxq_t = ffi.typeof([[ struct { - uint32_t state; // current state / availability + int state[1]; // current state / availability // configuration information: uint32_t sqn; // send queue number @@ -154,20 +156,28 @@ local cxq_t = ffi.typeof([[ ]]) -- CXQ states: -local BUSY = 0 -- Implicit initial state due to 0 value. -local IDLE = 1 -local FREE = 2 -local DEAD = 3 - --- Transition from oldstate to newstate. --- Returns true on successful transition, false if oldstate does not match. -function transition (cxq, oldstate, newstate) - -- XXX use atomic x86 "LOCK CMPXCHG" instruction. Have to teach DynASM. - if cxq.state == oldstate then - cxq.state = newstate - return true - else - return false +local INIT = 0 -- Implicit initial state due to 0 value. +local BUSY = 1 +local IDLE = 2 +local FREE = 3 +local DEAD = 4 + +-- Release CXQ after process termination. 
Called from +-- core.main.shutdown +function shutdown(pid) + for _, pciaddr in ipairs(shm.children("/"..pid.."/mellanox")) do + for _, queue in ipairs(shm.children("/"..pid.."/mellanox/"..pciaddr)) do + local backlink = "/"..pid.."/mellanox/"..pciaddr.."/"..queue + local shm_name = "/"..pid.."/group/pci/"..pciaddr.."/"..queue + if shm.exists(shm_name) then + local cxq = shm.open(shm_name, cxq_t) + assert(sync.cas(cxq.state, IDLE, FREE) or + sync.cas(cxq.state, BUSY, FREE), + "ConnectX4: failed to free "..shm_name.. + " during shutdown") + end + shm.unlink(backlink) + end end end @@ -297,7 +307,7 @@ function ConnectX4:new (conf) hca:modify_rq(cxq.rqn, 0, 1) -- RESET -> READY -- CXQ is now fully initialized & ready for attach. - assert(transition(cxq, BUSY, FREE)) + assert(sync.cas(cxq.state, INIT, FREE)) usevlan = usevlan or (queue.vlan ~= nil) @@ -1098,6 +1108,7 @@ function IO:new (conf) -- Close the queue mapping. local function close () + shm.unlink(self.backlink) shm.unmap(cxq) cxq = nil end @@ -1105,9 +1116,11 @@ function IO:new (conf) -- Open the queue mapping. local function open () local shmpath = "group/pci/"..pciaddress.."/"..queue + self.backlink = "mellanox/"..pciaddress.."/"..queue if shm.exists(shmpath) then + shm.alias(self.backlink, shmpath) cxq = shm.open(shmpath, cxq_t) - if transition(cxq, FREE, IDLE) then + if sync.cas(cxq.state, FREE, IDLE) then sq = SQ:new(cxq, mmio) rq = RQ:new(cxq) else @@ -1124,10 +1137,10 @@ function IO:new (conf) end if cxq then -- Careful: Control app may have closed the CXQ. - if transition(cxq, IDLE, BUSY) then + if sync.cas(cxq.state, IDLE, BUSY) then return true else - assert(cxq.state == DEAD, "illegal state detected") + assert(cxq.state[0] == DEAD, "illegal state detected") close() end end @@ -1135,7 +1148,7 @@ function IO:new (conf) -- Enter the idle state. 
local function deactivate () - assert(transition(cxq, BUSY, IDLE)) + assert(sync.cas(cxq.state, BUSY, IDLE)) end -- Send packets to the NIC diff --git a/src/core/main.lua b/src/core/main.lua index a18520568b..3e674c17c7 100644 --- a/src/core/main.lua +++ b/src/core/main.lua @@ -180,6 +180,7 @@ function shutdown (pid) -- Run cleanup hooks safely(function () require("apps.interlink.receiver").shutdown(pid) end) safely(function () require("apps.interlink.transmitter").shutdown(pid) end) + safely(function () require("apps.mellanox.connectx4").shutdown(pid) end) -- Parent process performs additional cleanup steps. -- (Parent is the process whose 'group' folder is not a symlink.) local st, err = S.lstat(shm.root.."/"..pid.."/group") From c579524f88605d1716448ae060526273206764cb Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 6 Nov 2020 09:38:01 +0100 Subject: [PATCH 100/209] apps.mellanox: use correct WQE stride in RX queues The size of a WQE for a receive queue is 16 bytes but a value of 64 was used when allocating the DMA memory for the work queues. --- src/apps/mellanox/connectx4.lua | 72 +++++++++++++++++---------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx4.lua index 27941948c9..1f706e0163 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx4.lua @@ -142,6 +142,10 @@ local cxq_t = ffi.typeof([[ // send work queue and send/receive completion queues union { uint8_t u8[64]; uint32_t u32[0]; uint64_t u64[0];} *swq, *scq, *rcq; + // The tx and rx lists must each be large enough for the maximum + // queue size, which currently is 32768. We should probably add + // a check for that. 
+ // Transmit state struct packet *tx[64*1024]; // packets queued for transmit uint16_t next_tx_wqeid; // work queue ID for next transmit descriptor @@ -246,6 +250,7 @@ function ConnectX4:new (conf) hca:enable_hca() hca:set_issi(1) hca:alloc_pages(hca:query_pages("boot")) + local max_cap = hca:query_hca_general_cap('max') if debug_trace then self:dump_capabilities(hca) end -- Initialize the card @@ -280,6 +285,19 @@ function ConnectX4:new (conf) -- Create a shared memory object for controlling the queue pair local cxq = shm.create("group/pci/"..pciaddress.."/"..queue.id, cxq_t) + local function check_qsize (type, size) + assert(check_pow2(size), + string.format("%s: %s queue size must be a power of 2: %d", + conf.pciaddress, type, size)) + assert(log2size(size) <= max_cap['log_max_wq_sz'], + string.format("%s: %s queue size too big: requested %d, allowed %d", + conf.pciaddress, type, size, + math.pow(2, max_cap['log_max_wq_sz']))) + end + + check_qsize("Send", sendq_size) + check_qsize("Receive", recvq_size) + cxq.rlkey = rlkey cxq.sqsize = sendq_size cxq.rqsize = recvq_size @@ -289,9 +307,13 @@ function ConnectX4:new (conf) cxq.scq = cast(typeof(cxq.scq), scqe) cxq.rcq = cast(typeof(cxq.rcq), rcqe) cxq.doorbell = cast(typeof(cxq.doorbell), memory.dma_alloc(16)) - local workqueues = memory.dma_alloc(64 * (sendq_size + recvq_size), 4096) + + local rq_stride = ffi.sizeof(ffi.typeof(cxq.rwq[0])) + local sq_stride = ffi.sizeof(ffi.typeof(cxq.swq[0])) + local workqueues = memory.dma_alloc(sq_stride * sendq_size + + rq_stride *recvq_size, 4096) cxq.rwq = cast(ffi.typeof(cxq.rwq), workqueues) - cxq.swq = cast(ffi.typeof(cxq.swq), workqueues + 64 * recvq_size) + cxq.swq = cast(ffi.typeof(cxq.swq), workqueues + rq_stride * recvq_size) -- Create the queue objects local tis = hca:create_tis(0, tdomain) local counter_set_id @@ -300,8 +322,10 @@ function ConnectX4:new (conf) table.insert(counter_set_ids, counter_set_id) end -- XXX order check - cxq.sqn = hca:create_sq(scqn, pd, 
sendq_size, cxq.doorbell, cxq.swq, uar, tis) - cxq.rqn = hca:create_rq(rcqn, pd, recvq_size, cxq.doorbell, cxq.rwq, + cxq.sqn = hca:create_sq(scqn, pd, sq_stride, sendq_size, + cxq.doorbell, cxq.swq, uar, tis) + cxq.rqn = hca:create_rq(rcqn, pd, rq_stride, recvq_size, + cxq.doorbell, cxq.rwq, counter_set_id) hca:modify_sq(cxq.sqn, 0, 1) -- RESET -> READY hca:modify_rq(cxq.rqn, 0, 1) -- RESET -> READY @@ -1002,7 +1026,8 @@ end -- Create a receive queue and return a receive queue object. -- Return the receive queue number and a pointer to the WQEs. -function HCA:create_rq (cqn, pd, size, doorbell, rwq, counter_set_id) +function HCA:create_rq (cqn, pd, stride, size, doorbell, rwq, counter_set_id) + local log_wq_stride = log2size(stride) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local rwq_phy = memory.virtual_to_physical(rwq) @@ -1016,7 +1041,7 @@ function HCA:create_rq (cqn, pd, size, doorbell, rwq, counter_set_id) :input("pd", 0x20 + 0x30 + 0x08, 23, 0, pd) :input("dbr_addr high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) :input("dbr_addr low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0)) - :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, 4) + :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, log_wq_stride) :input("log_page_size", 0x20 + 0x30 + 0x20, 12, 8, log_page_size) :input("log_wq_size", 0x20 + 0x30 + 0x20, 4 , 0, log_wq_size) :input("pas[0] high", 0x20 + 0x30 + 0xC0, 63, 32, ptrbits(rwq_phy, 63, 32)) @@ -1051,7 +1076,8 @@ end -- Create a Send Queue. -- Return the send queue number and a pointer to the WQEs. 
-function HCA:create_sq (cqn, pd, size, doorbell, swq, uar, tis) +function HCA:create_sq (cqn, pd, stride, size, doorbell, swq, uar, tis) + local log_wq_stride = log2size(stride) local log_wq_size = log2size(size) local db_phy = memory.virtual_to_physical(doorbell) local swq_phy = memory.virtual_to_physical(swq) @@ -1069,7 +1095,7 @@ function HCA:create_sq (cqn, pd, size, doorbell, swq, uar, tis) :input("uar_page", 0x20 + 0x30 + 0x0C, 23, 0, uar) :input("pas[0] high", 0x20 + 0x30 + 0x10, 31, 0, ptrbits(db_phy, 63, 32)) :input("pas[0] low", 0x20 + 0x30 + 0x14, 31, 0, ptrbits(db_phy, 31, 0)) - :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, 6) + :input("log_wq_stride", 0x20 + 0x30 + 0x20, 19, 16, log_wq_stride) :input("log_wq_page_sz", 0x20 + 0x30 + 0x20, 12, 8, 6) -- XXX check :input("log_wq_size", 0x20 + 0x30 + 0x20, 4, 0, log_wq_size) :input("pas[0] high", 0x20 + 0x30 + 0xC0, 31, 0, ptrbits(swq_phy, 63, 32)) @@ -1175,34 +1201,8 @@ end --------------------------------------------------------------- -- Receive queue --- Work queue entries have irregular shapes and sizes. --- We operate on them simply as 64-byte chunks. -local wqe_t = ffi.typeof[[ - union { - uint8_t u8[64]; - uint32_t u32[0]; - uint64_t u64[0]; - } * - ]] - --- CQEs are similar to WQEs. 
-local cqe_t = wqe_t - -local doorbell_t = ffi.typeof[[ - struct { - uint32_t receive; - uint32_t send; - }* -]] - RQ = {} -local rwqe_t = ffi.typeof[[ - struct { - uint32_t length, lkey, address_high, address_low; - } * -]] - function RQ:new (cxq) local rq = {} @@ -2186,6 +2186,10 @@ function log2size (size) return math.ceil(math.log(size) / math.log(2)) end +function check_pow2 (num) + return bit.band(num, num - 1) == 0 +end + function selftest () io.stdout:setvbuf'no' From 42c2196a65dcd4e043107a059a4f1f09a32bdd2d Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Thu, 31 Jan 2019 17:24:25 +0000 Subject: [PATCH 101/209] mellanox: Rename ConnectX4 -> ConnectX (because it works with 5+ too) (cherry picked from commit 937b1e45e7bcc3bc1db81313673d8008e5abbdcf) --- src/apps/mellanox/{connectx4.lua => connectx.lua} | 10 +++++----- src/apps/mellanox/connectx_test.lua | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) rename src/apps/mellanox/{connectx4.lua => connectx.lua} (99%) diff --git a/src/apps/mellanox/connectx4.lua b/src/apps/mellanox/connectx.lua similarity index 99% rename from src/apps/mellanox/connectx4.lua rename to src/apps/mellanox/connectx.lua index 1f706e0163..52503a2ee9 100644 --- a/src/apps/mellanox/connectx4.lua +++ b/src/apps/mellanox/connectx.lua @@ -1,10 +1,10 @@ --- Device driver for the Mellanox ConnectX-4 Ethernet controller family. +-- Device driver for the Mellanox ConnectX-4+ Ethernet controller family. -- Use of this source code is governed by the Apache 2.0 license; see COPYING. --- This is a device driver for Mellanox ConnectX-4 and ConnectX-4 LX --- ethernet cards. This driver is completely stand-alone and does not --- depend on any other software such as Mellanox OFED library or the --- Linux mlx5 driver. +-- This is a device driver for Mellanox ConnectX family ethernet +-- cards. 
This driver is completely stand-alone and does not depend on +-- any other software such as Mellanox OFED library or the Linux mlx5 +-- driver. -- -- Thanks are due to Mellanox and Deutsche Telekom for making it -- possible to develop this driver based on publicly available diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 0e7c56e880..ee98edecf3 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -1,10 +1,10 @@ --- Test suite for the Mellanox ConnectX-4 driver. +-- Test suite for the Mellanox ConnectX driver. -- Use of this source code is governed by the Apache 2.0 license; see COPYING. module(..., package.seeall) local ffi = require("ffi") local C = ffi.C -local connectx4 = require("apps.mellanox.connectx4") +local connectx = require("apps.mellanox.connectx") local counter = require("core.counter") local lib = require("core.lib") @@ -27,7 +27,7 @@ local lib = require("core.lib") -- -- Hardware queue count will be macs*vlans*rss on each interface. 
function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburst, macs, vlans, rss) - print("selftest: connectx4_test switch") + print("selftest: connectx_test switch") assert(rss == 1, "rss not yet handled") assert(ncores == 1, "multicore not yet handled") -- Create queue definitions @@ -39,14 +39,14 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs end end -- Instantiate app network - local nic0 = connectx4.ConnectX4:new({pciaddress=pci0, queues=queues, macvlan=true}) - local nic1 = connectx4.ConnectX4:new({pciaddress=pci1, queues=queues, macvlan=true}) + local nic0 = connectx.ConnectX:new({pciaddress=pci0, queues=queues, macvlan=true}) + local nic1 = connectx.ConnectX:new({pciaddress=pci1, queues=queues, macvlan=true}) local io0 = {} -- io apps on nic0 local io1 = {} -- io apps on nic1 print(("creating %d queues per device..."):format(#queues)) for _, queue in ipairs(queues) do local function ioapp (pci, queue) - local a = connectx4.IO:new({pciaddress=pci, queue=queue.id}) + local a = connectx.IO:new({pciaddress=pci, queue=queue.id}) a.input = { input = link.new(("input-%s-%s" ):format(pci, queue.id)) } a.output = { output = link.new(("output-%s-%s"):format(pci, queue.id)) } return a From 556e42779299166232853eb8e6ff64c2174879d3 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Fri, 6 Nov 2020 10:13:32 +0100 Subject: [PATCH 102/209] mellanox: complete renaming ConnectX4 -> ConnectX --- src/apps/mellanox/connectx.lua | 28 ++++++++++++++-------------- src/core/main.lua | 2 +- src/lib/hardware/pci.lua | 8 ++++---- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 52503a2ee9..fdfed64f90 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -177,7 +177,7 @@ function shutdown(pid) local cxq = shm.open(shm_name, cxq_t) assert(sync.cas(cxq.state, IDLE, FREE) or sync.cas(cxq.state, BUSY, FREE), - 
"ConnectX4: failed to free "..shm_name.. + "ConnectX: failed to free "..shm_name.. " during shutdown") end shm.unlink(backlink) @@ -186,14 +186,14 @@ function shutdown(pid) end --------------------------------------------------------------- --- ConnectX4 Snabb app. +-- ConnectX Snabb app. -- -- Uses the driver routines to implement ConnectX-4 support in -- the Snabb app network. --------------------------------------------------------------- -ConnectX4 = {} -ConnectX4.__index = ConnectX4 +ConnectX = {} +ConnectX.__index = ConnectX local mlx_types = { ["0x1013" ] = 4, -- ConnectX4 @@ -201,7 +201,7 @@ local mlx_types = { ["0x1019" ] = 5, -- ConnectX5 } -function ConnectX4:new (conf) +function ConnectX:new (conf) local self = setmetatable({}, self) local pciaddress = pci.qualified(conf.pciaddress) local device_info = pci.device_info(pciaddress) @@ -513,7 +513,7 @@ function ConnectX4:new (conf) return self end -function ConnectX4:dump_capabilities (hca) +function ConnectX:dump_capabilities (hca) --if true then return end -- Print current and maximum card capabilities. 
-- XXX Check if we have any specific requirements that we need to @@ -526,7 +526,7 @@ function ConnectX4:dump_capabilities (hca) end end -function ConnectX4:check_vport () +function ConnectX:check_vport () if true then return end local vport_ctx = hca:query_nic_vport_context() for k,v in pairs(vport_ctx) do @@ -538,7 +538,7 @@ function ConnectX4:check_vport () end end -function ConnectX4:print_vport_counter () +function ConnectX:print_vport_counter () local c = self.hca:query_vport_counter() local t = {} -- Sort into key order @@ -2193,15 +2193,15 @@ end function selftest () io.stdout:setvbuf'no' - local pcidev0 = lib.getenv("SNABB_PCI_CONNECTX4_0") - local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX4_1") + local pcidev0 = lib.getenv("SNABB_PCI_CONNECTX_0") + local pcidev1 = lib.getenv("SNABB_PCI_CONNECTX_1") -- XXX check PCI device type if not pcidev0 then - print("SNABB_PCI_CONNECTX4_0 not set") + print("SNABB_PCI_CONNECTX_0 not set") os.exit(engine.test_skipped_code) end if not pcidev1 then - print("SNABB_PCI_CONNECTX4_1 not set") + print("SNABB_PCI_CONNECTX_1 not set") os.exit(engine.test_skipped_code) end @@ -2213,8 +2213,8 @@ function selftest () io1.output = { output = link.new('output1') } -- Exercise the IO apps before the NIC is initialized. 
io0:pull() io0:push() io1:pull() io1:push() - local nic0 = ConnectX4:new{pciaddress = pcidev0, queues = {{id='a'}}} - local nic1 = ConnectX4:new{pciaddress = pcidev1, queues = {{id='b'}}} + local nic0 = ConnectX:new{pciaddress = pcidev0, queues = {{id='a'}}} + local nic1 = ConnectX:new{pciaddress = pcidev1, queues = {{id='b'}}} print("selftest: waiting for both links up") while (nic0.hca:query_vport_state().oper_state ~= 1) or diff --git a/src/core/main.lua b/src/core/main.lua index 3e674c17c7..1db0f366fc 100644 --- a/src/core/main.lua +++ b/src/core/main.lua @@ -180,7 +180,7 @@ function shutdown (pid) -- Run cleanup hooks safely(function () require("apps.interlink.receiver").shutdown(pid) end) safely(function () require("apps.interlink.transmitter").shutdown(pid) end) - safely(function () require("apps.mellanox.connectx4").shutdown(pid) end) + safely(function () require("apps.mellanox.connectx").shutdown(pid) end) -- Parent process performs additional cleanup steps. -- (Parent is the process whose 'group' folder is not a symlink.) 
local st, err = S.lstat(shm.root.."/"..pid.."/group") diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index 28f0e87ce8..a58f2d513e 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -86,9 +86,9 @@ local cards = { ["0x0903"] = {model = 'SFN7122F', driver = 'apps.solarflare.solarflare'} }, ["0x15b3"] = { - ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx4'}, - ["0x1017" ] = {model = 'MT27800', driver = 'apps.mellanox.connectx4'}, - ["0x1019" ] = {model = 'MT28800', driver = 'apps.mellanox.connectx4'}, + ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx'}, + ["0x1017" ] = {model = 'MT27800', driver = 'apps.mellanox.connectx'}, + ["0x1019" ] = {model = 'MT28800', driver = 'apps.mellanox.connectx'}, }, } @@ -96,7 +96,7 @@ local link_names = { ['apps.solarflare.solarflare'] = { "rx", "tx" }, ['apps.intel_mp.intel_mp'] = { "input", "output" }, ['apps.intel.intel_app'] = { "rx", "tx" }, - ['apps.mellanox.connectx4'] = { "input", "output" }, + ['apps.mellanox.connectx'] = { "input", "output" }, } -- Return the name of the Lua module that implements support for this device. From a348899c24ca3dc9c778e46c5e7c0273aa379edf Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Tue, 23 Feb 2021 11:49:24 +0100 Subject: [PATCH 103/209] lib.ctable: fix hugepages allocation size mmap() silently rounds up the size of the memory region to a multiple of the huge page size. This causes munmap() to fail if called with the original value. --- src/lib/ctable.lua | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lib/ctable.lua b/src/lib/ctable.lua index 6397f649c4..fb575efdc5 100644 --- a/src/lib/ctable.lua +++ b/src/lib/ctable.lua @@ -160,11 +160,14 @@ end -- hugepages, not this code. 
local try_huge_pages = true local huge_page_threshold = 1e6 +local huge_page_size = memory.get_huge_page_size() local function calloc(t, count) if count == 0 then return 0, 0 end local byte_size = ffi.sizeof(t) * count + local alloc_byte_size = byte_size local mem, err if try_huge_pages and byte_size > huge_page_threshold then + alloc_byte_size = ceil(byte_size/huge_page_size) * huge_page_size mem, err = S.mmap(nil, byte_size, 'read, write', 'private, anonymous, hugetlb') if not mem then @@ -179,7 +182,7 @@ local function calloc(t, count) if not mem then error("mmap failed: " .. tostring(err)) end end local ret = ffi.cast(ffi.typeof('$*', t), mem) - ffi.gc(ret, function (ptr) S.munmap(ptr, byte_size) end) + ffi.gc(ret, function (ptr) S.munmap(ptr, alloc_byte_size) end) return ret, byte_size end From 3461a095dde523ddc93aa7d0c7a6ec164cd11687 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 12 Jul 2021 11:32:05 +0000 Subject: [PATCH 104/209] apps.xdp: only run rxtx_match selftest if SNABB_XDP_NQUEUES=1 --- src/apps/xdp/xdp.lua | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 52c9a5606c..be05faf3de 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -746,8 +746,10 @@ function selftest () selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) print("test: duplex") selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + if nqueues == 1 then print("test: rxtx_match") selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + end if nqueues > 1 then print("test: share_interface") selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) From 00cac3632182e38e3eb907914f55e2c3ce5b7e8c Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 12 Jul 2021 11:32:24 +0000 Subject: [PATCH 105/209] apps.xdp: debug stop() --- src/apps/xdp/xdp.lua | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 
be05faf3de..711e8d733b 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -595,6 +595,7 @@ end -- Instance methods function XDP:stop () + print("STOP BEGIN", self.rxq, self.txq, packet.nfree()) -- Close socket. self.sock:close() -- Reclaim packet buffers left on rings. @@ -621,16 +622,29 @@ function XDP:stop () packet.free_internal(reclaim(self.cr)) self.txq = self.txq - 1 end + print("STOP FLUSH", self.rxq, self.txq, packet.nfree()) -- Then, we use the final rxq/txq tallies to infer how many packets on the -- transmit and fill rings are left dangling, and free those amounts of -- packets (starting from the most recently enqueued, going backwards) from -- each ring individually. - for _ = 1, self.txq do + for _ = 1, math.min(self.txq, xdp_ring_ndesc) do packet.free_internal(rewind_transmit(self.tx)) + self.txq = self.txq - 1 end - for _ = 1, self.rxq do + for _ = 1, math.min(self.rxq, xdp_ring_ndesc) do packet.free_internal(rewind_fill(self.fr)) + self.rxq = self.rxq - 1 + end + print("STOP REWIND", self.rxq, self.txq, packet.nfree()) + for _ = 1, self.rxq do + packet.free_internal(receive(self.rx)) + self.rxq = self.rxq - 1 + end + for _ = 1, self.txq do + packet.free_internal(reclaim(self.cr)) + self.txq = self.txq - 1 end + print("STOP UNACCOUNTED", self.rxq, self.txq, packet.nfree()) -- Unmap rings. 
assert(S.munmap(self.rx.map, self.rx.maplen)) assert(S.munmap(self.tx.map, self.tx.maplen)) @@ -747,8 +761,8 @@ function selftest () print("test: duplex") selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) if nqueues == 1 then - print("test: rxtx_match") - selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + print("test: rxtx_match") + selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) end if nqueues > 1 then print("test: share_interface") From 8d4ee134b911e69c1e2fad4e80fc91733607afed Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 12 Jul 2021 11:48:28 +0000 Subject: [PATCH 106/209] apps.xdp: mark stop() NYI / not supported --- src/apps/xdp/xdp.lua | 74 ++++++-------------------------------------- 1 file changed, 9 insertions(+), 65 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 711e8d733b..97677d5ff2 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -595,63 +595,7 @@ end -- Instance methods function XDP:stop () - print("STOP BEGIN", self.rxq, self.txq, packet.nfree()) - -- Close socket. - self.sock:close() - -- Reclaim packet buffers left on rings. - -- - -- Problem: we need a way to tell apart which packets buffers on the - -- (write-only) tx and fill rings need to be freed, and which packet buffers - -- were already enqueued to the (read-only) rx and completions rings. - -- Otherwise, we might cause memory corruption by double-freeing packets. - -- - -- We can not however reliably inspect the kernel's internal read cursors - -- for the tx and fill rings. Instead we solve this with a *hack* based on - -- the assumptions that 1) the kernel does not modify the rings after - -- closing the XDP socket; 2) the kernel moves packets from fill to rx rings - -- and tx to completion rings *in-order*; 3) the kernel does not clobber - -- descriptors that have not yet moved to an rx or completion ring. 
- -- - -- First we flush the rx and completion rings, freeing any dequeued packets, - -- while updating the rxq and txq tallies (see XDP:create_xsk()). - while not empty(self.rx) do - packet.free_internal(receive(self.rx)) - self.rxq = self.rxq - 1 - end - while not empty(self.cr) do - packet.free_internal(reclaim(self.cr)) - self.txq = self.txq - 1 - end - print("STOP FLUSH", self.rxq, self.txq, packet.nfree()) - -- Then, we use the final rxq/txq tallies to infer how many packets on the - -- transmit and fill rings are left dangling, and free those amounts of - -- packets (starting from the most recently enqueued, going backwards) from - -- each ring individually. - for _ = 1, math.min(self.txq, xdp_ring_ndesc) do - packet.free_internal(rewind_transmit(self.tx)) - self.txq = self.txq - 1 - end - for _ = 1, math.min(self.rxq, xdp_ring_ndesc) do - packet.free_internal(rewind_fill(self.fr)) - self.rxq = self.rxq - 1 - end - print("STOP REWIND", self.rxq, self.txq, packet.nfree()) - for _ = 1, self.rxq do - packet.free_internal(receive(self.rx)) - self.rxq = self.rxq - 1 - end - for _ = 1, self.txq do - packet.free_internal(reclaim(self.cr)) - self.txq = self.txq - 1 - end - print("STOP UNACCOUNTED", self.rxq, self.txq, packet.nfree()) - -- Unmap rings. - assert(S.munmap(self.rx.map, self.rx.maplen)) - assert(S.munmap(self.tx.map, self.tx.maplen)) - assert(S.munmap(self.fr.map, self.fr.maplen)) - assert(S.munmap(self.cr.map, self.cr.maplen)) - -- Close interface lockfd. See XDP:open_interface(). 
- self.lockfd:close() + error("Can not stop XDP driver (operation not supported)") end function XDP:pull () @@ -756,14 +700,14 @@ function selftest () end snabb_enable_xdp() engine.report_load() - print("test: rxtx") - selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) - print("test: duplex") - selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) if nqueues == 1 then print("test: rxtx_match") selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) end + print("test: rxtx") + selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + print("test: duplex") + selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) if nqueues > 1 then print("test: share_interface") selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) @@ -904,10 +848,10 @@ function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb) }) config.app(c, "npackets", npackets.Npackets, {npackets=1000}) config.app(c, "match", match.Match) - config.app(c, xdpdeva, XDP, {ifname=xdpdeva}) - config.app(c, xdpdevb, XDP, {ifname=xdpdevb}) - config.link(c, "source.output -> "..xdpdeva..".input") - config.link(c, xdpdevb..".output -> match.rx") + config.app(c, xdpdeva.."_q0", XDP, {ifname=xdpdeva}) + config.app(c, xdpdevb.."_q0", XDP, {ifname=xdpdevb}) + config.link(c, "source.output -> "..xdpdeva.."_q0.input") + config.link(c, xdpdevb.."_q0.output -> match.rx") config.link(c, "source.copy -> npackets.input") config.link(c, "npackets.output -> match.comparator") engine.configure(c) From 2bcc644bb3151a1cfed45127b6b5ba0b8eecc7e8 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 13 Jul 2021 15:20:46 +0000 Subject: [PATCH 107/209] lwaftr --xdp sketch --- src/program/lwaftr/run/README | 2 ++ src/program/lwaftr/run/run.lua | 11 +++++++++++ src/program/lwaftr/setup.lua | 21 +++++++++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/src/program/lwaftr/run/README b/src/program/lwaftr/run/README index 534b20e581..bb8dafeb1d 100644 --- 
a/src/program/lwaftr/run/README +++ b/src/program/lwaftr/run/README @@ -29,6 +29,8 @@ Optional arguments: -i, --virtio Interpret PCI addresses as referring to virtio-net interfaces instead of auto-detecting the appropriate driver. + --xdp Use Linux interfaces via XDP. + (Incompatible with --on-a-stick.) -r SIZE, --ring-buffer-size SIZE Set NIC receive buffer size. The default is driver-dependent. See diff --git a/src/program/lwaftr/run/run.lua b/src/program/lwaftr/run/run.lua index 04c6ead5f5..4a46e2ec41 100644 --- a/src/program/lwaftr/run/run.lua +++ b/src/program/lwaftr/run/run.lua @@ -56,6 +56,10 @@ function parse_args(args) function handlers.v () opts.verbosity = opts.verbosity + 1 end function handlers.t (arg) opts.trace = assert(arg) end function handlers.i () opts.virtio_net = true end + handlers['xdp'] = function(arg) + opts['xdp'] = true + scheduling.enable_xdp = {} -- XXX - maybe configure num_chunks here? + end function handlers.D (arg) opts.duration = assert(tonumber(arg), "duration must be a number") assert(opts.duration >= 0, "duration can't be negative") @@ -105,6 +109,7 @@ function parse_args(args) lib.dogetopt(args, handlers, "b:c:vD:yhir:n:t:", { conf = "c", name = "n", cpu = 1, v4 = 1, v6 = 1, ["on-a-stick"] = 1, virtio = "i", ["ring-buffer-size"] = "r", + ["xdp"] = 0, ["real-time"] = 0, mirror = 1, ["ingress-drop-monitor"] = 1, verbose = "v", trace = "t", ["bench-file"] = "b", ["profile"] = 0, duration = "D", hydra = "y", help = "h" }) @@ -153,6 +158,12 @@ function run(args) return setup_fn(graph, lwconfig, 'inetNic', 'b4sideNic') end + -- If --xdp has been specified, always use this. + if opts.xdp then + return setup.load_xdp(graph, lwconfig, 'inetNic', 'b4sideNic', + opts.ring_buffer_size) + end + -- If instance has external-interface.device configure as bump-in-the-wire -- otherwise configure it in on-a-stick mode. 
local device, id, queue = lwutil.parse_instance(lwconfig) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index b8bb51850c..1379e80afb 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -254,6 +254,27 @@ function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) link_sink(c, v4_nic_name..'.'..v4_info.rx, v6_nic_name..'.'..v6_info.rx) end +function load_xdp(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) + local v4_device, id, queue = lwutil.parse_instance(conf) + local v6_device = queue.external_interface.device + assert(lib.is_iface(v4_device), v4_nic_name..": "..v4_device.." is not a Linux interface") + assert(lib.is_iface(v6_device), v6_nic_name..": "..v6_device.." is not a Linux interface") + assert(not lwutil.is_on_a_stick(v4_device, queue), + "--xdp does not support on-a-stick configuration") + + lwaftr_app(c, conf) + + config.app(c, v4_nic_name, require("apps.xdp.xdp").driver, { + ifname=v4_device, + queue=id}) + config.app(c, v6_nic_name, require("apps.xdp.xdp").driver, { + ifname=v6_device, + queue=id}) + + link_source(c, v4_nic_name..'.output', v6_nic_name..'.output') + link_sink(c, v4_nic_name..'.input', v6_nic_name..'.input') +end + function load_on_a_stick_kernel_iface (c, conf, args) local RawSocket = require("apps.socket.raw").RawSocket local iface, id, queue = lwutil.parse_instance(conf) From dca300464830d7ded3403a6cea88a4f8b94ac7fe Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 13 Jul 2021 15:26:52 +0000 Subject: [PATCH 108/209] snabb-softwire-v2: add default for leaf error-rate-limiting/packets --- src/lib/yang/snabb-softwire-v2.yang | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index bd14a9e760..5942f682d2 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -11,6 +11,11 @@ module snabb-softwire-v2 { description "Configuration for the Snabb 
Switch lwAFTR."; + revision 2021-07-13 { + description + "Add default value for error-rate-limiting/packets."; + } + revision 2019-09-17 { description "Add discontinuity time to softwire-state."; @@ -508,6 +513,7 @@ module snabb-softwire-v2 { container error-rate-limiting { leaf packets { type uint32; + default 200; description "The number of ICMP error messages which can be sent within the specified time period."; From 8c2000b748ac6c7e4a81164c84b9dead0a102358 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 21 Jul 2021 09:52:16 +0000 Subject: [PATCH 109/209] apps/ipv4,6/reassemble,fragment: handle padded packets --- src/apps/ipv4/fragment.lua | 2 +- src/apps/ipv4/reassemble.lua | 2 +- src/apps/ipv6/fragment.lua | 2 +- src/apps/ipv6/reassemble.lua | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/apps/ipv4/fragment.lua b/src/apps/ipv4/fragment.lua index 3aced06e1d..4e07271bac 100644 --- a/src/apps/ipv4/fragment.lua +++ b/src/apps/ipv4/fragment.lua @@ -64,7 +64,7 @@ end local function ipv4_packet_has_valid_length(h, len) if len < ffi.sizeof(ether_ipv4_header_t) then return false end if ipv4_header_length(h.ipv4) < 20 then return false end - return ntohs(h.ipv4.total_length) == len - ether_header_len + return ntohs(h.ipv4.total_length) <= len - ether_header_len end Fragmenter = {} diff --git a/src/apps/ipv4/reassemble.lua b/src/apps/ipv4/reassemble.lua index 87b20b05b9..fa01f7b3be 100644 --- a/src/apps/ipv4/reassemble.lua +++ b/src/apps/ipv4/reassemble.lua @@ -75,7 +75,7 @@ local function ipv4_packet_has_valid_length(h, len) if len < ffi.sizeof(ether_ipv4_header_t) then return false end local ihl = bit.band(h.ipv4.version_and_ihl, ipv4_ihl_mask) if ihl < 5 then return false end - return ntohs(h.ipv4.total_length) == len - ether_header_len + return ntohs(h.ipv4.total_length) <= len - ether_header_len end -- IPv4 requires recalculating an embedded checksum. 
diff --git a/src/apps/ipv6/fragment.lua b/src/apps/ipv6/fragment.lua index ec8cc14e95..eedca13cf2 100644 --- a/src/apps/ipv6/fragment.lua +++ b/src/apps/ipv6/fragment.lua @@ -80,7 +80,7 @@ local fragment_header_ptr_t = ffi.typeof('$*', fragment_header_t) -- Precondition: packet already has IPv6 ethertype. local function ipv6_packet_has_valid_length(h, len) if len < ether_ipv6_header_len then return false end - return ntohs(h.ipv6.payload_length) == len - ether_ipv6_header_len + return ntohs(h.ipv6.payload_length) <= len - ether_ipv6_header_len end Fragmenter = {} diff --git a/src/apps/ipv6/reassemble.lua b/src/apps/ipv6/reassemble.lua index a3467c18b8..e2c418fe7f 100644 --- a/src/apps/ipv6/reassemble.lua +++ b/src/apps/ipv6/reassemble.lua @@ -90,7 +90,7 @@ local function ipv6_packet_has_valid_length(h, len) -- The minimum Ethernet frame size is 60 bytes (without FCS). Those -- frames may contain padding bytes. local payload_length = ntohs(h.ipv6.payload_length) - return payload_length <= 60 or payload_length == len - ether_ipv6_header_len + return payload_length <= len - ether_ipv6_header_len end local function swap(array, i, j) From 561b52a444266396e8f63414fa90d8fdbaa1bfe1 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 21 Jul 2021 09:59:21 +0000 Subject: [PATCH 110/209] lwaftr/setup: fix interfaces for bump-in-the-wire Interfaces used in bump-in-the-wire configurations were swapped, possibly during a prior refactor. Undo this. 
--- src/program/lwaftr/setup.lua | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 1379e80afb..3268fb3c37 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -206,8 +206,8 @@ end function load_kernel_iface (c, conf, v4_nic_name, v6_nic_name) local RawSocket = require("apps.socket.raw").RawSocket - local v4_iface, id, queue = lwutil.parse_instance(conf) - local v6_iface = queue.external_interface.dev_info + local v6_iface, id, queue = lwutil.parse_instance(conf) + local v4_iface = queue.external_interface.dev_info local dev_info = {rx = "rx", tx = "tx"} lwaftr_app(c, conf, v6_iface) @@ -220,8 +220,8 @@ function load_kernel_iface (c, conf, v4_nic_name, v6_nic_name) end function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) - local v4_pci, id, queue = lwutil.parse_instance(conf) - local v6_pci = queue.external_interface.device + local v6_pci, id, queue = lwutil.parse_instance(conf) + local v4_pci = queue.external_interface.device local v4_info = pci.device_info(v4_pci) local v6_info = pci.device_info(v6_pci) validate_pci_devices({v4_pci, v6_pci}) @@ -255,11 +255,11 @@ function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) end function load_xdp(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) - local v4_device, id, queue = lwutil.parse_instance(conf) - local v6_device = queue.external_interface.device + local v6_device, id, queue = lwutil.parse_instance(conf) + local v4_device = queue.external_interface.device assert(lib.is_iface(v4_device), v4_nic_name..": "..v4_device.." is not a Linux interface") assert(lib.is_iface(v6_device), v6_nic_name..": "..v6_device.." 
is not a Linux interface") - assert(not lwutil.is_on_a_stick(v4_device, queue), + assert(not lwutil.is_on_a_stick(v6_device, queue), "--xdp does not support on-a-stick configuration") lwaftr_app(c, conf) @@ -388,8 +388,8 @@ function load_on_a_stick(c, conf, args) end function load_virt(c, conf, v4_nic_name, v6_nic_name) - local v4_pci, id, queue = lwutil.parse_instance(conf) - local v6_pci = queue.external_device.device + local v6_pci, id, queue = lwutil.parse_instance(conf) + local v4_pci = queue.external_device.device lwaftr_app(c, conf, device) validate_pci_devices({v4_pci, v6_pci}) From 1cdc6352d48d51e7e53b7d50b91c4be43ca7e19f Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 21 Jul 2021 09:59:52 +0000 Subject: [PATCH 111/209] snabb-softwire-v2.yang: documentation edits --- src/lib/yang/snabb-softwire-v2.yang | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 5942f682d2..17239e71a1 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -678,12 +678,12 @@ module snabb-softwire-v2 { leaf ip { type inet:ipv6-address; description - "IPv4 address of the next hop for the internet-facing NIC. - The lwAFTR will resolve this to a MAC address using ARP."; + "IPv6 address of the next hop for the internal-facing NIC. 
+ The lwAFTR will resolve this to a MAC address using NDP."; } leaf resolved-mac { config false; - description "Resolved next-hop mac address found by ARP."; + description "Resolved next-hop mac address found by NDP."; type yang:mac-address; } } @@ -692,7 +692,7 @@ module snabb-softwire-v2 { type yang:mac-address; description "Statically configured MAC address of the next hop for the - internet-facing NIC."; + internal-facing NIC."; } } } From 6bcce09e31484880e022ef62a1b547e050f1931c Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 28 Feb 2020 17:30:56 +0100 Subject: [PATCH 112/209] lib.numa: gracefully handle failure to read CPU performance governor --- src/lib/numa.lua | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/lib/numa.lua b/src/lib/numa.lua index 55b510be24..04756c9f6e 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -158,12 +158,20 @@ local function assert_irqbalanced_disabled (warn) end end +local function read_cpu_performance_governor (cpu) + local path = '/sys/devices/system/cpu/cpu'..cpu..'/cpufreq/scaling_governor' + local f = io.open(path) + if not f then return "unknown" end + local gov = f:read() + f:close() + return gov +end + local function check_cpu_performance_tuning (cpu, strict) local warn = warn if strict then warn = die end assert_irqbalanced_disabled(warn) - local path = '/sys/devices/system/cpu/cpu'..cpu..'/cpufreq/scaling_governor' - local gov = assert(io.open(path)):read() + local gov = read_cpu_performance_governor(cpu) if not gov:match('performance') then warn('Expected performance scaling governor for CPU %s, but got "%s"', cpu, gov) From 191581a882185be405157596c6ac036d0dcfe3aa Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 26 Jul 2021 12:44:06 +0000 Subject: [PATCH 113/209] apps.lwaftr.ndp: send NS via solicited node multicast --- src/apps/lwaftr/ndp.lua | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git 
a/src/apps/lwaftr/ndp.lua b/src/apps/lwaftr/ndp.lua index 7e8b48f502..e35bc86485 100644 --- a/src/apps/lwaftr/ndp.lua +++ b/src/apps/lwaftr/ndp.lua @@ -220,18 +220,17 @@ local function make_na_packet(src_mac, dst_mac, src_ip, dst_ip, is_router) end -- Solicit a neighbor's address. -local function make_ns_packet(src_mac, src_ip, dst_ip) +local function make_ns_packet(src_mac, src_ip, dst_mac, dst_ip, target_ip) local message = ns_header_t() message.flags = 0 - message.target_ip = dst_ip + message.target_ip = target_ip local option = ether_option_header_t() option.header.type = option_source_link_layer_address option.header.length = 1 -- One 8-byte unit. option.addr = src_mac - local broadcast_mac = ethernet:pton("ff:ff:ff:ff:ff:ff") - return make_ndp_packet(src_mac, broadcast_mac, src_ip, dst_ip, icmpv6_ns, + return make_ndp_packet(src_mac, dst_mac, src_ip, dst_ip, icmpv6_ns, message, option) end @@ -298,6 +297,21 @@ function NDP:new(conf) assert(o.next_ip, 'NDP needs next-hop IPv6 address to learn next-hop MAC') self.ns_interval = 3 -- Send a new NS every three seconds. 
end + if o.next_ip then + -- Construct Solicited-Node multicast address + -- https://datatracker.ietf.org/doc/html/rfc4861#section-2.3 + o.solicited_node_mcast = ipv6:pton("ff02::1:ff00:0") -- /104 + o.solicited_node_mcast[13] = o.next_ip[13] + o.solicited_node_mcast[14] = o.next_ip[14] + o.solicited_node_mcast[15] = o.next_ip[15] + -- Construct Ethernet multicast address + -- https://datatracker.ietf.org/doc/html/rfc2464#section-7 + o.mac_mcast = ethernet:pton("33:33:00:00:00:00") + o.mac_mcast[2] = o.solicited_node_mcast[12] + o.mac_mcast[3] = o.solicited_node_mcast[13] + o.mac_mcast[4] = o.solicited_node_mcast[14] + o.mac_mcast[5] = o.solicited_node_mcast[15] + end return setmetatable(o, {__index=NDP}) end @@ -314,7 +328,9 @@ function NDP:maybe_send_ns_request (output) if self.next_ns_time <= engine.now() then self:ndp_resolving(self.next_ip) transmit(self.output.south, - make_ns_packet(self.self_mac, self.self_ip, self.next_ip)) + make_ns_packet(self.self_mac, self.self_ip, + self.mac_mcast, self.solicited_node_mcast, + self.next_ip)) self.next_ns_time = engine.now() + self.ns_interval end end @@ -500,7 +516,9 @@ function selftest() config.link(c, "sink2.tx -> nd2.north") config.link(c, "nd2.north -> sink2.rx") engine.configure(c) - engine.main({ duration = 0.1 }) + local breaths = counter.read(engine.breaths) + local function done() return counter.read(engine.breaths)-breaths > 1 end + engine.main({ done = done }) local function mac_eq(a, b) return ffi.C.memcmp(a, b, 6) == 0 end local nd1, nd2 = engine.app_table.nd1, engine.app_table.nd2 From 4817ae8228f8b17b62dffe9bf0df955486b42331 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 27 Jul 2021 10:56:54 +0000 Subject: [PATCH 114/209] lwaftr: support vlans in XDP mode --- src/program/lwaftr/setup.lua | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 3268fb3c37..87e9de8c13 100644 --- 
a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -271,8 +271,25 @@ function load_xdp(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) ifname=v6_device, queue=id}) - link_source(c, v4_nic_name..'.output', v6_nic_name..'.output') - link_sink(c, v4_nic_name..'.input', v6_nic_name..'.input') + local v4_src, v6_src = v4_nic_name..'.output', v6_nic_name..'.output' + local v4_sink, v6_sink = v4_nic_name..'.input', v6_nic_name..'.input' + + -- Linux removes VLAN tag, but we have to tag outgoing packets + if queue.external_interface.vlan_tag then + config.app(c, "tagv4", vlan.Tagger, + { tag=queue.external_interface.vlan_tag }) + config.link(c, "tagv4.output -> "..v4_sink) + v4_sink = "tagv4.input" + end + if queue.internal_interface.vlan_tag then + config.app(c, "tagv6", vlan.Tagger, + { tag=queue.internal_interface.vlan_tag }) + config.link(c, "tagv6.output -> "..v6_sink) + v6_sink = "tagv6.input" + end + + link_source(c, v4_src, v6_src) + link_sink(c, v4_sink, v6_sink) end function load_on_a_stick_kernel_iface (c, conf, args) From 004587b50d34a14534ecd2dfe6a28b0b45c74d09 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 29 Jul 2021 13:06:23 +0000 Subject: [PATCH 115/209] lwaftr: automate linux interface setup for XDP --- src/program/lwaftr/run/run.lua | 5 ++++ src/program/lwaftr/setup.lua | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/src/program/lwaftr/run/run.lua b/src/program/lwaftr/run/run.lua index 4a46e2ec41..97c04420f4 100644 --- a/src/program/lwaftr/run/run.lua +++ b/src/program/lwaftr/run/run.lua @@ -152,6 +152,11 @@ function run(args) -- anything defined in the config. if opts.name then conf.softwire_config.name = opts.name end + -- If we’re using XDP, setup interfaces here + if opts.xdp then + setup.xdp_ifsetup(conf) + end + local function setup_fn(graph, lwconfig) -- If --virtio has been specified, always use this. 
if opts.virtio_net then diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 87e9de8c13..9fd214c4ad 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -21,6 +21,7 @@ local vlan = require("apps.vlan.vlan") local pci = require("lib.hardware.pci") local cltable = require("lib.cltable") local ipv4 = require("lib.protocol.ipv4") +local ipv6 = require("lib.protocol.ipv6") local ethernet = require("lib.protocol.ethernet") local ipv4_ntop = require("lib.yang.util").ipv4_ntop local binary = require("lib.yang.binary") @@ -292,6 +293,50 @@ function load_xdp(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) link_sink(c, v4_sink, v6_sink) end +function xdp_ifsetup(conf) + for idevice, instance in pairs(conf.softwire_config.instance) do + local icfg, ecfg + local nqueues = 0 + for _, queue in pairs(instance.queue) do + nqueues = nqueues + 1 + if not icfg then icfg = queue.internal_interface + else assert(lib.equal(icfg, queue.internal_interface)) end + if not ecfg then ecfg = queue.external_interface + else assert(lib.equal(ecfg, queue.external_interface)) end + end + for qid in pairs(instance.queue) do + assert(qid < nqueues) + end + local function cmd(...) + local cmd + for _, part in ipairs({...}) do + if not cmd then cmd = part + else cmd = cmd.." 
"..part end + end + print("shell:", cmd) + assert(os.execute(cmd)) + end + local function ifsetup(ifname, cfg, opts, ip_ntop) + cmd('ip link set down', 'dev', ifname) + cmd('ip address flush', 'dev', ifname) + cmd('ip link set address', ethernet:ntop(cfg.mac), 'dev', ifname) + cmd('ip link set arp off', 'dev', ifname) + cmd('ip link set broadcast', "ff:ff:ff:ff:ff:ff", 'dev', ifname) + cmd('ip link set multicast on', 'dev', ifname) + cmd('ip link set mtu', opts.mtu, 'dev', ifname) + cmd('ip address add', ip_ntop(cfg.ip), 'dev', ifname) + cmd('ethtool --set-channels', ifname, 'combined', nqueues) + cmd('ip link set up', 'dev', ifname) + end + print("Configuring internal interface for XDP...") + ifsetup(idevice, icfg, conf.softwire_config.internal_interface, + function (ip) return ipv6:ntop(ip) end) + print("Configuring external interface for XDP...") + ifsetup(ecfg.device, ecfg, conf.softwire_config.external_interface, + ipv4_ntop) + end +end + function load_on_a_stick_kernel_iface (c, conf, args) local RawSocket = require("apps.socket.raw").RawSocket local iface, id, queue = lwutil.parse_instance(conf) From bce8579eb8ea0f2d8d4298b848952f3e1ec624e8 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 24 Aug 2021 09:09:08 +0000 Subject: [PATCH 116/209] snabb-softwire-v2: allow more than two queues --- src/lib/yang/snabb-softwire-v2.yang | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 17239e71a1..47cc5aaae1 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -583,7 +583,7 @@ module snabb-softwire-v2 { key "id"; leaf id { - type uint8 { range 0..1; } + type uint8; description "RSS queue on which to attach. 
Traffic will be partitioned evenly between instances servicing queues on the same @@ -591,12 +591,7 @@ module snabb-softwire-v2 { is a function of the TCP or UDP source and destination ports (if any) and the source and destination IPv4 or IPv6 addresses. Fragmented packets will be delivered to the - lowest-numbered queue. - - Note that currently the lwAFTR is restricted to running at - most 2 RSS workers per device. This limitation may be lifted - to 4 soon. Raising it farther is possible but needs changes - to how the lwAFTR uses its PCI devices."; + lowest-numbered queue."; } container external-interface { From a86c61e17b9f1cb9c4d219c7059bf6b0c82dfd84 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 30 Aug 2021 13:04:02 +0200 Subject: [PATCH 117/209] connectex.lua: extend RSS hashing to non-TCP/UDP IP packets --- src/apps/mellanox/connectx.lua | 63 ++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index fdfed64f90..ac1db83375 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -352,18 +352,20 @@ function ConnectX:new (conf) end else -- Set up RSS accross all queues. Hashing is only performed for - -- IPv4/IPv6 and TCP/UDP, i.e. non-IP packets as well as non - -- TCP/UDP packets are mapped to Queue #1. Hashing is done by - -- the TIR for a specific combination of protocols, hence - -- separate flows are needed to provide each TIR with the - -- appropriate types of packets. + -- IPv4/IPv6 with or without TCP/UDP. All non-IP packets are + -- mapped to Queue #1. Hashing is done by the TIR for a + -- specific combination of header values, hence separate flows + -- are needed to provide each TIR with the appropriate types of + -- packets. 
local l3_protos = { 'v4', 'v6' } local l4_protos = { 'udp', 'tcp' } local rqt = hca:create_rqt(rqlist) - local flow_group_ip = - hca:create_flow_group_ip(rxtable, NIC_RX, 0, - #l3_protos * #l4_protos - 1) local index = 0 + + -- Match TCP/UDP packets + local flow_group_ip = + hca:create_flow_group_ip(rxtable, NIC_RX, index, + index + #l3_protos * #l4_protos - 1) for _, l3_proto in ipairs(l3_protos) do for _, l4_proto in ipairs(l4_protos) do local tir = hca:create_tir_indirect(rqt, tdomain, @@ -379,6 +381,17 @@ function ConnectX:new (conf) end end + -- Fall-through for non-TCP/UDP IP packets + local flow_group_ip_l3 = + hca:create_flow_group_ip(rxtable, NIC_RX, index, index + #l3_protos - 1, + "l3-only") + for _, l3_proto in ipairs(l3_protos) do + local tir = hca:create_tir_indirect(rqt, tdomain, l3_proto, nil) + hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip_l3, + index, tir, l3_proto, nil) + index = index + 1 + end + local flow_group_wildcard = hca:create_flow_group_wildcard(rxtable, NIC_RX, index, index) local tir_q1 = hca:create_tir_direct(rqlist[1], tdomain) @@ -925,8 +938,8 @@ function HCA:create_tir_direct (rqn, transport_domain) return self:output(0x08, 23, 0) end --- Create a TIR with indirect dispatching (hashing) for a particular --- combination of IP protocol and TCP/UDP ports. +-- Create a TIR with indirect dispatching (hashing) based on IPv4/IPv6 +-- addresses and optionally TCP/UDP ports. 
function HCA:create_tir_indirect (rqt, transport_domain, l3_proto, l4_proto) local l3_protos = { v4 = 0, @@ -937,7 +950,6 @@ function HCA:create_tir_indirect (rqt, transport_domain, l3_proto, l4_proto) udp = 1 } local l3_proto = assert(l3_protos[l3_proto or 'v4'], "invalid l3 proto") - local l4_proto = assert(l4_protos[l4_proto or 'tcp'], "invalid l4 proto") self:command("CREATE_TIR", 0x10C, 0x0C) :input("opcode", 0x00, 31, 16, 0x900) :input("disp_type", 0x20 + 0x04, 31, 28, 1) -- indirect @@ -949,8 +961,13 @@ function HCA:create_tir_indirect (rqt, transport_domain, l3_proto, l4_proto) :input("rx_hash_fn", 0x20 + 0x24, 31, 28, 2) -- toeplitz :input("transport_domain", 0x20 + 0x24, 23, 0, transport_domain) :input("l3_prot_type", 0x20 + 0x50, 31, 31, l3_proto) - :input("l4_prot_type", 0x20 + 0x50, 30, 30, l4_proto) - :input("selected_fields", 0x20 + 0x50, 29, 0, 15) -- SRC/DST/SPORT/DPORT + if l4_proto == nil then + self:input("selected_fields", 0x20 + 0x50, 29, 0, 3) -- SRC/DST + else + l4_proto = assert(l4_protos[l4_proto or 'tcp'], "invalid l4 proto") + self:input("l4_prot_type", 0x20 + 0x50, 30, 30, l4_proto) + :input("selected_fields", 0x20 + 0x50, 29, 0, 15) -- SRC/DST/SPORT/DPORT + end -- XXX Is random hash key a good solution? for i = 0x28, 0x4C, 4 do self:input("toeplitz_key["..((i-0x28)/4).."]", 0x20 + i, 31, 0, math.random(2^32)) @@ -1429,8 +1446,8 @@ function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, :execute() end --- Create a flow group that inspects the ethertype and protocol fields. -function HCA:create_flow_group_ip (table_id, table_type, start_ix, end_ix) +-- Create a flow group that inspects the ethertype and optionally protocol fields. 
+function HCA:create_flow_group_ip (table_id, table_type, start_ix, end_ix, l3_only) self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) :input("opcode", 0x00, 31, 16, 0x933) :input("table_type", 0x10, 31, 24, table_type) @@ -1439,14 +1456,16 @@ function HCA:create_flow_group_ip (table_id, table_type, start_ix, end_ix) :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) :input("match_criteria_enable", 0x3C, 7, 0, 1) -- match outer headers :input("match_ether", 0x40 + 0x04, 15, 0, 0xFFFF) - :input("match_proto", 0x40 + 0x10, 31, 24, 0xFF) - :execute() + if l3_only == nil then + self:input("match_proto", 0x40 + 0x10, 31, 24, 0xFF) + end + self:execute() local group_id = self:output(0x08, 23, 0) return group_id end -- Set a flow table entry that matches on the ethertype for IPv4/IPv6 --- as well as TCP/UDP protocol/next-header. +-- as well as optionally on TCP/UDP protocol/next-header. function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, flow_index, tir, l3_proto, l4_proto) local ethertypes = { @@ -1458,7 +1477,6 @@ function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, tcp = 6 } local type = assert(ethertypes[l3_proto], "invalid l3 proto") - local proto = assert(l4_protos[l4_proto], "invalid l4 proto") self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1469,8 +1487,11 @@ function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size :input("match_ether", 0x40 + 0x40 + 0x04, 15, 0, type) - :input("match_proto", 0x40 + 0x40 + 0x10, 31, 24, proto) - :input("dest_type", 0x40 + 0x300, 31, 24, 2) -- TIR + if l4_proto ~= nil then + proto = assert(l4_protos[l4_proto], "invalid l4 proto") + self:input("match_proto", 0x40 + 0x40 + 0x10, 31, 24, proto) + end + self:input("dest_type", 0x40 + 0x300, 31, 24, 2) -- TIR 
:input("dest_id", 0x40 + 0x300, 23, 0, tir) :execute() end From 909dca7a1d4181870a06592e58219e1535bf191e Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 24 Aug 2021 11:07:52 +0000 Subject: [PATCH 118/209] lib.protocol.ethernet: add ptoi used by connectx driver --- src/lib/protocol/ethernet.lua | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/lib/protocol/ethernet.lua b/src/lib/protocol/ethernet.lua index c874f35acb..f14f43e5db 100644 --- a/src/lib/protocol/ethernet.lua +++ b/src/lib/protocol/ethernet.lua @@ -66,6 +66,11 @@ function ethernet:ntop (n) return table.concat(p, ":") end +-- Convert printable address to integer +function ethernet:ptoi (p) + return tonumber(ffi.cast("uint64_t *", ethernet:pton(p))[0]) +end + -- Mapping of an IPv6 multicast address to a MAC address per RFC2464, -- section 7 function ethernet:ipv6_mcast(ip) From 6bca98b8c0c4a4376f379ff82a787212d081ac9e Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 24 Aug 2021 11:08:46 +0000 Subject: [PATCH 119/209] apps.mellanox.connectx: fixup lib.hardware.pci usage --- src/apps/mellanox/connectx.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 964596d6a3..858bf47fb4 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -1124,7 +1124,8 @@ function IO:new (conf) -- This is also done in Connectex4:new() but might not have -- happened yet. 
pci.unbind_device_from_linux(pciaddress) - local mmio, fd = pci.map_pci_memory(pciaddress, 0, false) + local fd = pci.open_pci_resource_unlocked(pciaddress, 0) + local mmio = pci.map_pci_memory(fd) local online = false -- True when queue is up and running local cxq -- shm object containing queue control information From 832ea9dfabd5f6821749a3fda2786fb6d1107c2b Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 24 Aug 2021 11:09:50 +0000 Subject: [PATCH 120/209] apps.mellanox.connectx_test: traceprof is now part of RaptorJIT --- src/apps/mellanox/connectx_test.lua | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index ee98edecf3..8bb20895f4 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -105,7 +105,8 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs local start = engine.now() local remaining = npackets - require("lib.traceprof.traceprof").start() + engine.vmprofile_enabled = true + engine.setvmprofile("connectx") while remaining > 0 do -- Send packets for id, _ in pairs(io0) do @@ -125,7 +126,7 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs for id, app in pairs(io1) do app:pull() app:push() dump(pci1, id, app) end -- Simulate breathing end - require("lib.traceprof.traceprof").stop() + engine.setvmprofile("engine") -- Receive any last packets C.usleep(100) for i = 1, 10 do From d58ce744c5c68bba3d8872f9075cca2062a6bb17 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 24 Aug 2021 11:10:27 +0000 Subject: [PATCH 121/209] apps.mellanox.connectx_test: get PCI addresses from environment --- src/apps/mellanox/connectx_test.lua | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 8bb20895f4..3ac816f66a 100644 --- a/src/apps/mellanox/connectx_test.lua 
+++ b/src/apps/mellanox/connectx_test.lua @@ -176,6 +176,12 @@ function between (min, max) end function selftest () - switch("02:00.0", "03:00.0", 10e6, 1, 60, 1500, 100, 100, 4, 4, 1) + local pci0 = os.getenv("SNABB_PCI_CONNECTX0") + local pci1 = os.getenv("SNABB_PCI_CONNECTX1") + if not (pci0 and pci1) then + print("SNABB_PCI_CONNECTX0 and SNABB_PCI_CONNECTX1 must be set. Skipping selftest.") + os.exit(engine.test_skipped_code) + end + switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 4, 4, 1) end From 6036de6f707e01e2141c23050ebe9fcf497589c4 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 25 Aug 2021 09:49:39 +0000 Subject: [PATCH 122/209] lib.protocol.etherenet: do endian conversion in ptoi --- src/lib/protocol/ethernet.lua | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/lib/protocol/ethernet.lua b/src/lib/protocol/ethernet.lua index f14f43e5db..ad8e4dd2db 100644 --- a/src/lib/protocol/ethernet.lua +++ b/src/lib/protocol/ethernet.lua @@ -68,7 +68,14 @@ end -- Convert printable address to integer function ethernet:ptoi (p) - return tonumber(ffi.cast("uint64_t *", ethernet:pton(p))[0]) + local n = ethernet:pton(p) + assert(ffi.abi("le")) + return bit.bor(bit.lshift(n[0], 40), + bit.lshift(n[1], 32), + bit.lshift(n[2], 24), + bit.lshift(n[3], 16), + bit.lshift(n[4], 8), + bit.lshift(n[5], 0)) end -- Mapping of an IPv6 multicast address to a MAC address per RFC2464, From bf84158a33cfb96f02e0ce5178d4386f9122b44c Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 25 Aug 2021 10:03:18 +0000 Subject: [PATCH 123/209] apps.connectx: selftest fixups --- src/apps/mellanox/connectx.lua | 24 ++++++++++++------------ src/apps/mellanox/connectx_test.lua | 6 +++--- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 858bf47fb4..f1d854426b 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -2235,7 +2235,7 @@ function 
selftest () for i = 1, each do local p = packet.allocate() ffi.fill(p.data, octets, 0) -- zero packet - local header = lib.hexundump("000000000001 000000000002 0800", 16) + local header = lib.hexundump("000000000001 000000000002 0800", 14) ffi.copy(p.data, header, #header) p.data[12] = 0x08 -- ethertype = 0x0800 p.length = octets @@ -2258,17 +2258,17 @@ function selftest () print("recv0", tonumber(counter.read(o0.stats.txpackets)), tonumber(counter.read(o0.stats.txbytes)), tonumber(counter.read(o0.stats.txdrop))) print("recv1", tonumber(counter.read(o1.stats.txpackets)), tonumber(counter.read(o1.stats.txbytes)), tonumber(counter.read(o1.stats.txdrop))) - print("payload snippets of first 5 packets") - print("port0") - for i = 1, 5 do - local p = link.receive(o0) - if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end - end - print("port1") - for i = 1, 5 do - local p = link.receive(o1) - if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end - end + -- print("payload snippets of first 5 packets") + -- print("port0") + -- for i = 1, 5 do + -- local p = link.receive(o0) + -- if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end + -- end + -- print("port1") + -- for i = 1, 5 do + -- local p = link.receive(o1) + -- if p then print(p.length, lib.hexdump(ffi.string(p.data, math.min(32, p.length)))) end + -- end print() print(("%-16s %20s %20s"):format("hardware counter", pcidev0, pcidev1)) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 3ac816f66a..331b3bef52 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -176,10 +176,10 @@ function between (min, max) end function selftest () - local pci0 = os.getenv("SNABB_PCI_CONNECTX0") - local pci1 = os.getenv("SNABB_PCI_CONNECTX1") + local pci0 = os.getenv("SNABB_PCI_CONNECTX_0") + local pci1 = os.getenv("SNABB_PCI_CONNECTX_1") if not (pci0 and 
pci1) then - print("SNABB_PCI_CONNECTX0 and SNABB_PCI_CONNECTX1 must be set. Skipping selftest.") + print("SNABB_PCI_CONNECTX_0 and SNABB_PCI_CONNECTX_1 must be set. Skipping selftest.") os.exit(engine.test_skipped_code) end switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 4, 4, 1) From fc7dfd291926ea8282d3f2112f86b33f774e1a7a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 25 Aug 2021 16:36:51 +0000 Subject: [PATCH 124/209] apps.mellanox.connectx: support RSS + MAC/VLAN --- src/apps/mellanox/connectx.lua | 126 ++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 33 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index f1d854426b..e4825e1f4f 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -211,14 +211,6 @@ function ConnectX:new (conf) local sendq_size = conf.sendq_size or 1024 local recvq_size = conf.recvq_size or 1024 - -- XXX Config says whether to setup queues with MAC+VLAN - -- dispatching ("VMDq") or to simply hash uniformly over them ("RSS"). - -- - -- To be replaced with a more generic algorithm that looks at the - -- configurations of the individual ports and creates an - -- appropriate flow table. - local macvlan = conf.macvlan - local mtu = conf.mtu or 9500 -- Perform a hard reset of the device to bring it into a blank state. @@ -280,8 +272,16 @@ function ConnectX:new (conf) -- List of queue counter IDs (ConnectX5 and up) local counter_set_ids = {} + -- Enable MAC/VLAN switching? + local usemac = false local usevlan = false + -- Sets of MACs and VLANs + local macs, vlans = {}, {} + + -- Lists of receive queues by macvlan + local macvlan_rqlist = {} + for _, queue in ipairs(conf.queues) do -- Create a shared memory object for controlling the queue pair local cxq = shm.create("group/pci/"..pciaddress.."/"..queue.id, cxq_t) @@ -334,24 +334,42 @@ function ConnectX:new (conf) -- CXQ is now fully initialized & ready for attach. 
assert(sync.cas(cxq.state, INIT, FREE)) + usemac = usemac or (queue.mac ~= nil) usevlan = usevlan or (queue.vlan ~= nil) -- XXX collect for flow table construction rqs[queue.id] = cxq.rqn rqlist[#rqlist+1] = cxq.rqn end - - local rxtable = hca:create_root_flow_table(NIC_RX) - local rule = 0 - if macvlan then - local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, #conf.queues-1, usevlan) + + if usemac then + -- Collect macs, and vlans for _, queue in ipairs(conf.queues) do - local tir = hca:create_tir_direct(rqs[queue.id], tdomain) - hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, rule, tir, - ethernet:ptoi(queue.mac), queue.vlan) - rule = rule + 1 + assert(queue.mac, "Queue does not specifiy MAC: "..queue.id) + macs[queue.mac] = true + if usevlan then + assert(queue.vlan, "Queue does not specify a VLAN: "..queue.id) + vlans[queue.vlan] = true + else + vlans[false] = true + end end - else + + -- Collect macvlan_rqlist for flow table construction + for mac in pairs(macs) do + macvlan_rqlist[mac] = {} + for vlan in pairs(vlans) do + macvlan_rqlist[mac][vlan] = {} + for _, queue in ipairs(conf.queues) do + if queue.mac == mac and (queue.vlan or false) == vlan then + table.insert(macvlan_rqlist[mac][vlan], rqs[queue.id]) + end + end + end + end + end + + local function setup_rss_rxtable (rqlist, tdomain, level) -- Set up RSS accross all queues. Hashing is only performed for -- IPv4/IPv6 and TCP/UDP, i.e. non-IP packets as well as non -- TCP/UDP packets are mapped to Queue #1. Hashing is done by @@ -360,6 +378,8 @@ function ConnectX:new (conf) -- appropriate types of packets. 
local l3_protos = { 'v4', 'v6' } local l4_protos = { 'udp', 'tcp' } + local rxtable = hca:create_flow_table(NIC_RX, level, + #l3_protos * #l4_protos + 1) local rqt = hca:create_rqt(rqlist) local flow_group_ip = hca:create_flow_group_ip(rxtable, NIC_RX, 0, @@ -375,7 +395,7 @@ function ConnectX:new (conf) -- If the header is incomplete, the packet will fall through -- to the wildcard match and end up in the first queue. hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip, - index, tir, l3_proto, l4_proto) + index, TIR, tir, l3_proto, l4_proto) index = index + 1 end end @@ -384,9 +404,44 @@ function ConnectX:new (conf) hca:create_flow_group_wildcard(rxtable, NIC_RX, index, index) local tir_q1 = hca:create_tir_direct(rqlist[1], tdomain) hca:set_flow_table_entry_wildcard(rxtable, NIC_RX, - flow_group_wildcard, index, tir_q1) + flow_group_wildcard, index, TIR, tir_q1) + return rxtable + end + + local function setup_macvlan_rxtable (macvlan_rqlist, usevlan, tdomain, level) + -- Set up RSS across multiple queues with matching MAC+VLAN. + -- See notes on RSS in setup_rss_rxtable above. 
+ local num_entries = 0 + for mac in pairs(macvlan_rqlist) do + for vlan, rqlist in pairs(macvlan_rqlist[mac]) do + if #rqlist > 0 then + num_entries = num_entries + 1 + end + end + end + local rxtable = hca:create_flow_table(NIC_RX, level or 0, num_entries) + local index = 0 + local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, num_entries-1, usevlan) + for mac in pairs(macvlan_rqlist) do + for vlan, rqlist in pairs(macvlan_rqlist[mac]) do + if #rqlist > 0 then + local tid = setup_rss_rxtable(rqlist, tdomain, 1) + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, index, + FLOW_TABLE, tid, ethernet:ptoi(mac), vlan) + index = index + 1 + end + end + end + return rxtable + end + + if usemac then + local rxtable = setup_macvlan_rxtable(macvlan_rqlist, usevlan, tdomain, 0) + hca:set_flow_table_root(rxtable, NIC_RX) + else + local rxtable = setup_rss_rxtable(rqlist, tdomain, 0) + hca:set_flow_table_root(rxtable, NIC_RX) end - hca:set_flow_table_root(rxtable, NIC_RX) self.shm = { mtu = {counter, mtu}, @@ -1380,12 +1435,16 @@ end NIC_RX = 0 -- Flow table type code for incoming packets NIC_TX = 1 -- Flow table type code for outgoing packets --- Create the root flow table. -function HCA:create_root_flow_table (table_type) +FLOW_TABLE = 1 -- Flow table entry destination_type for FLOW_TABLE +TIR = 2 -- Flow table entry destination_type for TIR + +-- Create a flow table. +function HCA:create_flow_table (table_type, level, size) self:command("CREATE_FLOW_TABLE", 0x3C, 0x0C) :input("opcode", 0x00, 31, 16, 0x930) :input("table_type", 0x10, 31, 24, table_type) - :input("log_size", 0x18 + 0x00, 7, 0, 10) -- XXX make parameter + :input("level", 0x18 + 0x00, 23, 16, level or 0) + :input("log_size", 0x18 + 0x00, 7, 0, math.ceil(math.log(size or 1024, 2))) :execute() local table_id = self:output(0x08, 23, 0) return table_id @@ -1416,7 +1475,7 @@ end -- Set a "wildcard" flow table entry that does not match on any fields. 
function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, - flow_index, tir) + flow_index, dest_type, dest_id) self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1426,8 +1485,8 @@ function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, :input("group_id", 0x40 + 0x04, 31, 0, group_id) :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size - :input("dest_type", 0x40 + 0x300, 31, 24, 2) - :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) :execute() end @@ -1450,7 +1509,7 @@ end -- Set a flow table entry that matches on the ethertype for IPv4/IPv6 -- as well as TCP/UDP protocol/next-header. function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, - flow_index, tir, l3_proto, l4_proto) + flow_index, dest_type, dest_id, l3_proto, l4_proto) local ethertypes = { v4 = 0x0800, v6 = 0x86dd @@ -1472,8 +1531,8 @@ function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size :input("match_ether", 0x40 + 0x40 + 0x04, 15, 0, type) :input("match_proto", 0x40 + 0x40 + 0x10, 31, 24, proto) - :input("dest_type", 0x40 + 0x300, 31, 24, 2) -- TIR - :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) :execute() end @@ -1497,7 +1556,8 @@ function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, end -- Set a DMAC+VLAN flow table rule. 
-function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, flow_index, tir, dmac, vlanid) +function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, + flow_index, dest_type, dest_id, dmac, vlanid) self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1510,8 +1570,8 @@ function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, flow_ :input("dmac0", 0x40 + 0x48, 31, 0, math.floor(dmac/2^16)) :input("dmac1", 0x40 + 0x4C, 31, 16, band(dmac, 0xFFFF)) :input("vlan", 0x40 + 0x4C, 11, 0, vlanid or 0) - :input("dest_type", 0x40 + 0x300, 31, 24, 2) - :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) :execute() end From 55ea547d0ca18d2db9cf7374073085389d2ae5e4 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 25 Aug 2021 16:37:33 +0000 Subject: [PATCH 125/209] Revert "apps.mellanox.connectx: support RSS + MAC/VLAN" This reverts commit 6a357a7a71c4a20f3587c22cbd0c83dcb03fe0f6. --- src/apps/mellanox/connectx.lua | 126 +++++++++------------------------ 1 file changed, 33 insertions(+), 93 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index e4825e1f4f..f1d854426b 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -211,6 +211,14 @@ function ConnectX:new (conf) local sendq_size = conf.sendq_size or 1024 local recvq_size = conf.recvq_size or 1024 + -- XXX Config says whether to setup queues with MAC+VLAN + -- dispatching ("VMDq") or to simply hash uniformly over them ("RSS"). + -- + -- To be replaced with a more generic algorithm that looks at the + -- configurations of the individual ports and creates an + -- appropriate flow table. + local macvlan = conf.macvlan + local mtu = conf.mtu or 9500 -- Perform a hard reset of the device to bring it into a blank state. 
@@ -272,16 +280,8 @@ function ConnectX:new (conf) -- List of queue counter IDs (ConnectX5 and up) local counter_set_ids = {} - -- Enable MAC/VLAN switching? - local usemac = false local usevlan = false - -- Sets of MACs and VLANs - local macs, vlans = {}, {} - - -- Lists of receive queues by macvlan - local macvlan_rqlist = {} - for _, queue in ipairs(conf.queues) do -- Create a shared memory object for controlling the queue pair local cxq = shm.create("group/pci/"..pciaddress.."/"..queue.id, cxq_t) @@ -334,42 +334,24 @@ function ConnectX:new (conf) -- CXQ is now fully initialized & ready for attach. assert(sync.cas(cxq.state, INIT, FREE)) - usemac = usemac or (queue.mac ~= nil) usevlan = usevlan or (queue.vlan ~= nil) -- XXX collect for flow table construction rqs[queue.id] = cxq.rqn rqlist[#rqlist+1] = cxq.rqn end - - if usemac then - -- Collect macs, and vlans - for _, queue in ipairs(conf.queues) do - assert(queue.mac, "Queue does not specifiy MAC: "..queue.id) - macs[queue.mac] = true - if usevlan then - assert(queue.vlan, "Queue does not specify a VLAN: "..queue.id) - vlans[queue.vlan] = true - else - vlans[false] = true - end - end - -- Collect macvlan_rqlist for flow table construction - for mac in pairs(macs) do - macvlan_rqlist[mac] = {} - for vlan in pairs(vlans) do - macvlan_rqlist[mac][vlan] = {} - for _, queue in ipairs(conf.queues) do - if queue.mac == mac and (queue.vlan or false) == vlan then - table.insert(macvlan_rqlist[mac][vlan], rqs[queue.id]) - end - end - end + local rxtable = hca:create_root_flow_table(NIC_RX) + local rule = 0 + if macvlan then + local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, #conf.queues-1, usevlan) + for _, queue in ipairs(conf.queues) do + local tir = hca:create_tir_direct(rqs[queue.id], tdomain) + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, rule, tir, + ethernet:ptoi(queue.mac), queue.vlan) + rule = rule + 1 end - end - - local function setup_rss_rxtable (rqlist, tdomain, 
level) + else -- Set up RSS accross all queues. Hashing is only performed for -- IPv4/IPv6 and TCP/UDP, i.e. non-IP packets as well as non -- TCP/UDP packets are mapped to Queue #1. Hashing is done by @@ -378,8 +360,6 @@ function ConnectX:new (conf) -- appropriate types of packets. local l3_protos = { 'v4', 'v6' } local l4_protos = { 'udp', 'tcp' } - local rxtable = hca:create_flow_table(NIC_RX, level, - #l3_protos * #l4_protos + 1) local rqt = hca:create_rqt(rqlist) local flow_group_ip = hca:create_flow_group_ip(rxtable, NIC_RX, 0, @@ -395,7 +375,7 @@ function ConnectX:new (conf) -- If the header is incomplete, the packet will fall through -- to the wildcard match and end up in the first queue. hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip, - index, TIR, tir, l3_proto, l4_proto) + index, tir, l3_proto, l4_proto) index = index + 1 end end @@ -404,44 +384,9 @@ function ConnectX:new (conf) hca:create_flow_group_wildcard(rxtable, NIC_RX, index, index) local tir_q1 = hca:create_tir_direct(rqlist[1], tdomain) hca:set_flow_table_entry_wildcard(rxtable, NIC_RX, - flow_group_wildcard, index, TIR, tir_q1) - return rxtable - end - - local function setup_macvlan_rxtable (macvlan_rqlist, usevlan, tdomain, level) - -- Set up RSS across multiple queues with matching MAC+VLAN. - -- See notes on RSS in setup_rss_rxtable above. 
- local num_entries = 0 - for mac in pairs(macvlan_rqlist) do - for vlan, rqlist in pairs(macvlan_rqlist[mac]) do - if #rqlist > 0 then - num_entries = num_entries + 1 - end - end - end - local rxtable = hca:create_flow_table(NIC_RX, level or 0, num_entries) - local index = 0 - local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, num_entries-1, usevlan) - for mac in pairs(macvlan_rqlist) do - for vlan, rqlist in pairs(macvlan_rqlist[mac]) do - if #rqlist > 0 then - local tid = setup_rss_rxtable(rqlist, tdomain, 1) - hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, index, - FLOW_TABLE, tid, ethernet:ptoi(mac), vlan) - index = index + 1 - end - end - end - return rxtable - end - - if usemac then - local rxtable = setup_macvlan_rxtable(macvlan_rqlist, usevlan, tdomain, 0) - hca:set_flow_table_root(rxtable, NIC_RX) - else - local rxtable = setup_rss_rxtable(rqlist, tdomain, 0) - hca:set_flow_table_root(rxtable, NIC_RX) + flow_group_wildcard, index, tir_q1) end + hca:set_flow_table_root(rxtable, NIC_RX) self.shm = { mtu = {counter, mtu}, @@ -1435,16 +1380,12 @@ end NIC_RX = 0 -- Flow table type code for incoming packets NIC_TX = 1 -- Flow table type code for outgoing packets -FLOW_TABLE = 1 -- Flow table entry destination_type for FLOW_TABLE -TIR = 2 -- Flow table entry destination_type for TIR - --- Create a flow table. -function HCA:create_flow_table (table_type, level, size) +-- Create the root flow table. 
+function HCA:create_root_flow_table (table_type) self:command("CREATE_FLOW_TABLE", 0x3C, 0x0C) :input("opcode", 0x00, 31, 16, 0x930) :input("table_type", 0x10, 31, 24, table_type) - :input("level", 0x18 + 0x00, 23, 16, level or 0) - :input("log_size", 0x18 + 0x00, 7, 0, math.ceil(math.log(size or 1024, 2))) + :input("log_size", 0x18 + 0x00, 7, 0, 10) -- XXX make parameter :execute() local table_id = self:output(0x08, 23, 0) return table_id @@ -1475,7 +1416,7 @@ end -- Set a "wildcard" flow table entry that does not match on any fields. function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, - flow_index, dest_type, dest_id) + flow_index, tir) self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1485,8 +1426,8 @@ function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, :input("group_id", 0x40 + 0x04, 31, 0, group_id) :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size - :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) - :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) + :input("dest_type", 0x40 + 0x300, 31, 24, 2) + :input("dest_id", 0x40 + 0x300, 23, 0, tir) :execute() end @@ -1509,7 +1450,7 @@ end -- Set a flow table entry that matches on the ethertype for IPv4/IPv6 -- as well as TCP/UDP protocol/next-header. 
function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, - flow_index, dest_type, dest_id, l3_proto, l4_proto) + flow_index, tir, l3_proto, l4_proto) local ethertypes = { v4 = 0x0800, v6 = 0x86dd @@ -1531,8 +1472,8 @@ function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size :input("match_ether", 0x40 + 0x40 + 0x04, 15, 0, type) :input("match_proto", 0x40 + 0x40 + 0x10, 31, 24, proto) - :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) - :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) + :input("dest_type", 0x40 + 0x300, 31, 24, 2) -- TIR + :input("dest_id", 0x40 + 0x300, 23, 0, tir) :execute() end @@ -1556,8 +1497,7 @@ function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, end -- Set a DMAC+VLAN flow table rule. -function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, - flow_index, dest_type, dest_id, dmac, vlanid) +function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, flow_index, tir, dmac, vlanid) self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1570,8 +1510,8 @@ function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, :input("dmac0", 0x40 + 0x48, 31, 0, math.floor(dmac/2^16)) :input("dmac1", 0x40 + 0x4C, 31, 16, band(dmac, 0xFFFF)) :input("vlan", 0x40 + 0x4C, 11, 0, vlanid or 0) - :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) - :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) + :input("dest_type", 0x40 + 0x300, 31, 24, 2) + :input("dest_id", 0x40 + 0x300, 23, 0, tir) :execute() end From 680355e2b0717d7ddc4d81c03692bfa29fc4d70b Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 27 Aug 2021 13:00:49 +0000 Subject: [PATCH 126/209] apps.mellanox.connext: fix data segment address translation in sq:transmit This is a bug where the physical addresses wider that 53 bits of payloads 
inserted into descriptors for DMA are truncated. The fix here is to truncate after masking. Probably better would be to use lib.htonl instead of bswap(tonumber(...)) throughout the driver. --- src/apps/mellanox/connectx.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index f1d854426b..4942fc14ae 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -1343,8 +1343,8 @@ function SQ:new (cxq, mmio) wqe.u32[12] = bswap(p.length - ninline) wqe.u32[13] = bswap(cxq.rlkey) local phy = memory.virtual_to_physical(p.data + ninline) - wqe.u32[14] = bswap(tonumber(phy) / 2^32) - wqe.u32[15] = bswap(tonumber(phy) % 2^32) + wqe.u32[14] = bswap(tonumber(shr(phy, 32))) + wqe.u32[15] = bswap(tonumber(band(phy, 0xFFFFFFFF))) -- Advance counters cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 next_slot = slot(cxq.next_tx_wqeid) From 740b63c9d962439597d0cf53949bf4b246c144b7 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 27 Aug 2021 13:16:14 +0000 Subject: [PATCH 127/209] Revert "Revert "apps.mellanox.connectx: support RSS + MAC/VLAN"" This reverts commit d20c2fa46a59d6edf0157ae6319bd59cd8d13e0b. --- src/apps/mellanox/connectx.lua | 126 ++++++++++++++++++++++++--------- 1 file changed, 93 insertions(+), 33 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 4942fc14ae..fecbe3b97e 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -211,14 +211,6 @@ function ConnectX:new (conf) local sendq_size = conf.sendq_size or 1024 local recvq_size = conf.recvq_size or 1024 - -- XXX Config says whether to setup queues with MAC+VLAN - -- dispatching ("VMDq") or to simply hash uniformly over them ("RSS"). - -- - -- To be replaced with a more generic algorithm that looks at the - -- configurations of the individual ports and creates an - -- appropriate flow table. 
- local macvlan = conf.macvlan - local mtu = conf.mtu or 9500 -- Perform a hard reset of the device to bring it into a blank state. @@ -280,8 +272,16 @@ function ConnectX:new (conf) -- List of queue counter IDs (ConnectX5 and up) local counter_set_ids = {} + -- Enable MAC/VLAN switching? + local usemac = false local usevlan = false + -- Sets of MACs and VLANs + local macs, vlans = {}, {} + + -- Lists of receive queues by macvlan + local macvlan_rqlist = {} + for _, queue in ipairs(conf.queues) do -- Create a shared memory object for controlling the queue pair local cxq = shm.create("group/pci/"..pciaddress.."/"..queue.id, cxq_t) @@ -334,24 +334,42 @@ function ConnectX:new (conf) -- CXQ is now fully initialized & ready for attach. assert(sync.cas(cxq.state, INIT, FREE)) + usemac = usemac or (queue.mac ~= nil) usevlan = usevlan or (queue.vlan ~= nil) -- XXX collect for flow table construction rqs[queue.id] = cxq.rqn rqlist[#rqlist+1] = cxq.rqn end - - local rxtable = hca:create_root_flow_table(NIC_RX) - local rule = 0 - if macvlan then - local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, #conf.queues-1, usevlan) + + if usemac then + -- Collect macs, and vlans for _, queue in ipairs(conf.queues) do - local tir = hca:create_tir_direct(rqs[queue.id], tdomain) - hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, rule, tir, - ethernet:ptoi(queue.mac), queue.vlan) - rule = rule + 1 + assert(queue.mac, "Queue does not specifiy MAC: "..queue.id) + macs[queue.mac] = true + if usevlan then + assert(queue.vlan, "Queue does not specify a VLAN: "..queue.id) + vlans[queue.vlan] = true + else + vlans[false] = true + end end - else + + -- Collect macvlan_rqlist for flow table construction + for mac in pairs(macs) do + macvlan_rqlist[mac] = {} + for vlan in pairs(vlans) do + macvlan_rqlist[mac][vlan] = {} + for _, queue in ipairs(conf.queues) do + if queue.mac == mac and (queue.vlan or false) == vlan then + table.insert(macvlan_rqlist[mac][vlan], 
rqs[queue.id]) + end + end + end + end + end + + local function setup_rss_rxtable (rqlist, tdomain, level) -- Set up RSS accross all queues. Hashing is only performed for -- IPv4/IPv6 and TCP/UDP, i.e. non-IP packets as well as non -- TCP/UDP packets are mapped to Queue #1. Hashing is done by @@ -360,6 +378,8 @@ function ConnectX:new (conf) -- appropriate types of packets. local l3_protos = { 'v4', 'v6' } local l4_protos = { 'udp', 'tcp' } + local rxtable = hca:create_flow_table(NIC_RX, level, + #l3_protos * #l4_protos + 1) local rqt = hca:create_rqt(rqlist) local flow_group_ip = hca:create_flow_group_ip(rxtable, NIC_RX, 0, @@ -375,7 +395,7 @@ function ConnectX:new (conf) -- If the header is incomplete, the packet will fall through -- to the wildcard match and end up in the first queue. hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip, - index, tir, l3_proto, l4_proto) + index, TIR, tir, l3_proto, l4_proto) index = index + 1 end end @@ -384,9 +404,44 @@ function ConnectX:new (conf) hca:create_flow_group_wildcard(rxtable, NIC_RX, index, index) local tir_q1 = hca:create_tir_direct(rqlist[1], tdomain) hca:set_flow_table_entry_wildcard(rxtable, NIC_RX, - flow_group_wildcard, index, tir_q1) + flow_group_wildcard, index, TIR, tir_q1) + return rxtable + end + + local function setup_macvlan_rxtable (macvlan_rqlist, usevlan, tdomain, level) + -- Set up RSS across multiple queues with matching MAC+VLAN. + -- See notes on RSS in setup_rss_rxtable above. 
+ local num_entries = 0 + for mac in pairs(macvlan_rqlist) do + for vlan, rqlist in pairs(macvlan_rqlist[mac]) do + if #rqlist > 0 then + num_entries = num_entries + 1 + end + end + end + local rxtable = hca:create_flow_table(NIC_RX, level or 0, num_entries) + local index = 0 + local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, num_entries-1, usevlan) + for mac in pairs(macvlan_rqlist) do + for vlan, rqlist in pairs(macvlan_rqlist[mac]) do + if #rqlist > 0 then + local tid = setup_rss_rxtable(rqlist, tdomain, 1) + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, index, + FLOW_TABLE, tid, ethernet:ptoi(mac), vlan) + index = index + 1 + end + end + end + return rxtable + end + + if usemac then + local rxtable = setup_macvlan_rxtable(macvlan_rqlist, usevlan, tdomain, 0) + hca:set_flow_table_root(rxtable, NIC_RX) + else + local rxtable = setup_rss_rxtable(rqlist, tdomain, 0) + hca:set_flow_table_root(rxtable, NIC_RX) end - hca:set_flow_table_root(rxtable, NIC_RX) self.shm = { mtu = {counter, mtu}, @@ -1380,12 +1435,16 @@ end NIC_RX = 0 -- Flow table type code for incoming packets NIC_TX = 1 -- Flow table type code for outgoing packets --- Create the root flow table. -function HCA:create_root_flow_table (table_type) +FLOW_TABLE = 1 -- Flow table entry destination_type for FLOW_TABLE +TIR = 2 -- Flow table entry destination_type for TIR + +-- Create a flow table. +function HCA:create_flow_table (table_type, level, size) self:command("CREATE_FLOW_TABLE", 0x3C, 0x0C) :input("opcode", 0x00, 31, 16, 0x930) :input("table_type", 0x10, 31, 24, table_type) - :input("log_size", 0x18 + 0x00, 7, 0, 10) -- XXX make parameter + :input("level", 0x18 + 0x00, 23, 16, level or 0) + :input("log_size", 0x18 + 0x00, 7, 0, math.ceil(math.log(size or 1024, 2))) :execute() local table_id = self:output(0x08, 23, 0) return table_id @@ -1416,7 +1475,7 @@ end -- Set a "wildcard" flow table entry that does not match on any fields. 
function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, - flow_index, tir) + flow_index, dest_type, dest_id) self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1426,8 +1485,8 @@ function HCA:set_flow_table_entry_wildcard (table_id, table_type, group_id, :input("group_id", 0x40 + 0x04, 31, 0, group_id) :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size - :input("dest_type", 0x40 + 0x300, 31, 24, 2) - :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) :execute() end @@ -1450,7 +1509,7 @@ end -- Set a flow table entry that matches on the ethertype for IPv4/IPv6 -- as well as TCP/UDP protocol/next-header. function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, - flow_index, tir, l3_proto, l4_proto) + flow_index, dest_type, dest_id, l3_proto, l4_proto) local ethertypes = { v4 = 0x0800, v6 = 0x86dd @@ -1472,8 +1531,8 @@ function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size :input("match_ether", 0x40 + 0x40 + 0x04, 15, 0, type) :input("match_proto", 0x40 + 0x40 + 0x10, 31, 24, proto) - :input("dest_type", 0x40 + 0x300, 31, 24, 2) -- TIR - :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) :execute() end @@ -1497,7 +1556,8 @@ function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, end -- Set a DMAC+VLAN flow table rule. 
-function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, flow_index, tir, dmac, vlanid) +function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, + flow_index, dest_type, dest_id, dmac, vlanid) self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry @@ -1510,8 +1570,8 @@ function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, flow_ :input("dmac0", 0x40 + 0x48, 31, 0, math.floor(dmac/2^16)) :input("dmac1", 0x40 + 0x4C, 31, 16, band(dmac, 0xFFFF)) :input("vlan", 0x40 + 0x4C, 11, 0, vlanid or 0) - :input("dest_type", 0x40 + 0x300, 31, 24, 2) - :input("dest_id", 0x40 + 0x300, 23, 0, tir) + :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) + :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) :execute() end From 22d2619d9ec5398c4386b091679227287d4c4b97 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 27 Aug 2021 13:55:10 +0000 Subject: [PATCH 128/209] apps.mellanox.connectx_test: test MACVLAN+RSS --- src/apps/mellanox/connectx_test.lua | 44 +++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 331b3bef52..63a69c87c4 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -28,19 +28,20 @@ local lib = require("core.lib") -- Hardware queue count will be macs*vlans*rss on each interface. 
function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburst, macs, vlans, rss) print("selftest: connectx_test switch") - assert(rss == 1, "rss not yet handled") assert(ncores == 1, "multicore not yet handled") -- Create queue definitions local queues = {} for vlan = 1, vlans do for mac = 1, macs do - local id = ("vlan%d.mac%d"):format(vlan, mac) - queues[#queues+1] = {id=id, vlan=vlan, mac="00:00:00:00:00:"..bit.tohex(mac, 2)} + for q = 1, rss do + local id = ("vlan%d.mac%d.rss%d"):format(vlan, mac, q) + queues[#queues+1] = {id=id, vlan=vlan, mac="00:00:00:00:00:"..bit.tohex(mac, 2)} + end end end -- Instantiate app network - local nic0 = connectx.ConnectX:new({pciaddress=pci0, queues=queues, macvlan=true}) - local nic1 = connectx.ConnectX:new({pciaddress=pci1, queues=queues, macvlan=true}) + local nic0 = connectx.ConnectX:new({pciaddress=pci0, queues=queues}) + local nic1 = connectx.ConnectX:new({pciaddress=pci1, queues=queues}) local io0 = {} -- io apps on nic0 local io1 = {} -- io apps on nic1 print(("creating %d queues per device..."):format(#queues)) @@ -73,19 +74,44 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs else -- rest are unicast to known mac p.data[5] = between(1, macs) end - - p.data[12] = 0x08 -- ipv4 -- MAC source for i = 7, 11 do p.data[i] = math.random(256) - 1 end + -- 802.1Q p.data[12] = 0x81 p.data[15] = between(1, vlans) -- vlan id can be out of expected range p.data[16] = 0x08 -- ipv4 + + local ip_ofs = 18 + + -- IPv4 + local ip = require("lib.protocol.ipv4"):new{ + src = lib.random_bytes(4), + dst = lib.random_bytes(4), + ttl = 64, + protocol = 17 -- UDP + } + ip:copy(p.data+ip_ofs, 'relocate') + ip:total_length(p.length-ip_ofs) + ip:checksum() + + -- UDP + local udp = require("lib.protocol.udp"):new{ + src_port = math.random(30000), + dst_port = math.random(30000) + } + udp:copy(p.data+ip_ofs+ip:sizeof(), 'relocate') + udp:length(p.length-(ip_ofs+ip:sizeof())) + -- Random payload - 
for i = 50, p.length-1 do + for i = ip_ofs+ip:sizeof()+udp:sizeof(), p.length-1 do p.data[i] = math.random(256) - 1 end + + -- UDP checksum + udp:checksum(p.data, p.length-(ip_ofs+ip:sizeof()+udp:sizeof()), ip) + --print(lib.hexdump(ffi.string(p.data, 32))) end -- Wait for linkup on both ports @@ -182,6 +208,6 @@ function selftest () print("SNABB_PCI_CONNECTX_0 and SNABB_PCI_CONNECTX_1 must be set. Skipping selftest.") os.exit(engine.test_skipped_code) end - switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 4, 4, 1) + switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 2, 2, 4) end From 4b0487d15924d0a6a73630e45da6cb4f6b645866 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 27 Aug 2021 13:56:32 +0000 Subject: [PATCH 129/209] apps.mellanox.connectx: invert MAC/VLAN hierarchy in macvlan_rqlist --- src/apps/mellanox/connectx.lua | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index fecbe3b97e..75a7602e2d 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -356,13 +356,13 @@ function ConnectX:new (conf) end -- Collect macvlan_rqlist for flow table construction - for mac in pairs(macs) do - macvlan_rqlist[mac] = {} for vlan in pairs(vlans) do - macvlan_rqlist[mac][vlan] = {} + macvlan_rqlist[vlan] = {} + for mac in pairs(macs) do + macvlan_rqlist[vlan][mac] = {} for _, queue in ipairs(conf.queues) do - if queue.mac == mac and (queue.vlan or false) == vlan then - table.insert(macvlan_rqlist[mac][vlan], rqs[queue.id]) + if (queue.vlan or false) == vlan and queue.mac == mac then + table.insert(macvlan_rqlist[vlan][mac], rqs[queue.id]) end end end @@ -422,8 +422,8 @@ function ConnectX:new (conf) local rxtable = hca:create_flow_table(NIC_RX, level or 0, num_entries) local index = 0 local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, num_entries-1, usevlan) - for mac in pairs(macvlan_rqlist) do - for vlan, rqlist in 
pairs(macvlan_rqlist[mac]) do + for vlan in pairs(macvlan_rqlist) do + for mac, rqlist in pairs(macvlan_rqlist[vlan]) do if #rqlist > 0 then local tid = setup_rss_rxtable(rqlist, tdomain, 1) hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, index, From 704126c38228dadee32ef0f5752cb56e9a49ab6d Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 30 Aug 2021 12:23:18 +0000 Subject: [PATCH 130/209] apps.mellanox.connectx: switch multicast packets when using MAC+VLAN --- src/apps/mellanox/connectx.lua | 110 +++++++++++++++++++-------------- 1 file changed, 64 insertions(+), 46 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 75a7602e2d..d1e1838d26 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -276,10 +276,7 @@ function ConnectX:new (conf) local usemac = false local usevlan = false - -- Sets of MACs and VLANs - local macs, vlans = {}, {} - - -- Lists of receive queues by macvlan + -- Lists of receive queues by macvlan (used if usemac=true) local macvlan_rqlist = {} for _, queue in ipairs(conf.queues) do @@ -343,30 +340,24 @@ function ConnectX:new (conf) end if usemac then - -- Collect macs, and vlans + -- Collect macvlan_rqlist for flow table construction for _, queue in ipairs(conf.queues) do assert(queue.mac, "Queue does not specifiy MAC: "..queue.id) - macs[queue.mac] = true if usevlan then assert(queue.vlan, "Queue does not specify a VLAN: "..queue.id) - vlans[queue.vlan] = true - else - vlans[false] = true end - end - - -- Collect macvlan_rqlist for flow table construction - for vlan in pairs(vlans) do - macvlan_rqlist[vlan] = {} - for mac in pairs(macs) do + local vlan = queue.vlan or false + local mac = queue.mac + if not macvlan_rqlist[vlan] then + macvlan_rqlist[vlan] = {} + end + if not macvlan_rqlist[vlan][mac] then macvlan_rqlist[vlan][mac] = {} - for _, queue in ipairs(conf.queues) do - if (queue.vlan or false) == vlan and queue.mac == mac then - 
table.insert(macvlan_rqlist[vlan][mac], rqs[queue.id]) - end - end end + table.insert(macvlan_rqlist[vlan][mac], rqs[queue.id]) end + elseif usevlan then + error("NYI: promisc vlan") end local function setup_rss_rxtable (rqlist, tdomain, level) @@ -409,28 +400,50 @@ function ConnectX:new (conf) end local function setup_macvlan_rxtable (macvlan_rqlist, usevlan, tdomain, level) - -- Set up RSS across multiple queues with matching MAC+VLAN. - -- See notes on RSS in setup_rss_rxtable above. - local num_entries = 0 - for mac in pairs(macvlan_rqlist) do - for vlan, rqlist in pairs(macvlan_rqlist[mac]) do - if #rqlist > 0 then - num_entries = num_entries + 1 - end + -- Set up MAC+VLAN switching. + -- + -- For Unicast switch [MAC+VLAN->RSS->TIR]. I.e., forward packets + -- destined for a MAC+VLAN tuple to a RSS table containing all queues + -- belonging to that tuple. + -- (See notes on RSS in setup_rss_rxtable above.) + -- + -- For Multicast switch [VLAN->TIR+]. I.e., forward multicast packets + -- destined for a VLAN to the first queue of every MAC in that VLAN. 
+ -- + local macvlan_size, mcast_size = 0, 0 + for vlan in pairs(macvlan_rqlist) do + mcast_size = mcast_size + 1 + for mac in pairs(macvlan_rqlist[vlan]) do + macvlan_size = macvlan_size + 1 end end - local rxtable = hca:create_flow_table(NIC_RX, level or 0, num_entries) + local rxtable = hca:create_flow_table(NIC_RX, level, macvlan_size+mcast_size) local index = 0 - local flow_group_id = hca:create_flow_group_macvlan(rxtable, NIC_RX, 0, num_entries-1, usevlan) + -- Unicast flow table entries + local idx0, idx1 = 0, macvlan_size-1 + local flow_group_macvlan = + hca:create_flow_group_macvlan(rxtable, NIC_RX, idx0, idx1, usevlan) for vlan in pairs(macvlan_rqlist) do for mac, rqlist in pairs(macvlan_rqlist[vlan]) do - if #rqlist > 0 then - local tid = setup_rss_rxtable(rqlist, tdomain, 1) - hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_id, index, - FLOW_TABLE, tid, ethernet:ptoi(mac), vlan) - index = index + 1 - end + local tid = setup_rss_rxtable(rqlist, tdomain, 1) + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_macvlan, index, + FLOW_TABLE, tid, ethernet:ptoi(mac), vlan) + index = index + 1 + end + end + -- Multicast flow table entries + local idx0, idx1 = idx0+macvlan_size, idx1+mcast_size + local flow_group_mcast = + hca:create_flow_group_macvlan(rxtable, NIC_RX, idx0, idx1, usevlan, 'mcast') + local mac_mcast = ethernet:ptoi("01:00:00:00:00:00") + for vlan in pairs(macvlan_rqlist) do + local mcast_tirs = {} + for mac, rqlist in pairs(macvlan_rqlist[vlan]) do + mcast_tirs[#mcast_tirs+1] = hca:create_tir_direct(rqlist[1], tdomain) end + hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_mcast, index, + TIR, mcast_tirs, mac_mcast, vlan, 'mcast') + index = index + 1 end return rxtable end @@ -1537,7 +1550,9 @@ function HCA:set_flow_table_entry_ip (table_id, table_type, group_id, end -- Create a DMAC+VLAN flow group. 
-function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, usevlan) +function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, usevlan, mcast) + local dmac = (mcast and ethernet:ptoi("01:00:00:00:00:00")) + or ethernet:ptoi("ff:ff:ff:ff:ff:ff") self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) :input("opcode", 0x00, 31, 16, 0x933) :input("table_type", 0x10, 31, 24, table_type) @@ -1545,8 +1560,8 @@ function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, :input("start_ix", 0x1C, 31, 0, start_ix) :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) :input("match_criteria", 0x3C, 7, 0, 1) -- match outer headers - :input("dmac0", 0x40 + 0x08, 31, 0, 0xFFFFFFFF) - :input("dmac1", 0x40 + 0x0C, 31, 16, 0xFFFF) + :input("dmac0", 0x40 + 0x08, 31, 0, shr(dmac, 16)) + :input("dmac1", 0x40 + 0x0C, 31, 16, band(dmac, 0xFFFF)) if usevlan then self:input("vlanid", 0x40 + 0x0C, 11, 0, 0xFFF) end @@ -1557,8 +1572,9 @@ end -- Set a DMAC+VLAN flow table rule. 
function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, - flow_index, dest_type, dest_id, dmac, vlanid) - self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300, 0x0C) + flow_index, dest_type, dest_id, dmac, vlanid, mcast) + local dest_ids = (mcast and dest_id) or {dest_id} + self:command("SET_FLOW_TABLE_ENTRY", 0x40 + 0x300 + 0x8*(#dest_ids-1), 0x0C) :input("opcode", 0x00, 31, 16, 0x936) :input("opmod", 0x04, 15, 0, 0) -- new entry :input("table_type", 0x10, 31, 24, table_type) @@ -1566,13 +1582,15 @@ function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, :input("flow_index", 0x20, 31, 0, flow_index) :input("group_id", 0x40 + 0x04, 31, 0, group_id) :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST - :input("dest_list_sz", 0x40 + 0x10, 23, 0, 1) -- destination list size - :input("dmac0", 0x40 + 0x48, 31, 0, math.floor(dmac/2^16)) + :input("dest_list_sz", 0x40 + 0x10, 23, 0, #dest_ids) -- destination list size + :input("dmac0", 0x40 + 0x48, 31, 0, shr(dmac, 16)) :input("dmac1", 0x40 + 0x4C, 31, 16, band(dmac, 0xFFFF)) :input("vlan", 0x40 + 0x4C, 11, 0, vlanid or 0) - :input("dest_type", 0x40 + 0x300, 31, 24, dest_type) - :input("dest_id", 0x40 + 0x300, 23, 0, dest_id) - :execute() + for i, dest_id in ipairs(dest_ids) do + self:input("dest_type", 0x40 + 0x300 + 0x8*(i-1), 31, 24, dest_type) + self:input("dest_id", 0x40 + 0x300 + 0x8*(i-1), 23, 0, dest_id) + end + self:execute() end --------------------------------------------------------------- From 6d18bea2b43a1af4af20f46306930c910fe6c40f Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 30 Aug 2021 13:19:09 +0000 Subject: [PATCH 131/209] apps.mellanox.connectx_test: exercise L3-only RSS --- src/apps/mellanox/connectx_test.lua | 42 ++++++++++++++++++----------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 63a69c87c4..42b8d6f36e 100644 --- 
a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -89,29 +89,41 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs local ip = require("lib.protocol.ipv4"):new{ src = lib.random_bytes(4), dst = lib.random_bytes(4), - ttl = 64, - protocol = 17 -- UDP + ttl = 64 } + if r < 0.50 then -- 50% of packets are UDP (have L4 header) + ip:protocol(17) -- UDP + else -- rest have random payloads + ip:protocol(253) + end ip:copy(p.data+ip_ofs, 'relocate') ip:total_length(p.length-ip_ofs) ip:checksum() - -- UDP - local udp = require("lib.protocol.udp"):new{ - src_port = math.random(30000), - dst_port = math.random(30000) - } - udp:copy(p.data+ip_ofs+ip:sizeof(), 'relocate') - udp:length(p.length-(ip_ofs+ip:sizeof())) + if ip:protocol() == 17 then + -- UDP + local udp = require("lib.protocol.udp"):new{ + src_port = math.random(30000), + dst_port = math.random(30000) + } + udp:copy(p.data+ip_ofs+ip:sizeof(), 'relocate') + udp:length(p.length-(ip_ofs+ip:sizeof())) + + -- Random payload + for i = ip_ofs+ip:sizeof()+udp:sizeof(), p.length-1 do + p.data[i] = math.random(256) - 1 + end - -- Random payload - for i = ip_ofs+ip:sizeof()+udp:sizeof(), p.length-1 do - p.data[i] = math.random(256) - 1 + -- UDP checksum + udp:checksum(p.data, p.length-(ip_ofs+ip:sizeof()+udp:sizeof()), ip) + + else + -- Random payload + for i = ip_ofs+ip:sizeof(), p.length-1 do + p.data[i] = math.random(256) - 1 + end end - -- UDP checksum - udp:checksum(p.data, p.length-(ip_ofs+ip:sizeof()+udp:sizeof()), ip) - --print(lib.hexdump(ffi.string(p.data, 32))) end -- Wait for linkup on both ports From 5f10b3b7af9a6d5c2b4c845fe645cb55663e5524 Mon Sep 17 00:00:00 2001 From: Alexander Gall Date: Mon, 30 Aug 2021 15:32:36 +0200 Subject: [PATCH 132/209] connectx.lua: convert tabs to spaces --- src/apps/mellanox/connectx.lua | 55 +++++++++++++++++----------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/src/apps/mellanox/connectx.lua 
b/src/apps/mellanox/connectx.lua index ac1db83375..6bd654337b 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -171,16 +171,16 @@ local DEAD = 4 function shutdown(pid) for _, pciaddr in ipairs(shm.children("/"..pid.."/mellanox")) do for _, queue in ipairs(shm.children("/"..pid.."/mellanox/"..pciaddr)) do - local backlink = "/"..pid.."/mellanox/"..pciaddr.."/"..queue - local shm_name = "/"..pid.."/group/pci/"..pciaddr.."/"..queue - if shm.exists(shm_name) then - local cxq = shm.open(shm_name, cxq_t) - assert(sync.cas(cxq.state, IDLE, FREE) or - sync.cas(cxq.state, BUSY, FREE), - "ConnectX: failed to free "..shm_name.. - " during shutdown") - end - shm.unlink(backlink) + local backlink = "/"..pid.."/mellanox/"..pciaddr.."/"..queue + local shm_name = "/"..pid.."/group/pci/"..pciaddr.."/"..queue + if shm.exists(shm_name) then + local cxq = shm.open(shm_name, cxq_t) + assert(sync.cas(cxq.state, IDLE, FREE) or + sync.cas(cxq.state, BUSY, FREE), + "ConnectX: failed to free "..shm_name.. 
+ " during shutdown") + end + shm.unlink(backlink) end end end @@ -286,13 +286,13 @@ function ConnectX:new (conf) local cxq = shm.create("group/pci/"..pciaddress.."/"..queue.id, cxq_t) local function check_qsize (type, size) - assert(check_pow2(size), - string.format("%s: %s queue size must be a power of 2: %d", - conf.pciaddress, type, size)) - assert(log2size(size) <= max_cap['log_max_wq_sz'], - string.format("%s: %s queue size too big: requested %d, allowed %d", - conf.pciaddress, type, size, - math.pow(2, max_cap['log_max_wq_sz']))) + assert(check_pow2(size), + string.format("%s: %s queue size must be a power of 2: %d", + conf.pciaddress, type, size)) + assert(log2size(size) <= max_cap['log_max_wq_sz'], + string.format("%s: %s queue size too big: requested %d, allowed %d", + conf.pciaddress, type, size, + math.pow(2, max_cap['log_max_wq_sz']))) end check_qsize("Send", sendq_size) @@ -311,7 +311,7 @@ function ConnectX:new (conf) local rq_stride = ffi.sizeof(ffi.typeof(cxq.rwq[0])) local sq_stride = ffi.sizeof(ffi.typeof(cxq.swq[0])) local workqueues = memory.dma_alloc(sq_stride * sendq_size + - rq_stride *recvq_size, 4096) + rq_stride *recvq_size, 4096) cxq.rwq = cast(ffi.typeof(cxq.rwq), workqueues) cxq.swq = cast(ffi.typeof(cxq.swq), workqueues + rq_stride * recvq_size) -- Create the queue objects @@ -323,9 +323,9 @@ function ConnectX:new (conf) end -- XXX order check cxq.sqn = hca:create_sq(scqn, pd, sq_stride, sendq_size, - cxq.doorbell, cxq.swq, uar, tis) + cxq.doorbell, cxq.swq, uar, tis) cxq.rqn = hca:create_rq(rcqn, pd, rq_stride, recvq_size, - cxq.doorbell, cxq.rwq, + cxq.doorbell, cxq.rwq, counter_set_id) hca:modify_sq(cxq.sqn, 0, 1) -- RESET -> READY hca:modify_rq(cxq.rqn, 0, 1) -- RESET -> READY @@ -384,12 +384,12 @@ function ConnectX:new (conf) -- Fall-through for non-TCP/UDP IP packets local flow_group_ip_l3 = hca:create_flow_group_ip(rxtable, NIC_RX, index, index + #l3_protos - 1, - "l3-only") + "l3-only") for _, l3_proto in ipairs(l3_protos) do - 
local tir = hca:create_tir_indirect(rqt, tdomain, l3_proto, nil) - hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip_l3, - index, tir, l3_proto, nil) - index = index + 1 + local tir = hca:create_tir_indirect(rqt, tdomain, l3_proto, nil) + hca:set_flow_table_entry_ip(rxtable, NIC_RX, flow_group_ip_l3, + index, tir, l3_proto, nil) + index = index + 1 end local flow_group_wildcard = @@ -966,7 +966,7 @@ function HCA:create_tir_indirect (rqt, transport_domain, l3_proto, l4_proto) else l4_proto = assert(l4_protos[l4_proto or 'tcp'], "invalid l4 proto") self:input("l4_prot_type", 0x20 + 0x50, 30, 30, l4_proto) - :input("selected_fields", 0x20 + 0x50, 29, 0, 15) -- SRC/DST/SPORT/DPORT + :input("selected_fields", 0x20 + 0x50, 29, 0, 15) -- SRC/DST/SPORT/DPORT end -- XXX Is random hash key a good solution? for i = 0x28, 0x4C, 4 do @@ -1161,7 +1161,7 @@ function IO:new (conf) local shmpath = "group/pci/"..pciaddress.."/"..queue self.backlink = "mellanox/"..pciaddress.."/"..queue if shm.exists(shmpath) then - shm.alias(self.backlink, shmpath) + shm.alias(self.backlink, shmpath) cxq = shm.open(shmpath, cxq_t) if sync.cas(cxq.state, FREE, IDLE) then sq = SQ:new(cxq, mmio) @@ -2314,4 +2314,3 @@ function selftest () error("selftest failed: unexpected counter values") end end - From d4a1083588add7e7107084d83c2742635490ffb4 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 2 Sep 2021 09:15:16 +0000 Subject: [PATCH 133/209] lwaftr: keep full config in worker setup Instead of stripping other instances/queues from per-worker configurations, add the instance and queue id to to worker configurations. 
--- src/apps/lwaftr/lwutil.lua | 15 +++------------ src/lib/yang/snabb-softwire-v2.yang | 13 +++++++++++++ src/program/lwaftr/setup.lua | 13 +------------ 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/src/apps/lwaftr/lwutil.lua b/src/apps/lwaftr/lwutil.lua index a4cd412190..a65bd6a81a 100644 --- a/src/apps/lwaftr/lwutil.lua +++ b/src/apps/lwaftr/lwutil.lua @@ -21,18 +21,9 @@ local ntohs = lib.ntohs -- Return device PCI address, queue ID, and queue configuration. function parse_instance(conf) - local device, instance - for k, v in pairs(conf.softwire_config.instance) do - assert(device == nil, "configuration has more than one instance") - device, instance = k, v - end - assert(device ~= nil, "configuration has no instance") - local id, queue - for k, v in pairs(instance.queue) do - assert(id == nil, "configuration has more than one RSS queue") - id, queue = k, v - end - assert(id ~= nil, "configuration has no RSS queues") + local device = conf.worker_config.device + local id = conf.worker_config.queue_id + local queue = conf.softwire_config.instance[device].queue[id] return device, id, queue end diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 47cc5aaae1..6af1db9735 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -886,5 +886,18 @@ module snabb-softwire-v2 { } } + container worker-config { + description + "Worker process configuration state. The contained leaves are used only + internally. 
Setting them has no effect."; + + leaf device { + type string; + } + leaf queue-id { + type uint8; + } + } + uses state-counters; } diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 9fd214c4ad..55f1ff8e73 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -701,18 +701,7 @@ local function compute_worker_configs(conf) for id, _ in pairs(queues.queue) do local worker_id = string.format('%s/%s', device, id) local worker_config = make_copy() - local instance = worker_config.softwire_config.instance - for other_device, queues in pairs(conf.softwire_config.instance) do - if other_device ~= device then - instance[other_device] = nil - else - for other_id, _ in pairs(queues.queue) do - if other_id ~= id then - instance[device].queue[other_id] = nil - end - end - end - end + worker_config.worker_config = {device=device, queue_id=id} ret[worker_id] = worker_config end end From 907fb774881820a967cbf0702e94e6d686ddd7e1 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 2 Sep 2021 14:10:31 +0000 Subject: [PATCH 134/209] lwaftr: add mellanox.connectx driver support --- src/program/lwaftr/setup.lua | 185 +++++++++++++++++++++++------------ 1 file changed, 123 insertions(+), 62 deletions(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 55f1ff8e73..6234908f55 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -220,6 +220,74 @@ function load_kernel_iface (c, conf, v4_nic_name, v6_nic_name) link_sink(c, v4_nic_name..'.'..dev_info.rx, v6_nic_name..'.'..dev_info.rx) end +local intel_mp = require("apps.intel_mp.intel_mp") +local connectx = require("apps.mellanox.connectx") + +function config_intel_mp(c, name, opt) + config.app(c, name, intel_mp.driver, { + pciaddr=opt.pci, + vmdq=true, -- Needed to enable MAC filtering/stamping. 
+ rxq=opt.queue, + txq=opt.queue, + poolnum=0, + macaddr=ethernet:ntop(opt.mac), + vlan=opt.vlan, + rxcounter=opt.queue, + txcounter=opt.queue, + ring_buffer_size=opt.ring_buffer_size + }) + return name..'.input', name..'.output' +end + +function config_connectx(c, name, opt, lwconfig) + local function queue_id (opt, queue) + return ("%s.%s.%s"):format(ethernet:ntop(opt.mac), + opt.vlan or opt.vlan_tag, + queue or opt.queue) + end + local queues = {} + local min_queue + for id, queue in pairs(lwconfig.softwire_config.instance[opt.pci].queue) do + queues[#queues+1] = { + id = queue_id(queue.external_interface, id), + mac = ethernet:ntop(queue.external_interface.mac), + vlan = queue.external_interface.vlan_tag + } + queues[#queues+1] = { + id = queue_id(queue.internal_interface, id), + mac = ethernet:ntop(queue.internal_interface.mac), + vlan = queue.internal_interface.vlan_tag + } + min_queue = (not min_queue and id) or math.min(id, min_queue) + end + if opt.queue == min_queue then + config.app(c, "ConnectX_"..opt.pci:gsub("[%.:]", "_"), connectx.ConnectX, { + pciaddress = opt.pci, + queues = queues + }) + end + config.app(c, name, connectx.IO, { + pciaddress = opt.pci, + queue = queue_id(opt) + }) + local input, output = name..'.input', name..'.output' + if opt.vlan then + config.app(c, name.."_tag", vlan.Tagger, { tag=opt.vlan }) + config.link(c, name.."_tag.output -> "..input) + config.app(c, name.."_untag", vlan.Untagger, { tag=opt.vlan }) + config.link(c, output.." 
-> "..name.."_untag.input") + input, output = name.."_tag.input", name.."_untag.output" + end + return input, output +end + +function config_nic(c, name, driver, opt, lwconfig) + local config_fn = { [intel_mp.driver] = config_intel_mp, + [connectx.driver] = config_connectx } + local f = assert(config_fn[driver], "Unsupported device: "..opt.pci) + return f(c, name, opt, lwconfig) +end + function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) local v6_pci, id, queue = lwutil.parse_instance(conf) local v4_pci = queue.external_interface.device @@ -228,31 +296,28 @@ function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) validate_pci_devices({v4_pci, v6_pci}) lwaftr_app(c, conf, v4_pci) - config.app(c, v4_nic_name, require(v4_info.driver).driver, { - pciaddr=v4_pci, - vmdq=true, -- Needed to enable MAC filtering/stamping. - rxq=id, - txq=id, - poolnum=0, - vlan=queue.external_interface.vlan_tag, - rxcounter=id, - txcounter=id, - ring_buffer_size=ring_buffer_size, - macaddr=ethernet:ntop(queue.external_interface.mac)}) - config.app(c, v6_nic_name, require(v6_info.driver).driver, { - pciaddr=v6_pci, - vmdq=true, -- Needed to enable MAC filtering/stamping. 
- rxq=id, - txq=id, - poolnum=0, - vlan=queue.internal_interface.vlan_tag, - rxcounter=id, - txcounter=id, - ring_buffer_size=ring_buffer_size, - macaddr = ethernet:ntop(queue.internal_interface.mac)}) + local v4_nic_opt = { + pci = v4_pci, + queue = id, + mac = queue.external_interface.mac, + vlan = queue.external_interface.vlan_tag, + ring_buffer_size = ring_buffer_size + } + local v4_input, v4_output = + config_nic(c, v4_nic_name, require(v4_info.driver).driver, v4_nic_opt, conf) + + local v6_nic_opt = { + pci = v6_pci, + queue = id, + mac = queue.internal_interface.mac, + vlan = queue.internal_interface.vlan_tag, + ring_buffer_size = ring_buffer_size + } + local v6_input, v6_output = + config_nic(c, v6_nic_name, require(v6_info.driver).driver, v6_nic_opt, conf) - link_source(c, v4_nic_name..'.'..v4_info.tx, v6_nic_name..'.'..v6_info.tx) - link_sink(c, v4_nic_name..'.'..v4_info.rx, v6_nic_name..'.'..v6_info.rx) + link_source(c, v4_output, v6_output) + link_sink(c, v4_input, v6_input) end function load_xdp(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) @@ -393,17 +458,17 @@ function load_on_a_stick(c, conf, args) assert(queue.external_interface.vlan_tag == queue.internal_interface.vlan_tag) assert(ethernet:ntop(queue.external_interface.mac) == ethernet:ntop(queue.internal_interface.mac)) - config.app(c, 'nic', driver, { - pciaddr = pciaddr, - vmdq=true, -- Needed to enable MAC filtering/stamping. 
- rxq=id, - txq=id, - poolnum=0, - vlan=queue.external_interface.vlan_tag, - ring_buffer_size=args.ring_buffer_size, - rxcounter = id, - txcounter = id, - macaddr = ethernet:ntop(queue.external_interface.mac)}) + + local v4v6_nic_opt = { + pci = pciaddr, + queue = id, + mac = queue.external_interface.mac, + vlan = queue.internal_interface.vlan_tag, + ring_buffer_size = args.ring_buffer_size + } + local v4v6_input, v4v6_output = + config_nic(c, 'nic', driver, v4v6_nic_opt, conf) + if mirror then local Tap = require("apps.tap.tap").Tap local ifname = mirror @@ -415,37 +480,33 @@ function load_on_a_stick(c, conf, args) else config.app(c, v4v6, V4V6) end - config.link(c, 'nic.'..device.tx..' -> '..v4v6..'.input') - config.link(c, v4v6..'.output -> nic.'..device.rx) + config.link(c, v4v6_output..' -> '..v4v6..'.input') + config.link(c, v4v6..'.output -> '..v4v6_input) link_source(c, v4v6..'.v4', v4v6..'.v6') link_sink(c, v4v6..'.v4', v4v6..'.v6') else - config.app(c, v4_nic_name, driver, { - pciaddr = pciaddr, - vmdq=true, -- Needed to enable MAC filtering/stamping. - rxq=id, - txq=id, - poolnum=0, - vlan=queue.external_interface.vlan_tag, - ring_buffer_size=args.ring_buffer_size, - rxcounter = id, - txcounter = id, - macaddr = ethernet:ntop(queue.external_interface.mac)}) - config.app(c, v6_nic_name, driver, { - pciaddr = pciaddr, - vmdq=true, -- Needed to enable MAC filtering/stamping. 
- rxq=id, - txq=id, - poolnum=1, - vlan=queue.internal_interface.vlan_tag, - ring_buffer_size=args.ring_buffer_size, - rxcounter = id, - txcounter = id, - macaddr = ethernet:ntop(queue.internal_interface.mac)}) - - link_source(c, v4_nic_name..'.'..device.tx, v6_nic_name..'.'..device.tx) - link_sink(c, v4_nic_name..'.'..device.rx, v6_nic_name..'.'..device.rx) + local v4_nic_opt = { + pci = pciaddr, + queue = id, + mac = queue.external_interface.mac, + vlan = queue.external_interface.vlan_tag, + ring_buffer_size = args.ring_buffer_size + } + local v4_input, v4_output = + config_nic(c, v4_nic_name, driver, v4_nic_opt, conf) + local v6_nic_opt = { + pci = pciaddr, + queue = id, + mac = queue.internal_interface.mac, + vlan = queue.internal_interface.vlan_tag, + ring_buffer_size = args.ring_buffer_size + } + local v6_input, v6_output = + config_nic(c, v6_nic_name, driver, v6_nic_opt, conf) + + link_source(c, v4_output, v6_output) + link_sink(c, v4_input, v6_input) end end From 4c84dd0ee09826a273eecdc0974919a0ff3fd1cf Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 6 Sep 2021 13:42:23 +0000 Subject: [PATCH 135/209] apps.intel_avf: add multiqueue/rss support Design based on separate Manager/IO apps design from ConnectX driver. 
--- src/apps/intel_avf/intel_avf.lua | 618 +++++++++++++----- src/apps/intel_avf/tests/back2back/test.snabb | 36 + 2 files changed, 500 insertions(+), 154 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 52716a282f..79d5a82c00 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -7,6 +7,7 @@ module(..., package.seeall) local ffi = require("ffi") local lib = require("core.lib") +local sync = require("core.sync") local macaddress = require("lib.macaddress") local pci = require("lib.hardware.pci") local register = require("lib.hardware.register") @@ -24,6 +25,7 @@ local MAC_ADDR_BYTE_LEN = 6 Intel_avf = { config = { pciaddr = { required=true }, + nqueues = {}, ring_buffer_size = {default=2048} } } @@ -107,33 +109,51 @@ local virtchnl_msg_t = ffi.typeof([[ ]]) local virtchnl_msg_ptr_t = ffi.typeof("$ *", virtchnl_msg_t) -local virtchnl_q_pair_t = ffi.typeof([[ +local virtchnl_txq_info_t = ffi.typeof([[ struct { uint16_t vsi_id; - uint16_t num_queue_pairs; - uint32_t pad; + uint16_t queue_id; + uint16_t ring_len; + uint16_t deprecated0; + uint64_t dma_ring_addr; + uint64_t deprecated1; + } __attribute__((packed)) +]]) - uint16_t tx_vsi_id; - uint16_t tx_queue_id; - uint16_t tx_ring_len; - uint16_t tx_deprecated0; - uint64_t tx_dma_ring_addr; - uint64_t tx_deprecated1; - - uint16_t rx_vsi_id; - uint16_t rx_queue_id; - uint32_t rx_ring_len; - uint16_t rx_hdr_size; - uint16_t rx_deprecated0; - uint32_t rx_databuffer_size; - uint32_t rx_max_pkt_size; - uint32_t rx_pad0; - uint64_t rx_dma_ring_addr; - uint32_t rx_deprecated1; - uint32_t rx_pad1; +local virtchnl_rxq_info_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t queue_id; + uint32_t ring_len; + uint16_t hdr_size; + uint16_t deprecated0; + uint32_t databuffer_size; + uint32_t max_pkt_size; + uint32_t pad0; + uint64_t dma_ring_addr; + uint32_t deprecated1; + uint32_t pad1; } __attribute__((packed)) ]]) -local 
virtchnl_q_pair_ptr_t = ffi.typeof("$ *", virtchnl_q_pair_t) + +local virtchnl_queue_pair_info_t = ffi.typeof([[ + struct { + /* NOTE: vsi_id and queue_id should be indentical for both queues. */ + $ txq; + $ rxq; + } __attribute__((packed)) +]], virtchnl_txq_info_t, virtchnl_rxq_info_t) + +local virtchnl_queue_config_info_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t num_queue_pairs; + uint32_t pad; + $ qpair[1]; + } __attribute__((packed)) +]], virtchnl_queue_pair_info_t) + +local virtchnl_queue_config_info_ptr_t = ffi.typeof("$ *", virtchnl_queue_config_info_t) local virtchnl_ether_addr_t = ffi.typeof([[ struct { @@ -209,6 +229,7 @@ local virtchnl_rss_key_t = ffi.typeof([[ uint16_t vsi_id; uint16_t key_len; uint8_t key[1]; /* RSS hash key, packed bytes */ + uint8_t pad; } __attribute__((packed)) ]]) local virtchnl_rss_key_ptr_t = ffi.typeof('$*', virtchnl_rss_key_t) @@ -218,6 +239,7 @@ local virtchnl_rss_lut_t = ffi.typeof([[ uint16_t vsi_id; uint16_t lut_entries; uint8_t lut[1]; /* RSS lookup table*/ + uint8_t pad; } __attribute__((packed)) ]]) local virtchnl_rss_lut_ptr_t = ffi.typeof('$*', virtchnl_rss_lut_t) @@ -246,28 +268,161 @@ local mbox_q_t = ffi.typeof([[ ]]) local mbox_q_ptr_t = ffi.typeof('$*', mbox_q_t) -function Intel_avf:init_tx_q() - self.txdesc = ffi.cast(txdesc_ptr_t, - memory.dma_alloc(ffi.sizeof(txdesc_t) * self.ring_buffer_size)) - ffi.fill(self.txdesc, ffi.sizeof(txdesc_t) * self.ring_buffer_size) - self.txqueue = ffi.new("struct packet *[?]", self.ring_buffer_size) - for i=0, self.ring_buffer_size - 1 do - self.txqueue[i] = nil - self.txdesc[i].cmd_type_offset_bsz = 0 +--------------------------------------------------------------- +-- CXQ (Queue pair control object): +-- +-- A "CXQ" is an object that we define to represent a transmit/receive pair. +-- +-- CXQs are created and deleted by a "Control" app (Intel_avf) and, +-- in between, they are used by "IO" apps to send and receive packets. 
+-- +-- The lifecycle of a CXQ is managed using a state machine. This is +-- necessary because we allow Control and IO apps to start in any +-- order, for Control and IO apps to start/stop/restart independently, +-- for multiple IO apps to attempt to attach to the same CXQ, and even +-- for apps to stop in one Snabb process and be started in another +-- one. +-- +-- (This design is lifted from the apps.mellanox.connectx driver.) +-- +--------------------------------------------------------------- + +-- CXQs can be in one of five states: +-- INIT: CXQ is being initialized by the control app +-- FREE: CXQ is ready and available for use by an IO app. +-- IDLE: CXQ is owned by an app, but not actively processing right now. +-- BUSY: CXQ is owned by an app and is currently processing (e.g. push/pull). +-- DEAD: CXQ has been deallocated; IO app must try to open a new one. +-- +-- Once a CXQ is closed it stays in the DEAD state forever. However, a +-- replacement CXQ with the same name can be created and existing IO +-- apps can reattach to that instead. This will rerun the state machine. +-- +-- Here are the valid state transitions & when they occur: +-- +-- App Change Why +-- ---- ----------- -------------------------------------------------------- +-- CTRL none->INIT: Control app starts initialization. +-- CTRL INIT->FREE: Control app completes initialization. +-- IO FREE->IDLE: IO app starts and becomes owner of the CXQ. +-- IO IDLE->FREE: IO app stops and releases the CXQ for future use. +-- IO IDLE->BUSY: IO app starts running a pull/push method. +-- IO BUSY->IDLE: IO app stops running a pull/push method. +-- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) +-- +-- These state transitions are *PROHIBITED* for important reasons: +-- +-- App Change Why *PROHIBITED* +-- ------ ----------- -------------------------------------------------------- +-- CTRL BUSY->DEAD Cannot close a CXQ while it is busy (must wait.) 
+-- IO DEAD->BUSY Cannot use a CXQ that is closed (must check.) +-- * DEAD->* Cannot transition from DEAD (must create new CXQ.) +-- +-- Further notes: +-- +-- Packet buffers for pending DMA (transmit or receive) are freed by +-- the Control app (which can disable DMA first) rather than by the IO +-- app (which shuts down with DMA still active.) +-- +-- Abnormal shutdown of the process hosting the Control app is *not* +-- supported. We just don’t have anywhere to free packets to in that +-- case. + +-- A CXQ is represented by one struct allocated in shared memory. +-- +-- The struct defines the fields in very specific terms so that it can +-- be used directly by the driver code (rather than copying back and +-- forth between the shared memory object and a separate native +-- format.) +local cxq_t = ffi.typeof([[ + struct { + int state[1]; // current state / availability + + // configuration information: + uint32_t qno; // queue number + uint32_t ring_size; // size of rx/tx rings + + // Transmit state + uint32_t tx_next; + uint32_t tx_cand; + uint32_t tx_desc_free; + $ txdesc; + struct packet *txqueue[64*1024]; + + // Receive state + uint32_t rx_tail; + $ rxdesc; + struct packet *rxqueue[64*1024]; + } __attribute((packed)) +]], txdesc_ptr_t, rxdesc_ptr_t) + +-- CXQ states: +local INIT = 0 -- Implicit initial state due to 0 value. +local BUSY = 1 +local IDLE = 2 +local FREE = 3 +local DEAD = 4 + +-- Release CXQ from IO apps after process termination. 
+-- Called from core.main.shutdown +function shutdown(pid) + for _, pciaddr in ipairs(shm.children("/"..pid.."/intel_avf")) do + for _, queue in ipairs(shm.children("/"..pid.."/intel_avf/"..pciaddr)) do + local backlink = "/"..pid.."/intel_avf/"..pciaddr.."/"..queue + local ok, cxq = pcall(shm.open, backlink, cxq_t) + if ok then + -- Allow reclaimation of CXQ + sync.cas(cxq.state, IDLE, FREE) + sync.cas(cxq.state, BUSY, FREE) + shm.unlink(backlink) + end + end end end -function Intel_avf:init_rx_q() - self.rxqueue = ffi.new("struct packet *[?]", self.ring_buffer_size) - self.rxdesc = ffi.cast(rxdesc_ptr_t, - memory.dma_alloc(ffi.sizeof(rxdesc_t) * self.ring_buffer_size), 128) +function Intel_avf:init_cxq (qno) + -- Create a shared memory object for controlling the queue pair + local cxq = shm.create("group/pci/"..self.pciaddress.."/"..qno, cxq_t) + cxq.qno = qno + cxq.ring_size = self.ring_buffer_size + self:init_tx_q(cxq) + self:init_rx_q(cxq) + return cxq +end + +function Intel_avf:free_cxq (cxq) + -- Free packets remaining in TX/RX queues. 
+ for i = 0, cxq.ring_size-1 do + if cxq.txqueue[i] ~= nil then + packet.free(cxq.txqueue[i]) + end + packet.free(cxq.rxqueue[i]) + end + shm.unlink("group/pci/"..self.pciaddress.."/"..cxq.qno) + shm.unmap(cxq) +end + +function Intel_avf:init_tx_q(cxq) + cxq.txdesc = ffi.cast(txdesc_ptr_t, memory.dma_alloc(ffi.sizeof(txdesc_t) * self.ring_buffer_size)) + ffi.fill(cxq.txdesc, ffi.sizeof(txdesc_t) * self.ring_buffer_size) + for i=0, self.ring_buffer_size - 1 do + cxq.txqueue[i] = nil + cxq.txdesc[i].cmd_type_offset_bsz = 0 + end + cxq.tx_next = 0 + cxq.tx_cand = 0 + cxq.tx_desc_free = self.ring_buffer_size - 1 +end +function Intel_avf:init_rx_q(cxq) + cxq.rxdesc = ffi.cast(rxdesc_ptr_t, memory.dma_alloc(ffi.sizeof(rxdesc_t) * self.ring_buffer_size)) for i = 0, self.ring_buffer_size-1 do local p = packet.allocate() - self.rxqueue[i] = p - self.rxdesc[i].read.address = tophysical(p.data) - self.rxdesc[i].write.status_err_type_len = 0 + cxq.rxqueue[i] = p + cxq.rxdesc[i].read.address = tophysical(p.data) + cxq.rxdesc[i].write.status_err_type_len = 0 end + cxq.rx_tail = 0 end function Intel_avf:supported_hardware() @@ -362,135 +517,202 @@ function Intel_avf:mbox_setup_txq() self.r.VF_ATQLEN(bits({ ENABLE = 31 }) + self.mbox.q_len) end -function Intel_avf:mbox_sr_q() - local tt = self:mbox_send_buf(virtchnl_q_pair_ptr_t) +function Intel_avf:mbox_sr_q(cxqs) + local tt = self:mbox_send_buf(virtchnl_queue_config_info_ptr_t) tt.vsi_id = self.vsi_id - tt.num_queue_pairs = 1 - - tt.tx_vsi_id = self.vsi_id - tt.tx_queue_id = self.qno - tt.tx_ring_len = self.ring_buffer_size - tt.tx_dma_ring_addr = tophysical(self.txdesc) - - tt.rx_vsi_id = self.vsi_id - tt.rx_queue_id = self.qno - tt.rx_ring_len = self.ring_buffer_size - -- Only 32 byte rxdescs are supported, at least by the PF driver in - -- centos 7 3.10.0-957.1.3.el7.x86_64 - tt.rx_hdr_size = 32 - tt.rx_databuffer_size = packet.max_payload - tt.rx_max_pkt_size = packet.max_payload - tt.rx_dma_ring_addr = 
tophysical(self.rxdesc) - - self:mbox_sr('VIRTCHNL_OP_CONFIG_VSI_QUEUES', ffi.sizeof(virtchnl_q_pair_t) + 64) + tt.num_queue_pairs = #cxqs + + for i, cxq in ipairs(cxqs) do + tt.qpair[i-1].txq.vsi_id = self.vsi_id + tt.qpair[i-1].txq.queue_id = cxq.qno + tt.qpair[i-1].txq.ring_len = cxq.ring_size + tt.qpair[i-1].txq.dma_ring_addr = tophysical(cxq.txdesc) + + tt.qpair[i-1].rxq.vsi_id = self.vsi_id + tt.qpair[i-1].rxq.queue_id = cxq.qno + tt.qpair[i-1].rxq.ring_len = cxq.ring_size + -- Only 32 byte rxdescs are supported, at least by the PF driver in + -- centos 7 3.10.0-957.1.3.el7.x86_64 + tt.qpair[i-1].rxq.hdr_size = 32 + tt.qpair[i-1].rxq.databuffer_size = packet.max_payload + tt.qpair[i-1].rxq.max_pkt_size = packet.max_payload + tt.qpair[i-1].rxq.dma_ring_addr = tophysical(cxq.rxdesc) + end - self.r.rx_tail = self.r.QRX_TAIL[self.qno] - self.r.tx_tail = self.r.QTX_TAIL[self.qno] - self.rx_tail = 0 - self.r.rx_tail(self.ring_buffer_size - 1) + self:mbox_sr('VIRTCHNL_OP_CONFIG_VSI_QUEUES', + ffi.sizeof(virtchnl_queue_config_info_t) + + ffi.sizeof(virtchnl_queue_pair_info_t) * #cxqs) end -function Intel_avf:mbox_sr_enable_q () +function Intel_avf:mbox_sr_enable_q (nqueues) local tt = self:mbox_send_buf(queue_select_ptr_t) tt.vsi_id = self.vsi_id tt.pad = 0 - tt.rx_queues = bits({ ENABLE = self.qno }) - tt.tx_queues = bits({ ENABLE = self.qno }) + local q_enable_mask = lshift(1, nqueues) - 1 + tt.rx_queues = q_enable_mask + tt.tx_queues = q_enable_mask self:mbox_sr('VIRTCHNL_OP_ENABLE_QUEUES', ffi.sizeof(queue_select_t)) end -function Intel_avf:ringnext (index) - return band(index+1, self.ring_buffer_size - 1) +IO = { + config = { + pciaddr = {required=true}, + queue = {required=true} + } +} + +function IO:new (conf) + local self = setmetatable({}, { __index = IO }) + self.pciaddr = pci.qualified(conf.pciaddr) + self.qno = conf.queue + + -- This is also done in Intel_avf:new() but might not have + -- happened yet. 
+ pci.unbind_device_from_linux(self.pciaddr) + + self.fd = pci.open_pci_resource_unlocked(self.pciaddr, 0) + self.base = pci.map_pci_memory(self.fd) + self.r = {} + Intel_avf.load_registers(self) -- Initialize registers at (self.r.*) + + self.online = false -- True when queue is up and running + self.cxq = nil -- shm object containing queue control information + self.open_throttle = -- Timer to throttle shm open attempts (10ms) + lib.throttle(0.25) + + return self +end + +function IO:stop() + if self.cxq then + assert(sync.cas(self.cxq.state, IDLE, FREE) or + self.cxq.state[0] == DEAD, + "illegal state detected") + self:close() + end +end + +-- Close the queue mapping. +function IO:close () + shm.unlink(self.backlink) + shm.unmap(self.cxq) + self.cxq = nil end -function Intel_avf:reclaim_txdesc () +-- Open the queue mapping. +function IO:open () + local shmpath = "group/pci/"..self.pciaddr.."/"..self.qno + self.backlink = "intel_avf/"..self.pciaddr.."/"..self.qno + if shm.exists(shmpath) then + shm.alias(self.backlink, shmpath) + self.cxq = shm.open(shmpath, cxq_t) + if sync.cas(self.cxq.state, FREE, IDLE) then + -- Select queue tail registers + self.r.rx_tail = self.r.QRX_TAIL[self.cxq.qno] + self.r.tx_tail = self.r.QTX_TAIL[self.cxq.qno] + else + close() -- Queue was not FREE. + end + end +end + +-- Return true on successful activation of the queue. +function IO:activate () + -- If not open then make a request on a regular schedule. + if self.cxq == nil and self.open_throttle() then + self:open() + end + if self.cxq then + -- Careful: Control app may have closed the CXQ. + if sync.cas(self.cxq.state, IDLE, BUSY) then + return true + else + assert(self.cxq.state[0] == DEAD, "illegal state detected") + self:close() + end + end +end + +-- Enter the idle state. 
+function IO:deactivate () + assert(sync.cas(self.cxq.state, BUSY, IDLE)) +end + +function IO:reclaim_txdesc () local RS = bits({ RS = 5 }) local COMPLETE = 15 - while band(self.txdesc[ self:ringnext(self.tx_cand) ].cmd_type_offset_bsz, COMPLETE) == COMPLETE - and self.tx_desc_free < self.ring_buffer_size - 1 do - local c = self.tx_cand - packet.free(self.txqueue[c]) - self.txqueue[c] = nil - self.tx_cand = self:ringnext(self.tx_cand) - self.tx_desc_free = self.tx_desc_free + 1 + local cxq = self.cxq + while band(cxq.txdesc[band(cxq.tx_cand+1, cxq.ring_size-1)].cmd_type_offset_bsz, COMPLETE) == COMPLETE + and cxq.tx_desc_free < cxq.ring_size - 1 do + local c = cxq.tx_cand + packet.free(cxq.txqueue[c]) + cxq.txqueue[c] = nil + cxq.tx_cand = band(cxq.tx_cand+1, cxq.ring_size-1) + cxq.tx_desc_free = cxq.tx_desc_free + 1 end end -function Intel_avf:push () - local li = self.input.input +function IO:transmit (li) if li == nil then return end local RS_EOP = bits({ EOP = 4, RS = 5 }) local SIZE_SHIFT = 34 self:reclaim_txdesc() - while not empty(li) and self.tx_desc_free > 0 do + local cxq = self.cxq + while not empty(li) and cxq.tx_desc_free > 0 do local p = receive(li) -- NB: need to extend size for 4 byte CRC (not clear from the spec.) 
local size = lshift(4ULL+p.length, SIZE_SHIFT) - self.txdesc[ self.tx_next ].address = tophysical(p.data) - self.txqueue[ self.tx_next ] = p - self.txdesc[ self.tx_next ].cmd_type_offset_bsz = RS_EOP + size - self.tx_next = self:ringnext(self.tx_next) - self.tx_desc_free = self.tx_desc_free - 1 + cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) + cxq.txqueue[ cxq.tx_next ] = p + cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = RS_EOP + size + cxq.tx_next = band(cxq.tx_next+1, cxq.ring_size-1) + cxq.tx_desc_free = cxq.tx_desc_free - 1 end C.full_memory_barrier() - self.r.tx_tail(band(self.tx_next, self.ring_buffer_size - 1)) - - if self.sync_stats_throttle() then - self:sync_stats() - end + self.r.tx_tail(band(cxq.tx_next, cxq.ring_size - 1)) end -function Intel_avf:pull() - local lo = self.output.output +function IO:receive (lo) if lo == nil then return end local pkts = 0 - while band(self.rxdesc[self.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do - local p = self.rxqueue[self.rx_tail] - p.length = rshift(self.rxdesc[self.rx_tail].write.status_err_type_len, 38) + local cxq = self.cxq + while band(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do + local p = cxq.rxqueue[cxq.rx_tail] + p.length = rshift(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 38) transmit(lo, p) local np = packet.allocate() - self.rxqueue[self.rx_tail] = np - self.rxdesc[self.rx_tail].read.address = tophysical(np.data) - self.rxdesc[self.rx_tail].write.status_err_type_len = 0 - self.rx_tail = band(self.rx_tail + 1, self.ring_buffer_size-1) + cxq.rxqueue[cxq.rx_tail] = np + cxq.rxdesc[cxq.rx_tail].read.address = tophysical(np.data) + cxq.rxdesc[cxq.rx_tail].write.status_err_type_len = 0 + cxq.rx_tail = band(cxq.rx_tail+1, cxq.ring_size-1) pkts = pkts + 1 end -- This avoids the queue being full / empty when HEAD=TAIL C.full_memory_barrier() - self.r.rx_tail(band(self.rx_tail - 1, self.ring_buffer_size - 
1)) - - if self.sync_stats_throttle() then - self:sync_stats() - end + self.r.rx_tail(band(cxq.rx_tail-1, cxq.ring_size-1)) end -function Intel_avf:sync_stats () - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then - self:mbox_r_stats('async') - end - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF'] then - self:mbox_s_stats() +function IO:push () + if self:activate() then + self:transmit(self.input.input) + self:deactivate() end end -function Intel_avf:flush_stats () - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then - self:mbox_r_stats() +function IO:pull () + if self:activate() then + self:receive(self.output.output) + self:deactivate() end - self:mbox_s_stats() - self:mbox_r_stats() end -function Intel_avf:rxdrop () return counter.read(self.shm.rxdrop) end -function Intel_avf:txdrop () return counter.read(self.shm.txdrop) end - function Intel_avf:mbox_setup() local dlen = 4096 self.mbox = { @@ -548,7 +770,7 @@ function Intel_avf:mbox_sr(opcode, datalen) return self:mbox_recv(opcode) end -function Intel_avf:mbox_send(opcode, datalen) +function Intel_avf:mbox_send(opcode, datalen, timeout) assert(opcode == 'VIRTCHNL_OP_RESET_VF' or self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF']) @@ -582,9 +804,11 @@ function Intel_avf:mbox_send(opcode, datalen) self.r.VF_ATQT(self.mbox.next_send_idx) lib.waitfor(function() + assert(not (timeout and timeout()), "timeout") return self.r.VF_ATQT() == self.mbox.next_send_idx end) lib.waitfor(function() + assert(not (timeout and timeout()), "timeout") -- 1 == bits({ DescriptorDone = 0 }) -- 2 == bits({ Complete = 1 }) @@ -685,12 +909,13 @@ function Intel_avf:mbox_recv(opcode, async) return ptr end -function Intel_avf:wait_for_vfgen_rstat() +function Intel_avf:wait_for_vfgen_rstat(timeout) -- Constant names stolen from DPDK drivers/net/avf/base/virtchnl.h -- Section 6.1 on page 51 local mask0 = bits( { VIRTCHNL_VFR_COMPLETED = 1 }) local mask1 = bits( { 
VIRTCHNL_VFR_VFACTIVE = 2 }) lib.waitfor(function () + assert(not (timeout and timeout()), "timeout") local v = self.r.VFGEN_RSTAT() return bit.band(mask0, v) == mask0 or bit.band(mask1, v) == mask1 end) @@ -698,15 +923,10 @@ end function Intel_avf:new(conf) local self = { - pciaddress = conf.pciaddr, + pciaddress = pci.qualified(conf.pciaddr), path = pci.path(conf.pciaddr), r = {}, ring_buffer_size = conf.ring_buffer_size, - - tx_next = 0, - tx_cand = 0, - tx_desc_free = conf.ring_buffer_size - 1, - qno = 0, shm = { rxbytes = {counter}, rxpackets = {counter}, @@ -730,7 +950,7 @@ function Intel_avf:new(conf) self = setmetatable(self, { __index = Intel_avf }) self:supported_hardware() - self.fd = pci.open_pci_resource_unlocked(self.pciaddress, 0) + self.fd = pci.open_pci_resource_locked(self.pciaddress, 0) pci.unbind_device_from_linux(self.pciaddress) pci.set_bus_master(self.pciaddress, true) self.base = pci.map_pci_memory(self.fd) @@ -738,29 +958,44 @@ function Intel_avf:new(conf) -- wait for the nic to be ready, setup the mailbox and then reset it -- that way it doesn't matter what state you where given the card - self:wait_for_vfgen_rstat() - self:mbox_setup() - self:reset() - - -- FIXME - -- I haven't worked out why the sleep is required but without it - -- self_mbox_set_version hangs indefinitely - --C.sleep(1) - -- See elaboration in Intel_avf:reset() + lib.waitfor(function () + return pcall(function () + self:wait_for_vfgen_rstat() + self:mbox_setup() + self:reset() -- reset can timeout + end) + end) -- setup the nic for real self:mbox_setup() self:mbox_sr_version() self:mbox_sr_caps() - self:mbox_s_rss() - self:init_tx_q() - self:init_rx_q() + self:mbox_s_rss(conf.nqueues or 1) + + -- Queue setup + self.cxqs = {} + for qno=0, (conf.nqueues or 1) - 1 do + self.cxqs[#self.cxqs+1] = self:init_cxq(qno) + end self:init_irq() self:mbox_sr_irq() - self:mbox_sr_q() - self:mbox_sr_enable_q() + self:mbox_sr_q(self.cxqs) + self:mbox_sr_enable_q(#self.cxqs) + + for _, 
cxq in ipairs(self.cxqs) do + -- CXQ is now fully initialized & ready for attach. + assert(sync.cas(cxq.state, INIT, FREE)) + end + + if not conf.nqueues then + -- If number of queues it not explicitly configured default to + -- old behavior and configure this app to do I/O on a single queue. + self.io = IO:new{pciaddr=self.pciaddress, queue=0} + self.io.input, self.io.output = {}, {} + end + return self end @@ -769,39 +1004,95 @@ function Intel_avf:link() if not shm.exists("pci/"..self.pciaddress) then shm.alias("pci/"..self.pciaddress, "apps/"..self.appname) end + + if self.io then + self.io.input, self.io.output = self.input, self.output + end +end + +function Intel_avf:push () + if self.io then + self.io:push() + end + if self.sync_stats_throttle() then + self:sync_stats() + end end +function Intel_avf:pull () + if self.io then + self.io:pull() + end + if self.sync_stats_throttle() then + self:sync_stats() + end +end + +function Intel_avf:sync_stats () + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then + self:mbox_r_stats('async') + end + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF'] then + self:mbox_s_stats() + end +end + +function Intel_avf:flush_stats () + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then + self:mbox_r_stats() + end + self:mbox_s_stats() + self:mbox_r_stats() +end + +function Intel_avf:rxdrop () return counter.read(self.shm.rxdrop) end +function Intel_avf:txdrop () return counter.read(self.shm.txdrop) end + function Intel_avf:reset() -- From "Appendix A Virtual Channel Protocol": -- VF sends this request to PF with no parameters PF does NOT respond! VF -- driver must delay then poll VFGEN_RSTAT register until reset completion -- is indicated. The admin queue must be reinitialized after this operation. - self:mbox_send('VIRTCHNL_OP_RESET_VF', 0) + self:mbox_send('VIRTCHNL_OP_RESET_VF', 0, lib.timeout(1)) -- As per the above we (the VF driver) must "delay". 
Sadly, the spec does -- (as of this time / to my knowledge) not give further clues as to how to -- detect that the delay is sufficient. One second turned out to be not -- enough in some cases, two seconds has always worked so far. C.usleep(2e6) - self:wait_for_vfgen_rstat() + self:wait_for_vfgen_rstat(lib.timeout(1)) end function Intel_avf:stop() self:reset() pci.set_bus_master(self.pciaddress, false) pci.close_pci_resource(self.fd, self.base) - -- Free packets remaining in TX/RX queues. - for i = 0, self.ring_buffer_size-1 do - if self.txqueue[i] ~= nil then - packet.free(self.txqueue[i]) - end + -- If we have an embedded IO app, stop it. + if self.io then + self.io:stop() end - for i = 0, self.ring_buffer_size-1 do - packet.free(self.rxqueue[i]) + -- Free packets remaining in TX/RX queues. + for _, cxq in ipairs(self.cxqs) do + local timeout = lib.timeout(3) + lib.waitfor(function () + assert(not timeout(), "Intel_avf: failed to free queue "..tonumber(cxq.qno)) + return sync.cas(cxq.state, FREE, DEAD) or sync.cas(cxq.state, IDLE, DEAD) + end) + self:free_cxq(cxq) end -- Unlink SHM alias. shm.unlink("pci/"..self.pciaddress) end +function Intel_avf:report () + self:flush_stats() + for _, c in ipairs{ + 'rxbytes', 'rxpackets', 'rxmcast', 'rxbcast', 'rxdrop', 'rx_unknown_protocol', + 'txbytes', 'txpackets', 'txmcast', 'txbcast', 'txdrop', 'txerrors' + } do + print((" %-20s %20s"):format(c, lib.comma_value(counter.read(self.shm[c])))) + end +end + function Intel_avf:init_irq() local intv = bit.lshift(20, 5) local v = bit.bor(bits({ ENABLE = 0, CLEARPBA = 1, ITR0 = 3, ITR1 = 4}), intv) @@ -828,13 +1119,32 @@ function Intel_avf:mbox_sr_add_mac() self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', ffi.sizeof(virtchnl_ether_addr_t) + 8) end -function Intel_avf:mbox_s_rss() - -- pg83 - -- Forcefully disable the NICs RSS features. Contrary to the spec, RSS - -- capabilites are turned on by default and need to be disabled (as least - -- under Linux/some NICs.) 
- local tt = self:mbox_send_buf(virtchnl_rss_hena_ptr_t) - self:mbox_sr('VIRTCHNL_OP_SET_RSS_HENA', ffi.sizeof(virtchnl_rss_hena_t)) +function Intel_avf:mbox_s_rss(nqueues) + if nqueues == 1 then + -- pg83 + -- Forcefully disable the NICs RSS features. Contrary to the spec, RSS + -- capabilites are turned on by default and need to be disabled (as least + -- under Linux/some NICs.) + local tt = self:mbox_send_buf(virtchnl_rss_hena_ptr_t) + self:mbox_sr('VIRTCHNL_OP_SET_RSS_HENA', ffi.sizeof(virtchnl_rss_hena_t)) + end + -- Set random RSS key + local tt = self:mbox_send_buf(virtchnl_rss_key_ptr_t) + tt.vsi_id = self.vsi_id + tt.key_len = self.rss_key_size + ffi.copy(tt.key, lib.random_bytes(self.rss_key_size), self.rss_key_size) + self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_KEY', + ffi.sizeof(virtchnl_rss_key_t) + self.rss_key_size-1) + -- Setup LUT + local tt = self:mbox_send_buf(virtchnl_rss_lut_ptr_t) + tt.vsi_id = self.vsi_id + tt.lut_entries = self.rss_lut_size + for i=0, self.rss_lut_size-1 do + tt.lut[i] = i % nqueues -- fill LUT with configured queues + end + self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_LUT', + ffi.sizeof(virtchnl_rss_lut_t) + self.rss_lut_size-1) + end function Intel_avf:mbox_s_stats() diff --git a/src/apps/intel_avf/tests/back2back/test.snabb b/src/apps/intel_avf/tests/back2back/test.snabb index f4ec12c519..67fa2d6979 100755 --- a/src/apps/intel_avf/tests/back2back/test.snabb +++ b/src/apps/intel_avf/tests/back2back/test.snabb @@ -96,7 +96,43 @@ while true do engine.main({ duration = 1, no_report = true }) end engine.report_links() +engine.report_apps() assert(rx("nic1.output", "sink.input") >= tosend, "packets received do not match packets sent") + +-- Test RSS queues +local nqueues = 4 +local c = config.new() +local sizes = {64,128,192,256,384,512,1024,1500} +local packets = {} +for _=1,1000 do packets[#packets+1] = sizes[(#packets%#sizes)+1] end +config.app(c, "synth0", synth.Synth, { + sizes=packets, + src=src, + dst=dst, + random_payload=true +}) 
+config.app(c, "synth1", synth.Synth, { + sizes=packets, + src=dst, + dst=src, + random_payload=true +}) +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, nqueues = nqueues }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, nqueues = nqueues }) +config.app(c, "sink", basic.Sink) +for qno=0, nqueues-1 do + config.app(c, "nic0_io"..qno, intel_avf.IO, {pciaddr = vf0, queue = qno}) + config.link(c, "synth0.output"..qno.. " -> nic0_io"..qno..".input") + config.link(c, "nic0_io"..qno..".output -> sink.input_nic0_io"..qno) + config.app(c, "nic1_io"..qno, intel_avf.IO, {pciaddr = vf1, queue = qno}) + config.link(c, "synth1.output"..qno.. " -> nic1_io"..qno..".input") + config.link(c, "nic1_io"..qno..".output -> sink.input_nic1_io"..qno) +end +engine.configure(c) +engine.main({ duration = 1, no_report = true }) +engine.report_links() +engine.report_apps() + engine.stop() main.exit(0) From 8c151eab1acf9c217fd7069567e639ab9e58104c Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 8 Sep 2021 12:31:26 +0000 Subject: [PATCH 136/209] lwaftr: add support for intel_avf driver --- src/program/lwaftr/setup.lua | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 6234908f55..5fe10c895e 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -222,6 +222,7 @@ end local intel_mp = require("apps.intel_mp.intel_mp") local connectx = require("apps.mellanox.connectx") +local intel_avf = require("apps.intel_avf.intel_avf") function config_intel_mp(c, name, opt) config.app(c, name, intel_mp.driver, { @@ -281,9 +282,35 @@ function config_connectx(c, name, opt, lwconfig) return input, output end +function config_intel_avf(c, name, opt, lwconfig) + local nqueues = 0 + local min_queue + for device, instance in pairs(lwconfig.softwire_config.instance) do + for id, queue in pairs(instance.queue) do + if device == opt.pci or 
queue.external_interface.device == opt.pci then + nqueues = nqueues + 1 + min_queue = (not min_queue and id) or math.min(id, min_queue) + end + end + end + if opt.queue == min_queue then + config.app(c, "IntelAVF_"..opt.pci:gsub("[%.:]", "_"), intel_avf.Intel_avf, { + pciaddr = opt.pci, + vlan = opt.vlan, + nqueues = nqueues + }) + end + config.app(c, name, intel_avf.IO, { + pciaddr = opt.pci, + queue = opt.queue + }) + return name..'.input', name..'.output' +end + function config_nic(c, name, driver, opt, lwconfig) local config_fn = { [intel_mp.driver] = config_intel_mp, - [connectx.driver] = config_connectx } + [connectx.driver] = config_connectx, + [intel_avf.driver] = config_intel_avf } local f = assert(config_fn[driver], "Unsupported device: "..opt.pci) return f(c, name, opt, lwconfig) end From e3dd48ff4a3a5a272fba4ffad611c153a486e195 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 8 Sep 2021 12:38:27 +0000 Subject: [PATCH 137/209] apps.intel_avf: add VLAN filter/stripping/insertion support Also: set a required reserved bit in txdesc (not sure if it had an effect) --- src/apps/intel_avf/intel_avf.lua | 41 +++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 79d5a82c00..54869ef021 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -12,7 +12,7 @@ local macaddress = require("lib.macaddress") local pci = require("lib.hardware.pci") local register = require("lib.hardware.register") local tophysical = core.memory.virtual_to_physical -local band, lshift, rshift = bit.band, bit.lshift, bit.rshift +local band, lshift, rshift, bor = bit.band, bit.lshift, bit.rshift, bit.bor local transmit, receive, empty = link.transmit, link.receive, link.empty local counter = require("core.counter") local shm = require("core.shm") @@ -26,6 +26,7 @@ Intel_avf = { config = { pciaddr = { required=true }, nqueues = {}, + vlan = 
{}, ring_buffer_size = {default=2048} } } @@ -251,6 +252,15 @@ local virtchnl_rss_hena_t = ffi.typeof([[ ]]) local virtchnl_rss_hena_ptr_t = ffi.typeof('$*', virtchnl_rss_hena_t) +local virtchnl_vlan_filter_list_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t num_elements; + uint16_t vlan_id[1]; + } __attribute__((packed)) +]]) +local virtchnl_vlan_filter_list_ptr_t = ffi.typeof('$*', virtchnl_vlan_filter_list_t) + local mbox_q_t = ffi.typeof([[ struct { uint8_t flags0; @@ -340,6 +350,7 @@ local cxq_t = ffi.typeof([[ // configuration information: uint32_t qno; // queue number + uint16_t vlan; // 802.1Q vlan tag uint32_t ring_size; // size of rx/tx rings // Transmit state @@ -384,6 +395,7 @@ function Intel_avf:init_cxq (qno) -- Create a shared memory object for controlling the queue pair local cxq = shm.create("group/pci/"..self.pciaddress.."/"..qno, cxq_t) cxq.qno = qno + cxq.vlan = self.vlan or 0 cxq.ring_size = self.ring_buffer_size self:init_tx_q(cxq) self:init_rx_q(cxq) @@ -658,18 +670,19 @@ end function IO:transmit (li) if li == nil then return end - local RS_EOP = bits({ EOP = 4, RS = 5 }) + local cxq = self.cxq + local RS_EOP = bor(bits({ EOP = 4, RS = 5, RSV = 6 }), (cxq.vlan>0 and bits{ IL2TAG1 = 7}) or 0) + local L2TAG1 = lshift(0ULL+cxq.vlan, 48) local SIZE_SHIFT = 34 self:reclaim_txdesc() - local cxq = self.cxq while not empty(li) and cxq.tx_desc_free > 0 do local p = receive(li) -- NB: need to extend size for 4 byte CRC (not clear from the spec.) 
local size = lshift(4ULL+p.length, SIZE_SHIFT) cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) cxq.txqueue[ cxq.tx_next ] = p - cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = RS_EOP + size + cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP, size, L2TAG1) cxq.tx_next = band(cxq.tx_next+1, cxq.ring_size-1) cxq.tx_desc_free = cxq.tx_desc_free - 1 end @@ -744,7 +757,7 @@ function Intel_avf:mbox_setup() VIRTCHNL_OP_DISABLE_QUEUES = 9, -- VIRTCHNL_OP_ADD_ETH_ADDR = 10, -- VIRTCHNL_OP_DEL_ETH_ADDR = 11, - -- VIRTCHNL_OP_ADD_VLAN = 12, + VIRTCHNL_OP_ADD_VLAN = 12, -- VIRTCHNL_OP_DEL_VLAN = 13, -- VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE = 14, VIRTCHNL_OP_GET_STATS = 15, @@ -841,7 +854,7 @@ function Intel_avf:mbox_sr_caps() -- dpdk/drivers/net/avf/avf_vchnl.c local supported_caps = bits({ VIRTCHNL_VF_OFFLOAD_L2 = 0, - VIRTCHNL_VF_OFFLOAD_VLAN = 16, + VIRTCHNL_VF_OFFLOAD_VLAN = 16, -- NB: Could leave this bit off and let PF handle VLANs VIRTCHNL_VF_OFFLOAD_RX_POLLING = 17, VIRTCHNL_VF_OFFLOAD_RSS_PF = 19 }) @@ -925,6 +938,7 @@ function Intel_avf:new(conf) local self = { pciaddress = pci.qualified(conf.pciaddr), path = pci.path(conf.pciaddr), + vlan = conf.vlan, r = {}, ring_buffer_size = conf.ring_buffer_size, shm = { @@ -970,7 +984,10 @@ function Intel_avf:new(conf) self:mbox_setup() self:mbox_sr_version() self:mbox_sr_caps() - self:mbox_s_rss(conf.nqueues or 1) + self:mbox_sr_rss(conf.nqueues or 1) + if self.vlan then + self:mbox_sr_vlan() + end -- Queue setup self.cxqs = {} @@ -1119,7 +1136,7 @@ function Intel_avf:mbox_sr_add_mac() self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', ffi.sizeof(virtchnl_ether_addr_t) + 8) end -function Intel_avf:mbox_s_rss(nqueues) +function Intel_avf:mbox_sr_rss(nqueues) if nqueues == 1 then -- pg83 -- Forcefully disable the NICs RSS features. 
Contrary to the spec, RSS @@ -1144,7 +1161,15 @@ function Intel_avf:mbox_s_rss(nqueues) end self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_LUT', ffi.sizeof(virtchnl_rss_lut_t) + self.rss_lut_size-1) +end +function Intel_avf:mbox_sr_vlan() + local tt = self:mbox_send_buf(virtchnl_vlan_filter_list_ptr_t) + tt.vsi_id = self.vsi_id + tt.num_elements = 1 + tt.vlan_id[0] = self.vlan + self:mbox_sr('VIRTCHNL_OP_ADD_VLAN', + ffi.sizeof(virtchnl_vlan_filter_list_t) + ffi.sizeof("uint16_t")*1) end function Intel_avf:mbox_s_stats() From dbc4f3430aa43ecd4869b82d7faffe05335d486b Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 10 Sep 2021 15:43:41 +0000 Subject: [PATCH 138/209] lwaftr/lwutil/setup: refactor, add num_queues, is_lowest_queue --- src/apps/lwaftr/lwutil.lua | 17 +++++++++++++++++ src/program/lwaftr/setup.lua | 20 +++++--------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/apps/lwaftr/lwutil.lua b/src/apps/lwaftr/lwutil.lua index a65bd6a81a..0ce9ead217 100644 --- a/src/apps/lwaftr/lwutil.lua +++ b/src/apps/lwaftr/lwutil.lua @@ -32,6 +32,23 @@ function is_on_a_stick(device, queue) return device == queue.external_interface.device end +function is_lowest_queue(conf) + local device, id = parse_instance(conf) + for n in pairs(conf.softwire_config.instance[device].queue) do + if id > n then return false end + end + return true +end + +function num_queues(conf) + local n = 0 + local device, id = parse_instance(conf) + for _ in pairs(conf.softwire_config.instance[device].queue) do + n = n + 1 + end + return n +end + function get_ihl_from_offset(pkt, offset) local ver_and_ihl = pkt.data[offset] return band(ver_and_ihl, 0xf) * 4 diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 5fe10c895e..2ff49f67bf 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -246,9 +246,9 @@ function config_connectx(c, name, opt, lwconfig) opt.vlan or opt.vlan_tag, queue or opt.queue) end + local device = 
lwutil.parse_instance(lwconfig) local queues = {} - local min_queue - for id, queue in pairs(lwconfig.softwire_config.instance[opt.pci].queue) do + for id, queue in pairs(lwconfig.softwire_config.instance[device].queue) do queues[#queues+1] = { id = queue_id(queue.external_interface, id), mac = ethernet:ntop(queue.external_interface.mac), @@ -259,9 +259,8 @@ function config_connectx(c, name, opt, lwconfig) mac = ethernet:ntop(queue.internal_interface.mac), vlan = queue.internal_interface.vlan_tag } - min_queue = (not min_queue and id) or math.min(id, min_queue) end - if opt.queue == min_queue then + if lwutil.is_lowest_queue(lwconfig) then config.app(c, "ConnectX_"..opt.pci:gsub("[%.:]", "_"), connectx.ConnectX, { pciaddress = opt.pci, queues = queues @@ -283,17 +282,8 @@ function config_connectx(c, name, opt, lwconfig) end function config_intel_avf(c, name, opt, lwconfig) - local nqueues = 0 - local min_queue - for device, instance in pairs(lwconfig.softwire_config.instance) do - for id, queue in pairs(instance.queue) do - if device == opt.pci or queue.external_interface.device == opt.pci then - nqueues = nqueues + 1 - min_queue = (not min_queue and id) or math.min(id, min_queue) - end - end - end - if opt.queue == min_queue then + local nqueues = lwutil.num_queues(lwconfig) + if lwutil.is_lowest_queue(lwconfig) then config.app(c, "IntelAVF_"..opt.pci:gsub("[%.:]", "_"), intel_avf.Intel_avf, { pciaddr = opt.pci, vlan = opt.vlan, From 448d626aad725b8f5f5f4c729c0fe0b6edc3120a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 10 Sep 2021 15:44:17 +0000 Subject: [PATCH 139/209] lib.hardware.pci: add Intel X710 device info --- src/lib/hardware/pci.lua | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index 6618b793b1..0f28328b9f 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -67,6 +67,7 @@ model = { ["X520"] = 'Intel X520', ["i350"] = 'Intel 350', ["i210"] = 'Intel 210', + ["X710"] 
= 'Intel X710', ["XL710_VF"] = 'Intel XL710/X710 Virtual Function', ["AVF"] = 'Intel AVF' } @@ -85,6 +86,7 @@ local cards = { ["0x157b"] = {model = model["i210"], driver = 'apps.intel_mp.intel_mp'}, ["0x154c"] = {model = model["XL710_VF"], driver = 'apps.intel_avf.intel_avf'}, ["0x1889"] = {model = model["AVF"], driver = 'apps.intel_avf.intel_avf'}, + ["0x1572"] = {model = model["X710"], driver = nil}, }, ["0x1924"] = { ["0x0903"] = {model = 'SFN7122F', driver = 'apps.solarflare.solarflare'} From d54e29d1f861acceb0dc797f9a9ae0dc8d20b5bd Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 10 Sep 2021 15:48:31 +0000 Subject: [PATCH 140/209] lwaftr: support auto-config of AVF-compatible NICs --- src/program/lwaftr/setup.lua | 75 +++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 14 deletions(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 2ff49f67bf..55f013cce3 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -223,6 +223,17 @@ end local intel_mp = require("apps.intel_mp.intel_mp") local connectx = require("apps.mellanox.connectx") local intel_avf = require("apps.intel_avf.intel_avf") +local intel_avf_pf = require("apps.intel_avf.intel_avf_pf") + +local function cmd(...) + local cmd + for _, part in ipairs({...}) do + if not cmd then cmd = part + else cmd = cmd.." "..part end + end + print("shell:", cmd) + assert(os.execute(cmd)) +end function config_intel_mp(c, name, opt) config.app(c, name, intel_mp.driver, { @@ -297,11 +308,56 @@ function config_intel_avf(c, name, opt, lwconfig) return name..'.input', name..'.output' end +function config_intel_avf_pf(c, name, opt, lwconfig) + local path = "/sys/bus/pci/devices/"..pci.qualified(opt.pci) + local ifname = lib.firstfile(path.."/net") + assert(ifname and lib.can_write(path.."/sriov_numvfs"), + "Unsupported device: "..opt.pci) + local vf = 0 -- which vf should this interface be on?
+ local numvf = 1 -- how many vfs do we need to create on the pf? + local vfmac = {} -- MACs to assign to vfs + local device, _, queue = lwutil.parse_instance(lwconfig) + if lwutil.is_on_a_stick(device, queue) then + numvf = 2 + vfmac[0] = queue.external_interface.mac + vfmac[1] = queue.internal_interface.mac + if ethernet:ntop(opt.mac) == ethernet:ntop(queue.internal_interface.mac) then + vf = 1 + end + else + vfmac[0] = opt.mac + end + if lwutil.is_lowest_queue(lwconfig) then + print("Setting "..path.."/sriov_numvfs = "..numvf) + assert(lib.writefile(path.."/sriov_numvfs", numvf)) + cmd('ip link set up', 'dev', ifname) + cmd('ip link set', ifname, 'vf', 0, 'mac', ethernet:ntop(vfmac[0])) + cmd('ip link set', ifname, 'vf', 0, 'spoofchk off') + pcall(cmd, 'ip link set', ifname, 'vf', 0, 'trust on') + if numvf == 2 then + cmd('ip link set', ifname, 'vf', 1, 'mac', ethernet:ntop(vfmac[1])) + cmd('ip link set', ifname, 'vf', 1, 'spoofchk off') + pcall(cmd, 'ip link set', ifname, 'vf', 1, 'trust on') + end + end + local vfpci = lib.basename(lib.readlink(path.."/virtfn"..vf)) + local avf_opt = { + pci = vfpci, + queue = opt.queue, + mac = opt.mac, + vlan = opt.vlan, + ring_buffer_size = opt.ring_buffer_size + } + return config_intel_avf(c, name, avf_opt, lwconfig) +end + function config_nic(c, name, driver, opt, lwconfig) local config_fn = { [intel_mp.driver] = config_intel_mp, [connectx.driver] = config_connectx, - [intel_avf.driver] = config_intel_avf } - local f = assert(config_fn[driver], "Unsupported device: "..opt.pci) + [intel_avf.driver] = config_intel_avf, + ['maybe_avf?'] = config_intel_avf_pf} + local f = assert(config_fn[(driver and require(driver).driver) or 'maybe_avf?'], + "Unsupported device: "..opt.pci) return f(c, name, opt, lwconfig) end @@ -321,7 +377,7 @@ function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) ring_buffer_size = ring_buffer_size } local v4_input, v4_output = - config_nic(c, v4_nic_name, require(v4_info.driver).driver, 
v4_nic_opt, conf) + config_nic(c, v4_nic_name, v4_info.driver, v4_nic_opt, conf) local v6_nic_opt = { pci = v6_pci, @@ -331,7 +387,7 @@ function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) ring_buffer_size = ring_buffer_size } local v6_input, v6_output = - config_nic(c, v6_nic_name, require(v6_info.driver).driver, v6_nic_opt, conf) + config_nic(c, v6_nic_name, v6_info.driver, v6_nic_opt, conf) link_source(c, v4_output, v6_output) link_sink(c, v4_input, v6_input) @@ -389,15 +445,6 @@ function xdp_ifsetup(conf) for qid in pairs(instance.queue) do assert(qid < nqueues) end - local function cmd(...) - local cmd - for _, part in ipairs({...}) do - if not cmd then cmd = part - else cmd = cmd.." "..part end - end - print("shell:", cmd) - assert(os.execute(cmd)) - end local function ifsetup(ifname, cfg, opts, ip_ntop) cmd('ip link set down', 'dev', ifname) cmd('ip address flush', 'dev', ifname) @@ -457,7 +504,7 @@ end function load_on_a_stick(c, conf, args) local pciaddr, id, queue = lwutil.parse_instance(conf) local device = pci.device_info(pciaddr) - local driver = require(device.driver).driver + local driver = device.driver validate_pci_devices({pciaddr}) lwaftr_app(c, conf, pciaddr) local v4_nic_name, v6_nic_name, v4v6, mirror = args.v4_nic_name, From 80557cfe891d6c8a7b8ad49264cfc3d1b09808e2 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 16 Sep 2021 10:19:36 +0000 Subject: [PATCH 141/209] intel_avf: move bits() constructor out of fast-path lib.bits uses pairs which is a JIT NYI, leading to split traces and GC activity due to snapshotting. 
--- src/apps/intel_avf/intel_avf.lua | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 54869ef021..108a0c4a4e 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -652,9 +652,9 @@ function IO:deactivate () assert(sync.cas(self.cxq.state, BUSY, IDLE)) end +local RS = bits({ RS = 5 }) +local COMPLETE = 15 function IO:reclaim_txdesc () - local RS = bits({ RS = 5 }) - local COMPLETE = 15 local cxq = self.cxq while band(cxq.txdesc[band(cxq.tx_cand+1, cxq.ring_size-1)].cmd_type_offset_bsz, COMPLETE) == COMPLETE @@ -667,11 +667,13 @@ function IO:reclaim_txdesc () end end +local RS_EOP = bits{ EOP = 4, RS = 5, RSV = 6 } +local IL2TAG1 = bits{ IL2TAG1 = 7} function IO:transmit (li) if li == nil then return end local cxq = self.cxq - local RS_EOP = bor(bits({ EOP = 4, RS = 5, RSV = 6 }), (cxq.vlan>0 and bits{ IL2TAG1 = 7}) or 0) + local RS_EOP_IL2TAG1 = bor(RS_EOP, (cxq.vlan>0 and IL2TAG1) or 0) local L2TAG1 = lshift(0ULL+cxq.vlan, 48) local SIZE_SHIFT = 34 @@ -682,7 +684,7 @@ function IO:transmit (li) local size = lshift(4ULL+p.length, SIZE_SHIFT) cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) cxq.txqueue[ cxq.tx_next ] = p - cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP, size, L2TAG1) + cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP_IL2TAG1, size, L2TAG1) cxq.tx_next = band(cxq.tx_next+1, cxq.ring_size-1) cxq.tx_desc_free = cxq.tx_desc_free - 1 end From be7dc08400b28a5290c7597c854d7cd223ca8282 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 20 Sep 2021 15:53:52 +0000 Subject: [PATCH 142/209] program.lwaftr.setup: remove bogus stale require --- src/program/lwaftr/setup.lua | 1 - 1 file changed, 1 deletion(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 55f013cce3..f15579cba1 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -223,7 
+223,6 @@ end local intel_mp = require("apps.intel_mp.intel_mp") local connectx = require("apps.mellanox.connectx") local intel_avf = require("apps.intel_avf.intel_avf") -local intel_avf_pf = require("apps.intel_avf.intel_avf_pf") local function cmd(...) local cmd From 263c4e2f8fca51b7aba314a3c866ca47e23c36f5 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 20 Sep 2021 16:24:41 +0000 Subject: [PATCH 143/209] lwaftr.setup: fix error reporting in cmd util --- src/program/lwaftr/setup.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index f15579cba1..5f796092e2 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -231,7 +231,8 @@ local function cmd(...) else cmd = cmd.." "..part end end print("shell:", cmd) - assert(os.execute(cmd)) + local status = os.execute(cmd) + assert(status == 0, ("Command failed with return code %d"):format(status)) end function config_intel_mp(c, name, opt) From 6df64ca89f9b55ae6838cc7b7ec01d4c8d5cb38a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 20 Sep 2021 16:56:03 +0000 Subject: [PATCH 144/209] core.lib: fix error detection in writefile --- src/core/lib.lua | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/core/lib.lua b/src/core/lib.lua index fcf6a92c64..3d8b330e86 100644 --- a/src/core/lib.lua +++ b/src/core/lib.lua @@ -71,8 +71,7 @@ function writefile (filename, value) local f = io.open(filename, "w") if f == nil then error("Unable to open file: " .. filename) end local result = f:write(value) - f:close() - return result + return f:close() and result end function readlink (path) From 720235f05ba00cdea8ffab46ba5ec6a6c2e65acb Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 20 Sep 2021 16:56:56 +0000 Subject: [PATCH 145/209] lwaftr.setup: print more useful error message... 
when unable to write sriov_numvfs --- src/program/lwaftr/setup.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 5f796092e2..6b5654bbad 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -329,7 +329,8 @@ function config_intel_avf_pf(c, name, opt, lwconfig) end if lwutil.is_lowest_queue(lwconfig) then print("Setting "..path.."/sriov_numvfs = "..numvf) - assert(lib.writefile(path.."/sriov_numvfs", numvf)) + assert(lib.writefile(path.."/sriov_numvfs", numvf), + "Failed to allocate VFs.") cmd('ip link set up', 'dev', ifname) cmd('ip link set', ifname, 'vf', 0, 'mac', ethernet:ntop(vfmac[0])) cmd('ip link set', ifname, 'vf', 0, 'spoofchk off') From fa553a6e824c4f2591791f4a0e0f8cc50a026241 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 22 Sep 2021 12:47:05 +0000 Subject: [PATCH 146/209] apps.intel_avf: move device stats to pci/ --- src/apps/intel_avf/intel_avf.lua | 77 ++++++++++++++++---------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 108a0c4a4e..c41ea28ccf 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -943,22 +943,25 @@ function Intel_avf:new(conf) vlan = conf.vlan, r = {}, ring_buffer_size = conf.ring_buffer_size, - shm = { - rxbytes = {counter}, - rxpackets = {counter}, - rxmcast = {counter}, - rxbcast = {counter}, - rxdrop = {counter}, - rx_unknown_protocol = {counter}, - txbytes = {counter}, - txpackets = {counter}, - txmcast = {counter}, - txbcast = {counter}, - txdrop = {counter}, - txerrors = {counter} - }, sync_stats_throttle = lib.throttle(1) } + -- PCI device statistics + local frame = { + macaddr = {counter}, + rxbytes = {counter}, + rxpackets = {counter}, + rxmcast = {counter}, + rxbcast = {counter}, + rxdrop = {counter}, + rxerrors = {counter}, + txbytes = {counter}, + txpackets = 
{counter}, + txmcast = {counter}, + txbcast = {counter}, + txdrop = {counter}, + txerrors = {counter} + } + self.stats = shm.create_frame("pci/"..self.pciaddress, frame) -- pg79 /* number of descriptors, multiple of 32 */ assert(self.ring_buffer_size % 32 == 0, @@ -990,6 +993,9 @@ function Intel_avf:new(conf) if self.vlan then self:mbox_sr_vlan() end + + -- publish device MAC address to SHM + counter.set(self.stats.macaddr, self.mac.bits) -- Queue setup self.cxqs = {} @@ -1019,11 +1025,6 @@ function Intel_avf:new(conf) end function Intel_avf:link() - -- Alias SHM frame to canonical location. - if not shm.exists("pci/"..self.pciaddress) then - shm.alias("pci/"..self.pciaddress, "apps/"..self.appname) - end - if self.io then self.io.input, self.io.output = self.input, self.output end @@ -1056,7 +1057,7 @@ function Intel_avf:sync_stats () end end -function Intel_avf:flush_stats () +function Intel_avf:flush_stats () if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then self:mbox_r_stats() end @@ -1064,8 +1065,8 @@ function Intel_avf:flush_stats () self:mbox_r_stats() end -function Intel_avf:rxdrop () return counter.read(self.shm.rxdrop) end -function Intel_avf:txdrop () return counter.read(self.shm.txdrop) end +function Intel_avf:rxdrop () return counter.read(self.stats.rxdrop) end +function Intel_avf:txdrop () return counter.read(self.stats.txdrop) end function Intel_avf:reset() -- From "Appendix A Virtual Channel Protocol": @@ -1098,17 +1099,17 @@ function Intel_avf:stop() end) self:free_cxq(cxq) end - -- Unlink SHM alias. + -- Unlink stats frame. 
shm.unlink("pci/"..self.pciaddress) end function Intel_avf:report () self:flush_stats() for _, c in ipairs{ - 'rxbytes', 'rxpackets', 'rxmcast', 'rxbcast', 'rxdrop', 'rx_unknown_protocol', + 'rxbytes', 'rxpackets', 'rxmcast', 'rxbcast', 'rxdrop', 'rxdrop', 'txbytes', 'txpackets', 'txmcast', 'txbcast', 'txdrop', 'txerrors' } do - print((" %-20s %20s"):format(c, lib.comma_value(counter.read(self.shm[c])))) + print((" %-20s %20s"):format(c, lib.comma_value(counter.read(self.stats[c])))) end end @@ -1187,18 +1188,18 @@ function Intel_avf:mbox_r_stats(async) local stats = ffi.cast(eth_stats_ptr_t, ret) local set = counter.set - set(self.shm.rxbytes, stats.rx_bytes) - set(self.shm.rxpackets, stats.rx_unicast) - set(self.shm.rxmcast, stats.rx_multicast) - set(self.shm.rxbcast, stats.rx_broadcast) - set(self.shm.rxdrop, stats.rx_discards) - set(self.shm.rx_unknown_protocol, stats.rx_unknown_protocol) - - set(self.shm.txbytes, stats.tx_bytes) - set(self.shm.txpackets, stats.tx_unicast) - set(self.shm.txmcast, stats.tx_multicast) - set(self.shm.txbcast, stats.tx_broadcast) - set(self.shm.txdrop, stats.tx_discards) - set(self.shm.txerrors, stats.tx_errors) + set(self.stats.rxbytes, stats.rx_bytes) + set(self.stats.rxpackets, stats.rx_unicast) + set(self.stats.rxmcast, stats.rx_multicast) + set(self.stats.rxbcast, stats.rx_broadcast) + set(self.stats.rxdrop, stats.rx_discards) + set(self.stats.rxdrop, stats.rx_unknown_protocol) + + set(self.stats.txbytes, stats.tx_bytes) + set(self.stats.txpackets, stats.tx_unicast) + set(self.stats.txmcast, stats.tx_multicast) + set(self.stats.txbcast, stats.tx_broadcast) + set(self.stats.txdrop, stats.tx_discards) + set(self.stats.txerrors, stats.tx_errors) end From 85428eab2214562db3b7b96dec4c71e3eb678ca5 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 22 Sep 2021 12:48:02 +0000 Subject: [PATCH 147/209] snabb top: handle devices that do not specify speed --- src/program/top/top.lua | 5 +++-- 1 file changed, 3 insertions(+), 2 
deletions(-) diff --git a/src/program/top/top.lua b/src/program/top/top.lua index 4ae37c5286..6db5d3617f 100644 --- a/src/program/top/top.lua +++ b/src/program/top/top.lua @@ -643,7 +643,7 @@ function compute_display_tree.interface(tree, prev, dt, t) rchars('%s:', tag:upper()), lchars('%.3f %sPPS', scale(pps)), lchars('%.3f %sbps', scale(bps)), - lchars('%.2f%%', bps/max*100), + max > 0 and lchars('%.2f%%', bps/max*100) or nil, drops > 0 and rchars('%.3f %sPPS dropped', scale(drops)) or nil) end local function show_pci(addr, pci, prev) @@ -651,7 +651,8 @@ function compute_display_tree.interface(tree, prev, dt, t) gridrow(rchars('| '), lchars('')) gridrow(rchars('\\-'), rchars('%s:', addr), - lchars('%d %sbE, MAC: %s', bps, tag, + lchars('%sMAC: %s', + (bps > 0 and ("%d %sbE, "):format(bps, tag)) or '', macaddr_string(tonumber(pci.macaddr and pci.macaddr.value) or 0))) show_traffic('rx', pci, prev) show_traffic('tx', pci, prev) From 89c48fc670de1c98c611456a7642793a95b867ab Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 1 Oct 2021 11:30:22 +0000 Subject: [PATCH 148/209] lib.ptree: start manager before setting initial configuration We observed a deadlock in the syscall migrate_pages when calling it from the worker processes concurrently to the manager process. This seems to avoid this by making sure that the manager process binds to a NUMA node before forking the worker processes. The relevant call chain here is: Manager:start() -> cpuset:bind_to_numa_node() -> numa.bind_to_numa_node() -> S.migrate_pages(...) 
--- src/lib/ptree/ptree.lua | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/ptree/ptree.lua b/src/lib/ptree/ptree.lua index a6aa22232a..e1deb0bae3 100644 --- a/src/lib/ptree/ptree.lua +++ b/src/lib/ptree/ptree.lua @@ -110,10 +110,10 @@ function new_manager (conf) ret.rpc_callee = rpc.prepare_callee('snabb-config-leader-v1') ret.rpc_handler = rpc.dispatch_handler(ret, 'rpc_', ret.trace) - ret:set_initial_configuration(conf.initial_configuration) - ret:start() + ret:set_initial_configuration(conf.initial_configuration) + return ret end From 4379d1ccd577a2e4408eafa778aa54ed63baf084 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 1 Oct 2021 14:45:33 +0000 Subject: [PATCH 149/209] snabb-softwire-v2.yang: fix descriptions of device leaves --- src/lib/yang/snabb-softwire-v2.yang | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 6af1db9735..21401af086 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -573,8 +573,8 @@ module snabb-softwire-v2 { description "The PCI device the instance should use during lwAFTR operation. If device is configured in on-a-stick mode, the 'external-interface' - device should not be configured. If the 'external-interface is - specified this option should specify the PCI device of the + device should not be configured. If the 'external-interface' + device is specified this option should specify the PCI device of the 'internal-interface' (IPv6 traffic only)."; } @@ -605,7 +605,7 @@ module snabb-softwire-v2 { } leaf device { description - "PCI device of the instance uses for external IPv6 traffic. If this + "PCI device of the instance uses for external IPv4 traffic. 
If this is left unspecified the lwAFTR configures itself in on-a-stick mode."; type string; From 98e97b407bbae8b8bfb7888b6a9485f1e3cd8eb3 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 1 Oct 2021 14:46:13 +0000 Subject: [PATCH 150/209] Revert "snabb-softwire-v2.yang: fix descriptions of device leaves" This reverts commit 4379d1ccd577a2e4408eafa778aa54ed63baf084. --- src/lib/yang/snabb-softwire-v2.yang | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 21401af086..6af1db9735 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -573,8 +573,8 @@ module snabb-softwire-v2 { description "The PCI device the instance should use during lwAFTR operation. If device is configured in on-a-stick mode, the 'external-interface' - device should not be configured. If the 'external-interface' - device is specified this option should specify the PCI device of the + device should not be configured. If the 'external-interface is + specified this option should specify the PCI device of the 'internal-interface' (IPv6 traffic only)."; } @@ -605,7 +605,7 @@ module snabb-softwire-v2 { } leaf device { description - "PCI device of the instance uses for external IPv4 traffic. If this + "PCI device of the instance uses for external IPv6 traffic. If this is left unspecified the lwAFTR configures itself in on-a-stick mode."; type string; From 75a3373eee0c2aeee4f513c9e1bcdc97e482e6f7 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 1 Oct 2021 14:55:48 +0000 Subject: [PATCH 151/209] Revert "lwaftr: keep full config in worker setup" This reverts commit d4a1083588add7e7107084d83c2742635490ffb4. Instead we attach instance device/queue_id meta-data to the worker configs using a setmetatable, and ensure that all config manipulation based on the worker queue id is performed within the manager process. 
Specifically, this means moving select_instance() from apps.lwaftr.lwaftr to apps.lwaftr.lwutil and calling it from program.lwaftr.setup. (I also took the liberty to make this function pure to avoid future confusion on my end.) --- src/apps/lwaftr/lwaftr.lua | 18 +----------------- src/apps/lwaftr/lwutil.lua | 20 ++++++++++++++++++++ src/lib/yang/snabb-softwire-v2.yang | 13 ------------- src/program/lwaftr/setup.lua | 6 +++--- 4 files changed, 24 insertions(+), 33 deletions(-) diff --git a/src/apps/lwaftr/lwaftr.lua b/src/apps/lwaftr/lwaftr.lua index 8fd945c129..0259cd3fc3 100644 --- a/src/apps/lwaftr/lwaftr.lua +++ b/src/apps/lwaftr/lwaftr.lua @@ -339,21 +339,6 @@ local function drop(pkt) packet.free(pkt) end -local function select_instance(conf) - local function table_merge(t1, t2) - local ret = {} - for k,v in pairs(t1) do ret[k] = v end - for k,v in pairs(t2) do ret[k] = v end - return ret - end - local device, id, queue = lwutil.parse_instance(conf) - conf.softwire_config.external_interface = table_merge( - conf.softwire_config.external_interface, queue.external_interface) - conf.softwire_config.internal_interface = table_merge( - conf.softwire_config.internal_interface, queue.internal_interface) - return conf -end - LwAftr = { yang_schema = 'snabb-softwire-v2' } -- Fields: -- - direction: "in", "out", "hairpin", "drop"; @@ -420,9 +405,8 @@ LwAftr.shm = { function LwAftr:new(conf) if conf.debug then debug = true end local o = setmetatable({}, {__index=LwAftr}) - conf = select_instance(conf).softwire_config + conf = conf.softwire_config o.conf = conf - o.binding_table = bt.load(conf.binding_table) o.inet_lookup_queue = bt.BTLookupQueue.new(o.binding_table) o.hairpin_lookup_queue = bt.BTLookupQueue.new(o.binding_table) diff --git a/src/apps/lwaftr/lwutil.lua b/src/apps/lwaftr/lwutil.lua index 0ce9ead217..023a6edc89 100644 --- a/src/apps/lwaftr/lwutil.lua +++ b/src/apps/lwaftr/lwutil.lua @@ -21,6 +21,7 @@ local ntohs = lib.ntohs -- Return device PCI address, 
queue ID, and queue configuration. function parse_instance(conf) + assert(conf.worker_config, "conf missing instance/queue metadata.") local device = conf.worker_config.device local id = conf.worker_config.queue_id local queue = conf.softwire_config.instance[device].queue[id] @@ -49,6 +50,25 @@ function num_queues(conf) return n end +function select_instance(conf) + local function table_merge(t1, t2) + local ret = {} + for k,v in pairs(t1) do ret[k] = v end + for k,v in pairs(t2) do ret[k] = v end + return ret + end + local device, id, queue = parse_instance(conf) + local copy = {softwire_config={}} + for k,v in pairs(conf.softwire_config) do + copy.softwire_config[k] = v + end + copy.softwire_config.external_interface = table_merge( + conf.softwire_config.external_interface, queue.external_interface) + copy.softwire_config.internal_interface = table_merge( + conf.softwire_config.internal_interface, queue.internal_interface) + return copy +end + function get_ihl_from_offset(pkt, offset) local ver_and_ihl = pkt.data[offset] return band(ver_and_ihl, 0xf) * 4 diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 6af1db9735..47cc5aaae1 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -886,18 +886,5 @@ module snabb-softwire-v2 { } } - container worker-config { - description - "Worker process configuration state. The contained leaves are used only - internally. 
Setting them has no effect."; - - leaf device { - type string; - } - leaf queue-id { - type uint8; - } - } - uses state-counters; } diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 6b5654bbad..697e0b6297 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -87,7 +87,7 @@ function lwaftr_app(c, conf) { address = convert_ipv4(iexternal_interface.ip) }) config.app(c, "icmpechov6", ipv6_echo.ICMPEcho, { address = iinternal_interface.ip }) - config.app(c, "lwaftr", lwaftr.LwAftr, conf) + config.app(c, "lwaftr", lwaftr.LwAftr, lwutil.select_instance(conf)) config.app(c, "fragmenterv4", ipv4_fragment.Fragmenter, { mtu=gexternal_interface.mtu }) config.app(c, "fragmenterv6", ipv6_fragment.Fragmenter, @@ -827,8 +827,8 @@ local function compute_worker_configs(conf) for id, _ in pairs(queues.queue) do local worker_id = string.format('%s/%s', device, id) local worker_config = make_copy() - worker_config.worker_config = {device=device, queue_id=id} - ret[worker_id] = worker_config + local meta = {worker_config = {device=device, queue_id=id}} + ret[worker_id] = setmetatable(worker_config, {__index=meta}) end end return ret From bff9d0fab194a12b318bf5c2668627cb21aae0c8 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 4 Oct 2021 11:01:24 +0000 Subject: [PATCH 152/209] lib.numa: gracefully handle unset PATH in assert_irqbalanced_disabled --- src/lib/numa.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lib/numa.lua b/src/lib/numa.lua index 04756c9f6e..49ea0da3bf 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -145,7 +145,9 @@ local irqbalanced_checked = false local function assert_irqbalanced_disabled (warn) if irqbalanced_checked then return end irqbalanced_checked = true - for path in os.getenv('PATH'):split(':') do + local env_path = os.getenv('PATH') + if not env_path then return end + for path in env_path:split(':') do if S.stat(path..'/irqbalance') then if 
S.stat('/etc/default/irqbalance') then for line in io.lines('/etc/default/irqbalance') do From b97ccc25c2c1e38f248757d45c5a71f27c3059f5 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 4 Oct 2021 13:37:12 +0000 Subject: [PATCH 153/209] Revert "snabb-softwire-v2: allow more than two queues" This reverts commit bce8579eb8ea0f2d8d4298b848952f3e1ec624e8. --- src/lib/yang/snabb-softwire-v2.yang | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 47cc5aaae1..17239e71a1 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -583,7 +583,7 @@ module snabb-softwire-v2 { key "id"; leaf id { - type uint8; + type uint8 { range 0..1; } description "RSS queue on which to attach. Traffic will be partitioned evenly between instances servicing queues on the same @@ -591,7 +591,12 @@ module snabb-softwire-v2 { is a function of the TCP or UDP source and destination ports (if any) and the source and destination IPv4 or IPv6 addresses. Fragmented packets will be delivered to the - lowest-numbered queue."; + lowest-numbered queue. + + Note that currently the lwAFTR is restricted to running at + most 2 RSS workers per device. This limitation may be lifted + to 4 soon. Raising it farther is possible but needs changes + to how the lwAFTR uses its PCI devices."; } container external-interface { From 21013be89fc81964b4db3401304b39011d8f5fa6 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 4 Oct 2021 13:37:45 +0000 Subject: [PATCH 154/209] Revert "snabb-softwire-v2.yang: documentation edits" This reverts commit 1cdc6352d48d51e7e53b7d50b91c4be43ca7e19f. 
--- src/lib/yang/snabb-softwire-v2.yang | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 17239e71a1..5942f682d2 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -678,12 +678,12 @@ module snabb-softwire-v2 { leaf ip { type inet:ipv6-address; description - "IPv6 address of the next hop for the internal-facing NIC. - The lwAFTR will resolve this to a MAC address using NDP."; + "IPv4 address of the next hop for the internet-facing NIC. + The lwAFTR will resolve this to a MAC address using ARP."; } leaf resolved-mac { config false; - description "Resolved next-hop mac address found by NDP."; + description "Resolved next-hop mac address found by ARP."; type yang:mac-address; } } @@ -692,7 +692,7 @@ module snabb-softwire-v2 { type yang:mac-address; description "Statically configured MAC address of the next hop for the - internal-facing NIC."; + internet-facing NIC."; } } } From f903e0f84b4b05088c2ac08de9196eb6ae613bcc Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 4 Oct 2021 13:38:00 +0000 Subject: [PATCH 155/209] Revert "snabb-softwire-v2: add default for leaf" This reverts commit dca300464830d7ded3403a6cea88a4f8b94ac7fe. 
--- src/lib/yang/snabb-softwire-v2.yang | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v2.yang b/src/lib/yang/snabb-softwire-v2.yang index 5942f682d2..bd14a9e760 100644 --- a/src/lib/yang/snabb-softwire-v2.yang +++ b/src/lib/yang/snabb-softwire-v2.yang @@ -11,11 +11,6 @@ module snabb-softwire-v2 { description "Configuration for the Snabb Switch lwAFTR."; - revision 2021-07-13 { - description - "Add default value for error-rate-limiting/packets."; - } - revision 2019-09-17 { description "Add discontinuity time to softwire-state."; @@ -513,7 +508,6 @@ module snabb-softwire-v2 { container error-rate-limiting { leaf packets { type uint32; - default 200; description "The number of ICMP error messages which can be sent within the specified time period."; From 0288de4ac39e3907ace4a79a124eaa6ac568086d Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 4 Oct 2021 16:28:19 +0000 Subject: [PATCH 156/209] lib.yang.data: parse defaults of nested leaves This fixes a bug where the YANG parser would not return default values for leaves which nested in containers left unspecified in the configuration. --- src/lib/yang/data.lua | 65 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/src/lib/yang/data.lua b/src/lib/yang/data.lua index 2d172eb0a7..c06c52e50f 100644 --- a/src/lib/yang/data.lua +++ b/src/lib/yang/data.lua @@ -388,10 +388,9 @@ end local function struct_parser(keyword, members, ctype) local keys = {} for k,v in pairs(members) do table.insert(keys, k) end - local function init() return nil end - local function parse1(P) - local ret = {} - local expanded_members = {} + local ret, expanded_members + local function init() + ret, expanded_members = {}, {} for _,k in ipairs(keys) do if members[k].represents then -- Choice fields don't include the name of the choice block in the data. 
They @@ -407,6 +406,8 @@ local function struct_parser(keyword, members, ctype) expanded_members[k] = members[k] end end + end + local function parse1(P) P:skip_whitespace() P:consume("{") P:skip_whitespace() @@ -422,10 +423,6 @@ local function struct_parser(keyword, members, ctype) ret[id] = sub.parse(P, ret[id], k) P:skip_whitespace() end - for k,_ in pairs(expanded_members) do - local id = normalize_id(k) - ret[id] = expanded_members[k].finish(ret[id], k) - end return ret end local function parse(P, out) @@ -434,12 +431,14 @@ local function struct_parser(keyword, members, ctype) end local struct_t = ctype and typeof(ctype) local function finish(out, leaf) + for k,_ in pairs(expanded_members) do + out = out or {} + local id = normalize_id(k) + out[id] = expanded_members[k].finish(out[id], k) + end -- FIXME check mandatory values. if struct_t then - local ret - if out == nil then ret = struct_t() - else ret = struct_t(out) end - return ret + return struct_t(out) else return out end @@ -633,7 +632,9 @@ local function table_parser(keyword, keys, values, native_key, key_ctype, return assoc end local function finish(assoc) - return assoc:finish() + if assoc then + return assoc:finish() + end end return {init=init, parse=parse, finish=finish} end @@ -1843,6 +1844,44 @@ function selftest() ]]) assert(object.summary.shelves_active) + -- Test nested defaults + local default_schema = [[module default-schema { + namespace "urn:ietf:params:xml:ns:yang:default-schema"; + prefix "default"; + + container optional { + leaf default { + type string; + default "foo"; + } + } + }]] + local loaded_schema = schema.load_schema(default_schema) + local object = load_config_for_schema(loaded_schema, + mem.open_input_string "") + assert(object.optional) + assert(object.optional.default == "foo") + + local default2_schema = [[module default2-schema { + namespace "urn:ietf:params:xml:ns:yang:default2-schema"; + prefix "default"; + + container optional1 { + container optional2 { + leaf default 
{ + type string; + default "foo"; + } + } + } + }]] + local loaded_schema = schema.load_schema(default2_schema) + local object = load_config_for_schema(loaded_schema, + mem.open_input_string "") + assert(object.optional1) + assert(object.optional1.optional2) + assert(object.optional1.optional2.default == "foo") + -- Test choice field. local choice_schema = schema.load_schema([[module choice-schema { namespace "urn:ietf:params:xml:ns:yang:choice-schema"; From eea42d2b660d73445691ba49ac5c97538ebc31df Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 4 Oct 2021 16:28:52 +0000 Subject: [PATCH 157/209] lib.numa: do not migrate pages from preferred node --- src/lib/numa.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib/numa.lua b/src/lib/numa.lua index 49ea0da3bf..f3ab02418a 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -232,6 +232,7 @@ function bind_to_numa_node (node, policy) -- Migrate any pages that might have the wrong affinity. local from_mask = assert(S.get_mempolicy(nil, nil, nil, 'mems_allowed')).mask + from_mask[node] = false local ok, err = S.migrate_pages(0, from_mask, node) if not ok then warn("Failed to migrate pages to NUMA node %d: %s\n", From 1588452edc3ec7714140e7416ffbb5d926198465 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 19 Oct 2021 09:50:52 +0000 Subject: [PATCH 158/209] lib.protocol.ethernet: fix truncation bug in ptoi bit munging was broken because of truncation to u8 --- src/lib/protocol/ethernet.lua | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lib/protocol/ethernet.lua b/src/lib/protocol/ethernet.lua index ad8e4dd2db..da9a122e88 100644 --- a/src/lib/protocol/ethernet.lua +++ b/src/lib/protocol/ethernet.lua @@ -70,12 +70,12 @@ end function ethernet:ptoi (p) local n = ethernet:pton(p) assert(ffi.abi("le")) - return bit.bor(bit.lshift(n[0], 40), - bit.lshift(n[1], 32), - bit.lshift(n[2], 24), - bit.lshift(n[3], 16), - bit.lshift(n[4], 8), - bit.lshift(n[5], 0)) + 
return bit.bor(bit.lshift(0ULL+n[0], 40), + bit.lshift(0ULL+n[1], 32), + bit.lshift(0ULL+n[2], 24), + bit.lshift(0ULL+n[3], 16), + bit.lshift(0ULL+n[4], 8), + bit.lshift(0ULL+n[5], 0)) end -- Mapping of an IPv6 multicast address to a MAC address per RFC2464, From a6734ae53c9e9295bee4696eacbca2d474cc7d55 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 20 Oct 2021 09:46:22 +0000 Subject: [PATCH 159/209] lwaftr: qualify shared_next_mac_key with vlan This allows testing with 1-to-n port setups by e.g. using n 10G ports in n vlans for load generation connected to one >10G port used by lwaftr. --- src/program/lwaftr/setup.lua | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 697e0b6297..d7f25ecb2b 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -96,14 +96,16 @@ function lwaftr_app(c, conf) { self_ip = iinternal_interface.ip, self_mac = iinternal_interface.mac, next_mac = iinternal_interface.next_hop.mac, - shared_next_mac_key = "group/"..device.."-ipv6-next-mac", + shared_next_mac_key = ("group/%s-ipv6-next-mac-%d"):format( + device, iinternal_interface.vlan_tag or 0), next_ip = iinternal_interface.next_hop.ip, alarm_notification = conf.alarm_notification }) config.app(c, "arp", arp.ARP, { self_ip = convert_ipv4(iexternal_interface.ip), self_mac = iexternal_interface.mac, next_mac = iexternal_interface.next_hop.mac, - shared_next_mac_key = "group/"..device.."-ipv4-next-mac", + shared_next_mac_key = ("group/%s-ipv4-next-mac-%d"):format( + device, iexternal_interface.vlan_tag or 0), next_ip = convert_ipv4(iexternal_interface.next_hop.ip), alarm_notification = conf.alarm_notification }) From c1c4854d17ef0a9aa77ea222427d1aaf06e90f5a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 20 Oct 2021 10:55:01 +0000 Subject: [PATCH 160/209] core.packet: fix account_free physical capacity formula --- src/core/packet.lua | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/src/core/packet.lua b/src/core/packet.lua index 1495483403..0916149f0f 100644 --- a/src/core/packet.lua +++ b/src/core/packet.lua @@ -269,7 +269,8 @@ function account_free (p) counter.add(engine.freebytes, p.length) -- Calculate bits of physical capacity required for packet on 10GbE -- Account for minimum data size and overhead of CRC and inter-packet gap - counter.add(engine.freebits, (math.max(p.length, 46) + 4 + 5) * 8) + -- https://en.wikipedia.org/wiki/Ethernet_frame + counter.add(engine.freebits, (12 + 8 + math.max(p.length, 60) + 4) * 8) end function free (p) From 8c44e614d805173f7fa081edf49f9ae0c557bac0 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 5 Nov 2021 13:32:21 +0000 Subject: [PATCH 161/209] apps.intel_avf: fix IRQ setup for additional queues --- src/apps/intel_avf/intel_avf.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index c41ea28ccf..144167fff1 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -1004,7 +1004,7 @@ function Intel_avf:new(conf) end self:init_irq() - self:mbox_sr_irq() + self:mbox_sr_irq(conf.nqueues or 1) self:mbox_sr_q(self.cxqs) self:mbox_sr_enable_q(#self.cxqs) @@ -1121,12 +1121,12 @@ function Intel_avf:init_irq() self.r.VFINT_DYN_CTLN[0](v) end -function Intel_avf:mbox_sr_irq() +function Intel_avf:mbox_sr_irq(nqueues) local tt = self:mbox_send_buf(virtchnl_irq_map_info_ptr_t) tt.num_vectors = 1 tt.vsi_id = self.vsi_id tt.vector_id = 0 - tt.rxq_map = 1 + tt.rxq_map = 2^nqueues-1 -- disable interrupts for all queues self:mbox_sr("VIRTCHNL_OP_CONFIG_IRQ_MAP", ffi.sizeof(virtchnl_irq_map_info_t) + 12) end From 18fc9f5bff0e5021163a57c88a742e90badc6830 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 09:32:56 +0000 Subject: [PATCH 162/209] lwaftr: fix bugs introduced in 75a3373e NB: extra leaves got stripped by config 
serialization. Merge interface configs in lwaftr:new, pass valid-per-schema config to app but remove all but the respective instance/queue. Update lwutil.parse_instance to fall back to old behavior (pre d4a10835). --- src/apps/lwaftr/lwaftr.lua | 2 +- src/apps/lwaftr/lwutil.lua | 50 ++++++++++++++++++++++++++++++-------- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/src/apps/lwaftr/lwaftr.lua b/src/apps/lwaftr/lwaftr.lua index 0259cd3fc3..72f98fa1dc 100644 --- a/src/apps/lwaftr/lwaftr.lua +++ b/src/apps/lwaftr/lwaftr.lua @@ -405,7 +405,7 @@ LwAftr.shm = { function LwAftr:new(conf) if conf.debug then debug = true end local o = setmetatable({}, {__index=LwAftr}) - conf = conf.softwire_config + conf = lwutil.merge_instance(conf).softwire_config o.conf = conf o.binding_table = bt.load(conf.binding_table) o.inet_lookup_queue = bt.BTLookupQueue.new(o.binding_table) diff --git a/src/apps/lwaftr/lwutil.lua b/src/apps/lwaftr/lwutil.lua index 023a6edc89..13312908d7 100644 --- a/src/apps/lwaftr/lwutil.lua +++ b/src/apps/lwaftr/lwutil.lua @@ -7,6 +7,7 @@ local bit = require("bit") local ffi = require("ffi") local lib = require("core.lib") local cltable = require("lib.cltable") +local binary = require("lib.yang.binary") local band = bit.band local cast = ffi.cast @@ -21,11 +22,23 @@ local ntohs = lib.ntohs -- Return device PCI address, queue ID, and queue configuration. 
function parse_instance(conf) - assert(conf.worker_config, "conf missing instance/queue metadata.") - local device = conf.worker_config.device - local id = conf.worker_config.queue_id - local queue = conf.softwire_config.instance[device].queue[id] - return device, id, queue + if conf.worker_config then + local device = conf.worker_config.device + local id = conf.worker_config.queue_id + local queue = conf.softwire_config.instance[device].queue[id] + return device, id, queue + else + local device, id + for dev in pairs(conf.softwire_config.instance) do + assert(not device, "Config contains more than one device") + device = dev + end + for queue in pairs(conf.softwire_config.instance[device].queue) do + assert(not id, "Config contains more than one queue") + id = queue + end + return device, id, conf.softwire_config.instance[device].queue[id] + end end function is_on_a_stick(device, queue) @@ -51,17 +64,34 @@ function num_queues(conf) end function select_instance(conf) + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v2') + local device, id = parse_instance(conf) + local copy = copier(conf)() + local instance = copy.softwire_config.instance + for other_device, queues in pairs(conf.softwire_config.instance) do + if other_device ~= device then + instance[other_device] = nil + else + for other_id, _ in pairs(queues.queue) do + if other_id ~= id then + instance[device].queue[other_id] = nil + end + end + end + end + return copy +end + +function merge_instance (conf) local function table_merge(t1, t2) local ret = {} for k,v in pairs(t1) do ret[k] = v end for k,v in pairs(t2) do ret[k] = v end return ret end - local device, id, queue = parse_instance(conf) - local copy = {softwire_config={}} - for k,v in pairs(conf.softwire_config) do - copy.softwire_config[k] = v - end + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v2') + local copy = copier(conf)() + local _, _, queue = parse_instance(conf) 
copy.softwire_config.external_interface = table_merge( conf.softwire_config.external_interface, queue.external_interface) copy.softwire_config.internal_interface = table_merge( From 3c669a9050badbc2d5dbb2a4c63f82fef9008e45 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 09:47:18 +0000 Subject: [PATCH 163/209] lwaftr: create snabb-softwire-v3 --- src/lib/yang/snabb-softwire-v3.yang | 894 ++++++++++++++++++++++++++++ 1 file changed, 894 insertions(+) create mode 100644 src/lib/yang/snabb-softwire-v3.yang diff --git a/src/lib/yang/snabb-softwire-v3.yang b/src/lib/yang/snabb-softwire-v3.yang new file mode 100644 index 0000000000..a0d18bfb9d --- /dev/null +++ b/src/lib/yang/snabb-softwire-v3.yang @@ -0,0 +1,894 @@ +module snabb-softwire-v3 { + yang-version 1.1; + namespace snabb:softwire-v3; + prefix softwire; + + import ietf-inet-types { prefix inet; } + import ietf-yang-types { prefix yang; } + + organization "Snabb"; + contact "Max Rottenkolber "; + description + "Configuration for the Snabb lwAFTR."; + + revision 2021-11-08 { + description + "Change module+namespace to v3. Update organization and contact."; + } + + revision 2019-09-17 { + description + "Add discontinuity time to softwire-state."; + } + + revision 2018-10-13 { + description + "Add flow-label setting."; + } + + revision 2017-04-17 { + description + "Removal of br-address leaf-list and br leaf. It adds + br-address to binding_table.softwire. This is to + make the schema more yang-like. One now only needs to specify + the br-address on the softwire rather than managing the indices + into a leaf-list of them. + + This also removes the psid-map list and adds a new port-set + container on the softwire container instead. This will help + adding the softwires as well as bring it more in line with the + ietf-softwire schema. 
+ + The addition of /softwire-config/instance allows for configuring + multiple instances of the lwAFTR with a shared binding table and + other common configuration properties."; + } + + revision 2016-11-04 { + description + "Initial revision."; + } + + grouping state-counters { + container softwire-state { + + description "State data about interface."; + config false; + + leaf discontinuity-time { + type yang:date-and-time; + mandatory true; + description + "The time of the most recent occasion on which the lwaftr instance + suffered a discontinuity. This is set to the current time whenever + the lwaftr instance is started or configured."; + } + + leaf drop-all-ipv4-iface-bytes { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv4 interfaces, + whether or not they are actually IPv4 (they only include data about + packets that go in/out over the wires, excluding internally generated + ICMP packets)."; + } + leaf drop-all-ipv4-iface-packets { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv4 interfaces, + whether or not they are actually IPv4 (they only include data about + packets that go in/out over the wires, excluding internally generated + ICMP packets)."; + } + leaf drop-all-ipv6-iface-bytes { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv6 interfaces, + whether or not they are actually IPv6 (they only include data about packets + that go in/out over the wires, excluding internally generated ICMP + packets)."; + } + leaf drop-all-ipv6-iface-packets { + type yang:zero-based-counter64; + description + "All dropped packets and bytes that came in over IPv6 interfaces, + whether or not they are actually IPv6 (they only include data about packets + that go in/out over the wires, excluding internally generated ICMP + packets)."; + } + leaf drop-bad-checksum-icmpv4-bytes { + type yang:zero-based-counter64; + 
description 
"ICMPv4 packets dropped because of a bad checksum."; + } + leaf drop-bad-checksum-icmpv4-packets { + type yang:zero-based-counter64; + description "ICMPv4 packets dropped because of a bad checksum."; + } + leaf drop-in-by-policy-icmpv4-bytes { + type yang:zero-based-counter64; + description "Incoming ICMPv4 packets dropped because of current policy."; + } + leaf drop-in-by-policy-icmpv4-packets { + type yang:zero-based-counter64; + description "Incoming ICMPv4 packets dropped because of current policy."; + } + leaf drop-in-by-policy-icmpv6-bytes { + type yang:zero-based-counter64; + description "Incoming ICMPv6 packets dropped because of current policy."; + } + leaf drop-in-by-policy-icmpv6-packets { + type yang:zero-based-counter64; + description "Incoming ICMPv6 packets dropped because of current policy."; + } + leaf drop-in-by-rfc7596-icmpv4-bytes { + type yang:zero-based-counter64; + description + "Incoming ICMPv4 packets with no destination (RFC 7596 section 8.1)."; + } + leaf drop-in-by-rfc7596-icmpv4-packets { + type yang:zero-based-counter64; + description + "Incoming ICMPv4 packets with no destination (RFC 7596 section 8.1)."; + } + leaf drop-ipv4-frag-disabled { + type yang:zero-based-counter64; + description + "If fragmentation is disabled, the only potentially non-zero IPv4 + fragmentation counter is drop-ipv4-frag-disabled. If fragmentation is + enabled, it will always be zero."; + } + leaf drop-ipv4-frag-invalid-reassembly { + type yang:zero-based-counter64; + description + "Two or more IPv4 fragments were received, and reassembly was started, + but was invalid and dropped. 
Causes include multiple fragments claiming + they are the last fragment, overlapping fragment offsets, or the packet + was being reassembled from too many fragments (the setting is + max_fragments_per_reassembly_packet, and the default is that no packet + should be reassembled from more than 40)."; + } + leaf drop-ipv4-frag-random-evicted { + type yang:zero-based-counter64; + description + "Reassembling an IPv4 packet from fragments was in progress, but the + configured amount of packets to reassemble at once was exceeded, so one + was dropped at random. Consider increasing the setting + max_ipv4_reassembly_packets."; + } + leaf drop-ipv6-frag-disabled { + type yang:zero-based-counter64; + description + "If fragmentation is disabled, the only potentially non-zero IPv6 + fragmentation counter is drop-ipv6-frag-disabled. If fragmentation is + enabled, it will always be zero."; + } + leaf drop-ipv6-frag-invalid-reassembly { + type yang:zero-based-counter64; + description + "Two or more IPv6 fragments were received, and reassembly was started, + but was invalid and dropped. Causes include multiple fragments claiming + they are the last fragment, overlapping fragment offsets, or the packet + was being reassembled from too many fragments (the setting is + max_fragments_per_reassembly_packet, and the default is that no packet + should be reassembled from more than 40)."; + } + leaf drop-ipv6-frag-random-evicted { + type yang:zero-based-counter64; + description + "Reassembling an IPv6 packet from fragments was in progress, but the + configured amount of packets to reassemble at once was exceeded, so one + was dropped at random. 
Consider increasing the setting + max_ipv6_reassembly_packets."; + } + leaf drop-misplaced-not-ipv4-bytes { + type yang:zero-based-counter64; + description "Non-IPv4 packets incoming on the IPv4 link."; + } + leaf drop-misplaced-not-ipv4-packets { + type yang:zero-based-counter64; + description "Non-IPv4 packets incoming on the IPv4 link."; + } + leaf drop-misplaced-not-ipv6-bytes { + type yang:zero-based-counter64; + description "Non-IPv6 packets incoming on IPv6 link."; + } + leaf drop-misplaced-not-ipv6-packets { + type yang:zero-based-counter64; + description "Non-IPv6 packets incoming on IPv6 link."; + } + leaf drop-no-dest-softwire-ipv4-bytes { + type yang:zero-based-counter64; + description + "No matching destination softwire in the binding table; incremented + whether or not the reason was RFC7596."; + } + leaf drop-no-dest-softwire-ipv4-packets { + type yang:zero-based-counter64; + description + "No matching destination softwire in the binding table; incremented + whether or not the reason was RFC7596."; + } + leaf drop-no-source-softwire-ipv6-bytes { + type yang:zero-based-counter64; + description + "No matching source softwire in the binding table; incremented whether + or not the reason was RFC7596."; + } + leaf drop-no-source-softwire-ipv6-packets { + type yang:zero-based-counter64; + description + "No matching source softwire in the binding table; incremented whether + or not the reason was RFC7596."; + } + leaf drop-out-by-policy-icmpv4-packets { + type yang:zero-based-counter64; + description + "Internally generated ICMPv4 error packets dropped because of current + policy."; + } + leaf drop-out-by-policy-icmpv6-packets { + type yang:zero-based-counter64; + description + "Internally generated ICMPv6 packets dropped because of current + policy."; + } + leaf drop-over-mtu-but-dont-fragment-ipv4-bytes { + type yang:zero-based-counter64; + description + "IPv4 packets whose size exceeded the MTU, but the DF (Don't Fragment) + flag was set."; + } + leaf 
drop-over-mtu-but-dont-fragment-ipv4-packets { + type yang:zero-based-counter64; + description + "IPv4 packets whose size exceeded the MTU, but the DF (Don't Fragment) + flag was set."; + } + leaf drop-over-rate-limit-icmpv6-bytes { + type yang:zero-based-counter64; + description + "Packets dropped because the outgoing ICMPv6 rate limit was reached."; + } + leaf drop-over-rate-limit-icmpv6-packets { + type yang:zero-based-counter64; + description + "Packets dropped because the outgoing ICMPv6 rate limit was reached."; + } + leaf drop-over-time-but-not-hop-limit-icmpv6-bytes { + type yang:zero-based-counter64; + description + "Packet's time limit was exceeded, but the hop limit was not."; + } + leaf drop-over-time-but-not-hop-limit-icmpv6-packets { + type yang:zero-based-counter64; + description + "Packet's time limit was exceeded, but the hop limit was not."; + } + leaf drop-too-big-type-but-not-code-icmpv6-bytes { + type yang:zero-based-counter64; + description + "Packet's ICMP type was 'Packet too big' but its ICMP code was not an + acceptable one for this type."; + } + leaf drop-too-big-type-but-not-code-icmpv6-packets { + type yang:zero-based-counter64; + description + "Packet's ICMP type was 'Packet too big' but its ICMP code was not an + acceptable one for this type."; + } + leaf drop-ttl-zero-ipv4-bytes { + type yang:zero-based-counter64; + description "IPv4 packets dropped because their TTL was zero."; + } + leaf drop-ttl-zero-ipv4-packets { + type yang:zero-based-counter64; + description "IPv4 packets dropped because their TTL was zero."; + } + leaf drop-unknown-protocol-icmpv6-bytes { + type yang:zero-based-counter64; + description "Packets with an unknown ICMPv6 protocol."; + } + leaf drop-unknown-protocol-icmpv6-packets { + type yang:zero-based-counter64; + description "Packets with an unknown ICMPv6 protocol."; + } + leaf drop-unknown-protocol-ipv6-bytes { + type yang:zero-based-counter64; + description "Packets with an unknown IPv6 protocol."; + } + 
leaf drop-unknown-protocol-ipv6-packets { + type yang:zero-based-counter64; + description "Packets with an unknown IPv6 protocol."; + } + leaf hairpin-ipv4-bytes { + type yang:zero-based-counter64; + description "IPv4 packets going to a known b4 (hairpinned)."; + } + leaf hairpin-ipv4-packets { + type yang:zero-based-counter64; + description "IPv4 packets going to a known b4 (hairpinned)."; + } + leaf in-ipv4-bytes { + type yang:zero-based-counter64; + description "All valid incoming IPv4 packets."; + } + leaf in-ipv4-frag-needs-reassembly { + type yang:zero-based-counter64; + description "An IPv4 fragment was received."; + } + leaf in-ipv4-frag-reassembled { + type yang:zero-based-counter64; + description "A packet was successfully reassembled from IPv4 fragments."; + } + leaf in-ipv4-frag-reassembly-unneeded { + type yang:zero-based-counter64; + description + "An IPv4 packet which was not a fragment was received - consequently, + it did not need to be reassembled. This should be the usual case."; + } + leaf in-ipv4-packets { + type yang:zero-based-counter64; + description "All valid incoming IPv4 packets."; + } + leaf in-ipv6-bytes { + type yang:zero-based-counter64; + description "All valid incoming IPv6 packets."; + } + leaf in-ipv6-frag-needs-reassembly { + type yang:zero-based-counter64; + description "An IPv6 fragment was received."; + } + leaf in-ipv6-frag-reassembled { + type yang:zero-based-counter64; + description "A packet was successfully reassembled from IPv6 fragments."; + } + leaf in-ipv6-frag-reassembly-unneeded { + type yang:zero-based-counter64; + description + "An IPv6 packet which was not a fragment was received - consequently, it + did not need to be reassembled. 
This should be the usual case."; + } + leaf in-ipv6-packets { + type yang:zero-based-counter64; + description "All valid incoming IPv6 packets."; + } + leaf ingress-packet-drops { + type yang:zero-based-counter64; + description "Packets dropped due to ingress filters."; + } + leaf memuse-ipv4-frag-reassembly-buffer { + type yang:zero-based-counter64; + description + "The amount of memory being used by the statically sized data structure + for reassembling IPv4 fragments. This is directly proportional to the + setting max_ipv4_reassembly_packets."; + } + leaf memuse-ipv6-frag-reassembly-buffer { + type yang:zero-based-counter64; + description + "The amount of memory being used by the statically sized data structure + for reassembling IPv6 fragments. This is directly proportional to the + setting max_ipv6_reassembly_packets."; + } + leaf out-icmpv4-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 packets."; + } + leaf out-icmpv4-packets { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 packets."; + } + leaf out-icmpv6-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv6 error packets."; + } + leaf out-icmpv6-packets { + type yang:zero-based-counter64; + description "Internally generated ICMPv6 error packets."; + } + leaf out-ipv4-bytes { + type yang:zero-based-counter64; + description "Valid outgoing IPv4 packets."; + } + leaf out-ipv4-frag { + type yang:zero-based-counter64; + description + "An outgoing packet exceeded the configured IPv4 MTU, so needed to be + fragmented. 
This may happen, but should be unusual."; + } + leaf out-ipv4-frag-not { + type yang:zero-based-counter64; + description + "An outgoing packet was small enough to pass through unfragmented - this + should be the usual case."; + } + leaf out-ipv4-packets { + type yang:zero-based-counter64; + description "Valid outgoing IPv4 packets."; + } + leaf out-ipv6-bytes { + type yang:zero-based-counter64; + description "All valid outgoing IPv6 packets."; + } + leaf out-ipv6-frag { + type yang:zero-based-counter64; + description + "An outgoing packet exceeded the configured IPv6 MTU, so needed to be + fragmented. This may happen, but should be unusual."; + } + leaf out-ipv6-frag-not { + type yang:zero-based-counter64; + description + "An outgoing packet was small enough to pass through unfragmented - this + should be the usual case."; + } + leaf out-ipv6-packets { + type yang:zero-based-counter64; + description "All valid outgoing IPv6 packets."; + } + } + } + + container softwire-config { + description + "Configuration for Snabb lwaftr."; + + leaf name { + type string; + description + "Name of lwAFTR instance. This must be unique amongst the Snabb + processes on the system. This may be specified either here, in the + YANG configuration or via the command line when the lwAFTR is started. + + The order of precedence for this leaf is as follows: + 1. The name set on an already running lwAFTR instance via snabb set. + 2. A command line option to specify the name upon starting the lwAFTR + instance (i.e. overriding this value). + 3. The value here in the configuration when starting a lwaftr instance. + + If no name is specified the lwaftr can be referred to using the PID of + the lwAFTR process on the system."; + } + + grouping traffic-filters { + description + "Ingress and egress filters describing the set of packets + that should be allowed to pass, as pflang filters. pflang + is the language of tcpdump, libpcap and other tools. 
Note + that if VLAN tagging is enabled, the filters run on packets + after VLAN tags have been stripped off."; + leaf ingress-filter { + type string; + description + "Filter for incoming traffic. Packets that do not match + the filter will be silently dropped."; + } + leaf egress-filter { + type string; + description + "Filter for outgoing traffic. Packets that do not match + the filter will be silently dropped."; + } + } + + grouping icmp-policy { + description + "The lwAFTR can be configured to allow or drop incoming ICMP + messages, and to generate outgoing ICMP error messages or + not."; + + leaf allow-incoming-icmp { + type boolean; + default true; + description + "Whether to allow incoming ICMP packets."; + } + + leaf generate-icmp-errors { + type boolean; + default true; + description + "Whether to generate outgoing ICMP error messages."; + } + } + + grouping vlan-tagging { + description + "802.1Q Ethernet tagging."; + + leaf vlan-tag { + type uint16 { + range 0..4095; + } + description + "802.1Q Ethernet VLAN tag for this interface."; + } + } + + grouping error-rate-limiting { + description + "These settings limit the rate of ICMP error message + transmission."; + + container error-rate-limiting { + leaf packets { + type uint32; + description + "The number of ICMP error messages which can be sent within + the specified time period."; + } + + leaf period { + type uint32 { range 1..max; } + default 2; + description + "The time period given in seconds."; + } + } + } + + grouping reassembly { + description + "These settings limit the resources devoted to reassembling + fragmented packets."; + + container reassembly { + leaf max-fragments-per-packet { + type uint32 { range 1..max; } + default 20; + description + "The maximum number of fragments per reassembled packet. 
+ Attempts to reassemble a packet using more fragments than + this threshold will fail and the reassembly data will be + discarded."; + } + + leaf max-packets { + type uint32; + default 20000; + description + "The maximum number of concurrent reassembly attempts. If + this limit is reached, an additional reassembly will cause + random eviction of an ongoing reassembly. Note that this + setting directly affects memory usage; the memory buffer + allocated to reassembly is this maximum number of + reassemblies times 25 kilobytes each."; + } + } + } + + + list instance { + description + "Provides configuration for specific instances of the lwAFTR. + These configuration options will only affect the specific lwaftr + with the given name specified in the name leaf. The other options + not present in this list are shared amongst all instances."; + + key "device"; + + leaf device { + type string; + description + "The PCI device the instance should use during lwAFTR operation. If + device is configured in on-a-stick mode, the 'external-interface' + device should not be configured. If the 'external-interface is + specified this option should specify the PCI device of the + 'internal-interface' (IPv6 traffic only)."; + } + + list queue { + description "List of Receive-Side Scaling (RSS) queues."; + key "id"; + + leaf id { + type uint8 { range 0..1; } + description + "RSS queue on which to attach. Traffic will be partitioned + evenly between instances servicing queues on the same + interface. The queue to which an incoming packet is assigned + is a function of the TCP or UDP source and destination ports + (if any) and the source and destination IPv4 or IPv6 + addresses. Fragmented packets will be delivered to the + lowest-numbered queue. + + Note that currently the lwAFTR is restricted to running at + most 2 RSS workers per device. This limitation may be lifted + to 4 soon. 
Raising it farther is possible but needs changes + to how the lwAFTR uses its PCI devices."; + } + + container external-interface { + leaf ip { + type inet:ipv4-address; + mandatory true; + description + "L3 Address of the internet-facing network interface. Used + when generating error messages and responding to ICMP echo + requests."; + } + leaf device { + description + "PCI device of the instance uses for external IPv6 traffic. If this + is left unspecified the lwAFTR configures itself in on-a-stick + mode."; + type string; + } + leaf mac { + type yang:mac-address; + mandatory true; + description + "MAC address of the internet-facing NIC."; + } + + uses vlan-tagging; + + container next-hop { + choice address { + mandatory true; + case ip { + leaf ip { + type inet:ipv4-address; + description + "IPv4 address of the next hop for the internet-facing NIC. + The lwAFTR will resolve this to a MAC address using ARP."; + } + leaf resolved-mac { + config false; + description "Resolved next-hop mac address found by ARP."; + type yang:mac-address; + } + } + case mac { + leaf mac { + type yang:mac-address; + description + "Statically configured MAC address of the next hop for the + internet-facing NIC."; + } + } + } + } + } + + container internal-interface { + leaf ip { + type inet:ipv6-address; + mandatory true; + description + "L3 Address of the internal-facing network interface. Used + when generating error messages and responding to ICMP echo + requests."; + } + leaf mac { + type yang:mac-address; + mandatory true; + description + "MAC address of the internal-facing NIC."; + } + + uses vlan-tagging; + + + container next-hop { + choice address { + mandatory true; + case ip { + leaf ip { + type inet:ipv6-address; + description + "IPv4 address of the next hop for the internet-facing NIC. 
+ The lwAFTR will resolve this to a MAC address using ARP."; + } + leaf resolved-mac { + config false; + description "Resolved next-hop mac address found by ARP."; + type yang:mac-address; + } + } + case mac { + leaf mac { + type yang:mac-address; + description + "Statically configured MAC address of the next hop for the + internet-facing NIC."; + } + } + } + } + } + } + + uses state-counters; + } + + container external-interface { + description + "Configuration for the external, internet-facing IPv4 + interface."; + + leaf mtu { + type uint16; + default 1460; + description + "Maximum packet size to send on the IPv4 interface."; + } + + leaf mru { + type uint16; + default 1460; + description + "Maximum packet size to receive on the IPv4 interface."; + } + + uses traffic-filters; + uses icmp-policy; + uses error-rate-limiting; + uses reassembly; + + + } + + container internal-interface { + description + "Configuration for the internal IPv6 interface."; + + leaf mtu { + type uint16; + default 1500; + description + "Maximum packet size to send on the IPv6 interface."; + } + + leaf mru { + type uint16; + default 1460; + description + "Maximum packet size to receive on the IPv6 interface."; + } + + leaf flow-label { + type uint32; + default 0; + description + "IPv6 flow label"; + } + + uses traffic-filters; + uses icmp-policy; + uses vlan-tagging; + uses error-rate-limiting; + uses reassembly; + + leaf hairpinning { + type boolean; + default true; + description + "Indicates whether to support hairpinning of traffic between + two B4s."; + } + } + + container binding-table { + description + "A collection of softwires (tunnels), along with a description + of the IPv4 and IPv6 addresses handled by the lwAFTR."; + + list softwire { + key "ipv4 psid"; + + leaf ipv4 { + type inet:ipv4-address; + mandatory true; + description + "Public IPv4 address of the softwire."; + } + + leaf padding { + type uint16; + default 0; + } + + leaf br-address { + type inet:ipv6-address; + 
mandatory true; + description + "The B4-facing address of the lwAFTR for this softwire."; + } + + leaf b4-ipv6 { + type inet:ipv6-address; + mandatory true; + description + "B4 address."; + } + + leaf psid { + type uint16; + mandatory true; + description "Port set ID."; + } + + container port-set { + description + "The set of IPv4 addresses managed by the lwAFTR, along with + the way in which those IPv4 addresses share ports. A PSID map + entry associates a PSID length and reserved-ports-bit-count + with each IPv4 address served by the lwAFTR. + + The lightweight 4-over-6 architecture supports sharing of + IPv4 addresses by partitioning the space of TCP/UDP/ICMP + ports into disjoint \"port sets\". Each softwire associated + with an IPv4 address corresponds to a different set of ports + on that address. The way that the ports are partitioned is + specified in RFC 7597: each address has an associated set + of parameters that specifies how to compute a \"port set + identifier\" (PSID) from a given port. + + 0 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + +-----------+-----------+-------+ + Ports in | A | PSID | j | + the CE port set | > 0 | | | + +-----------+-----------+-------+ + | a bits | k bits |m bits | + + Figure 2: Structure of a Port-Restricted Port Field + + Source: http://tools.ietf.org/html/rfc7597#section-5.1 + + We find the specification's names to be a bit obtuse, so we + refer to them using the following names: + + a bits = reserved-ports-bit-count. + k bits = psid-length. + m bits = shift. + + The shift parameter is calculated from psid-length and + reserved-ports-bit-count. The calculation performed to + get the value of shift is: + + shift = 16 - psid-length - reserved-ports-bit-count"; + + leaf psid-length { + type uint8 { range 0..16; } + mandatory true; + description + "The number of bits devoted to the PSID in the port map. + If the psid-length is N, then the IPv4 address will be + shared 2^N ways. 
Note that psid-length, shift, and + reserved-ports-bit-count must add up to 16."; + } + + leaf reserved-ports-bit-count { + type uint8 { range 0..16; } + default 0; + description + "Reserve the lowest 2^N ports so that they map to no + softwire. This can be useful to prevent the low 1024 + ports (for example) from being mapped to customers. Note + that psid-length and shift must add up to less than or + equal to 16."; + } + } + } + + container version { + description + "Optional versioning for binding table. The versioning information + will change on every update or change to the binding table."; + + leaf number { + type uint64; + description "Incremental version number."; + } + leaf date { + type yang:date-and-time; + description "Timestamp of last change."; + } + } + } + } + + uses state-counters; +} From faeb6ffaa0affeebce86341cb816caf1d5145196 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 10:00:45 +0000 Subject: [PATCH 164/209] snabb-softwire-v3: fix mistakes in leaf descr. --- src/lib/yang/snabb-softwire-v3.yang | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v3.yang b/src/lib/yang/snabb-softwire-v3.yang index a0d18bfb9d..90e796140c 100644 --- a/src/lib/yang/snabb-softwire-v3.yang +++ b/src/lib/yang/snabb-softwire-v3.yang @@ -13,7 +13,8 @@ module snabb-softwire-v3 { revision 2021-11-08 { description - "Change module+namespace to v3. Update organization and contact."; + "Change module+namespace to v3. Update organization and contact. + Fix mistakes in leaf descriptions."; } revision 2019-09-17 { @@ -572,7 +573,7 @@ module snabb-softwire-v3 { description "The PCI device the instance should use during lwAFTR operation. If device is configured in on-a-stick mode, the 'external-interface' - device should not be configured. 
If 'external-interface' is specified this option should specify the PCI device of the 'internal-interface' (IPv6 traffic only)."; } @@ -609,7 +610,7 @@ module snabb-softwire-v3 { } leaf device { description - "PCI device of the instance uses for external IPv6 traffic. If this + "PCI device of the instance used for external IPv4 traffic. If this is left unspecified the lwAFTR configures itself in on-a-stick mode."; type string; @@ -677,12 +678,12 @@ module snabb-softwire-v3 { leaf ip { type inet:ipv6-address; description - "IPv4 address of the next hop for the internet-facing NIC. - The lwAFTR will resolve this to a MAC address using ARP."; + "IPv6 address of the next hop for the internal-facing NIC. + The lwAFTR will resolve this to a MAC address using NDP."; } leaf resolved-mac { config false; - description "Resolved next-hop mac address found by ARP."; + description "Resolved next-hop mac address found by NDP."; type yang:mac-address; } } @@ -691,7 +692,7 @@ module snabb-softwire-v3 { type yang:mac-address; description "Statically configured MAC address of the next hop for the - internet-facing NIC."; + internal-facing NIC."; } } } From 2dddff659dffcf731bb9bad01ae00a51a4678b6c Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 10:02:32 +0000 Subject: [PATCH 165/209] snabb-softwire-v3: Add default value for error-rate-limiting/packets --- src/lib/yang/snabb-softwire-v3.yang | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/lib/yang/snabb-softwire-v3.yang b/src/lib/yang/snabb-softwire-v3.yang index 90e796140c..4ca8db5441 100644 --- a/src/lib/yang/snabb-softwire-v3.yang +++ b/src/lib/yang/snabb-softwire-v3.yang @@ -14,7 +14,8 @@ module snabb-softwire-v3 { revision 2021-11-08 { description "Change module+namespace to v3. Update organization and contact. - Fix mistakes in leaf descriptions."; + Fix mistakes in leaf descriptions. 
+ Add default value for error-rate-limiting/packets."; } revision 2019-09-17 { @@ -514,6 +515,7 @@ module snabb-softwire-v3 { container error-rate-limiting { leaf packets { type uint32; + default 200; description "The number of ICMP error messages which can be sent within the specified time period."; From eff1a9ae40ffe3179a4e8ec6e3f6174b60cf761b Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 10:04:17 +0000 Subject: [PATCH 166/209] snabb-softwire-v3: allow more than two queues --- src/lib/yang/snabb-softwire-v3.yang | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v3.yang b/src/lib/yang/snabb-softwire-v3.yang index 4ca8db5441..dd00040a90 100644 --- a/src/lib/yang/snabb-softwire-v3.yang +++ b/src/lib/yang/snabb-softwire-v3.yang @@ -15,7 +15,8 @@ module snabb-softwire-v3 { description "Change module+namespace to v3. Update organization and contact. Fix mistakes in leaf descriptions. - Add default value for error-rate-limiting/packets."; + Add default value for error-rate-limiting/packets. + Allow more than two queues (lift id leaf range restriction)."; } revision 2019-09-17 { @@ -585,7 +586,7 @@ module snabb-softwire-v3 { key "id"; leaf id { - type uint8 { range 0..1; } + type uint8; description "RSS queue on which to attach. Traffic will be partitioned evenly between instances servicing queues on the same @@ -593,12 +594,7 @@ module snabb-softwire-v3 { is a function of the TCP or UDP source and destination ports (if any) and the source and destination IPv4 or IPv6 addresses. Fragmented packets will be delivered to the - lowest-numbered queue. - - Note that currently the lwAFTR is restricted to running at - most 2 RSS workers per device. This limitation may be lifted - to 4 soon. 
Raising it farther is possible but needs changes - to how the lwAFTR uses its PCI devices."; + lowest-numbered queue."; } container external-interface { From 1f8ee8172e1d023cd363e18e4e40a7622e50d0b6 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 10:11:33 +0000 Subject: [PATCH 167/209] snabb-softwire-v3: external-interface/device -> external-device --- src/lib/yang/snabb-softwire-v3.yang | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/lib/yang/snabb-softwire-v3.yang b/src/lib/yang/snabb-softwire-v3.yang index dd00040a90..a146c0f0cb 100644 --- a/src/lib/yang/snabb-softwire-v3.yang +++ b/src/lib/yang/snabb-softwire-v3.yang @@ -16,7 +16,8 @@ module snabb-softwire-v3 { "Change module+namespace to v3. Update organization and contact. Fix mistakes in leaf descriptions. Add default value for error-rate-limiting/packets. - Allow more than two queues (lift id leaf range restriction)."; + Allow more than two queues (lift id leaf range restriction). + Move leaf external-interface/device up as external-device."; } revision 2019-09-17 { @@ -575,12 +576,20 @@ module snabb-softwire-v3 { type string; description "The PCI device the instance should use during lwAFTR operation. If - device is configured in on-a-stick mode, the 'external-interface' - device should not be configured. If 'external-interface' is + device is configured in on-a-stick mode, 'external-device' + should not be configured. If 'external-device' is specified this option should specify the PCI device of the 'internal-interface' (IPv6 traffic only)."; } + leaf external-device { + type string; + description + "PCI device the instance should use for the 'external-interface' + (IPv4 traffic only). 
If this is left unspecified the lwAFTR + configures itself in on-a-stick mode."; + } + list queue { description "List of Receive-Side Scaling (RSS) queues."; key "id"; @@ -606,13 +615,6 @@ module snabb-softwire-v3 { when generating error messages and responding to ICMP echo requests."; } - leaf device { - description - "PCI device of the instance used for external IPv4 traffic. If this - is left unspecified the lwAFTR configures itself in on-a-stick - mode."; - type string; - } leaf mac { type yang:mac-address; mandatory true; From d75f079309900dd38c7ffebcbb4cf9172d443ea3 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 11:08:29 +0000 Subject: [PATCH 168/209] lwaftr: use snabb-softwire-v3 schema --- src/apps/lwaftr/binding_table.lua | 2 +- src/apps/lwaftr/lwaftr.lua | 4 +-- src/apps/lwaftr/lwutil.lua | 11 ++++--- ...-softwire-v2.lua => snabb-softwire-v3.lua} | 32 +++++++++---------- src/lib/yang/path_data.lua | 2 +- src/lib/yang/schema.lua | 2 +- src/program/alarms/set_operator_state/README | 2 +- .../lwaftr/compile_configuration/README | 2 +- .../compile_configuration.lua | 2 +- src/program/lwaftr/counters.lua | 4 +-- src/program/lwaftr/doc/configuration.md | 31 ++++++++---------- .../lwaftr/generate_configuration/README | 2 +- .../migrate_configuration.lua | 6 ++-- src/program/lwaftr/run/README | 2 +- src/program/lwaftr/run/run.lua | 13 ++++---- src/program/lwaftr/setup.lua | 17 +++++----- .../lwaftr/tests/propbased/genyang.lua | 4 +-- .../lwaftr/tests/subcommands/config_test.py | 6 ++-- 18 files changed, 71 insertions(+), 73 deletions(-) rename src/lib/ptree/support/{snabb-softwire-v2.lua => snabb-softwire-v3.lua} (96%) diff --git a/src/apps/lwaftr/binding_table.lua b/src/apps/lwaftr/binding_table.lua index 977606f4b5..b2b0c77f6f 100644 --- a/src/apps/lwaftr/binding_table.lua +++ b/src/apps/lwaftr/binding_table.lua @@ -279,7 +279,7 @@ function selftest() local mem = require("lib.stream.mem") local yang = require('lib.yang.yang') local data = 
require('lib.yang.data') - local schema = yang.load_schema_by_name('snabb-softwire-v2') + local schema = yang.load_schema_by_name('snabb-softwire-v3') local grammar = data.config_grammar_from_schema(schema) local subgrammar = assert(grammar.members['softwire-config']) local subgrammar = assert(subgrammar.members['binding-table']) diff --git a/src/apps/lwaftr/lwaftr.lua b/src/apps/lwaftr/lwaftr.lua index 72f98fa1dc..405423dc5f 100644 --- a/src/apps/lwaftr/lwaftr.lua +++ b/src/apps/lwaftr/lwaftr.lua @@ -339,7 +339,7 @@ local function drop(pkt) packet.free(pkt) end -LwAftr = { yang_schema = 'snabb-softwire-v2' } +LwAftr = { yang_schema = 'snabb-softwire-v3' } -- Fields: -- - direction: "in", "out", "hairpin", "drop"; -- If "direction" is "drop": @@ -449,7 +449,7 @@ end -- The following two methods are called by lib.ptree.worker in reaction -- to binding table changes, via --- lib/ptree/support/snabb-softwire-v2.lua. +-- lib/ptree/support/snabb-softwire-v3.lua. function LwAftr:add_softwire_entry(entry_blob) self.binding_table:add_softwire_entry(entry_blob) end diff --git a/src/apps/lwaftr/lwutil.lua b/src/apps/lwaftr/lwutil.lua index 13312908d7..5556f2f9f8 100644 --- a/src/apps/lwaftr/lwutil.lua +++ b/src/apps/lwaftr/lwutil.lua @@ -41,9 +41,10 @@ function parse_instance(conf) end end -function is_on_a_stick(device, queue) - if not queue.external_interface.device and device then return true end - return device == queue.external_interface.device +function is_on_a_stick(conf, device) + local instance = conf.softwire_config.instance[device] + if not instance.external_device then return true end + return device == instance.external_device end function is_lowest_queue(conf) @@ -64,7 +65,7 @@ function num_queues(conf) end function select_instance(conf) - local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v2') + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v3') local device, id = parse_instance(conf) local copy = copier(conf)() 
local instance = copy.softwire_config.instance @@ -89,7 +90,7 @@ function merge_instance (conf) for k,v in pairs(t2) do ret[k] = v end return ret end - local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v2') + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v3') local copy = copier(conf)() local _, _, queue = parse_instance(conf) copy.softwire_config.external_interface = table_merge( diff --git a/src/lib/ptree/support/snabb-softwire-v2.lua b/src/lib/ptree/support/snabb-softwire-v3.lua similarity index 96% rename from src/lib/ptree/support/snabb-softwire-v2.lua rename to src/lib/ptree/support/snabb-softwire-v3.lua index dd917d225e..8b3c9c6b81 100644 --- a/src/lib/ptree/support/snabb-softwire-v2.lua +++ b/src/lib/ptree/support/snabb-softwire-v3.lua @@ -19,7 +19,7 @@ local path_data = require('lib.yang.path_data') local generic = require('lib.ptree.support').generic_schema_config_support local binding_table = require("apps.lwaftr.binding_table") --- Packs snabb-softwire-v2 softwire entry into softwire and PSID blob +-- Packs snabb-softwire-v3 softwire entry into softwire and PSID blob -- -- The data plane stores a separate table of psid maps and softwires. It -- requires that we give it a blob it can quickly add. These look rather @@ -65,7 +65,7 @@ end local softwire_grammar local function get_softwire_grammar() if not softwire_grammar then - local schema = yang.load_schema_by_name('snabb-softwire-v2') + local schema = yang.load_schema_by_name('snabb-softwire-v3') local grammar = data.config_grammar_from_schema(schema) softwire_grammar = assert(grammar.members['softwire-config']. 
@@ -234,7 +234,7 @@ local function schema_getter(schema_name, path) end local function snabb_softwire_getter(path) - return schema_getter('snabb-softwire-v2', path) + return schema_getter('snabb-softwire-v3', path) end local function ietf_softwire_br_getter(path) @@ -323,7 +323,7 @@ local function ietf_softwire_br_translator () softwire_payload_mtu = int.mtu, softwire_path_mru = ext.mtu, -- FIXME: There's no equivalent of softwire-num-max in - -- snabb-softwire-v2. + -- snabb-softwire-v3. softwire_num_max = 0xffffffff, enable_hairpinning = int.hairpinning, binding_table = { @@ -423,7 +423,7 @@ local function ietf_softwire_br_translator () } local path_tail = path_tails[leaf] if path_tail then - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/'..path_tail, config=tostring(arg)}}} else @@ -439,15 +439,15 @@ local function ietf_softwire_br_translator () } local path_tail = path_tails[leaf] if path_tail then - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/'..path_tail, config=tostring(arg)}}} elseif leaf == 'icmpv4-rate' then local head = '/softwire-config/external-interface/error-rate-limiting' return { - {'set', {schema='snabb-softwire-v2', path=head..'/packets', + {'set', {schema='snabb-softwire-v3', path=head..'/packets', config=tostring(arg * 2)}}, - {'set', {schema='snabb-softwire-v2', path=head..'/period', + {'set', {schema='snabb-softwire-v3', path=head..'/period', config='2'}}} else error('unrecognized leaf: '..leaf) @@ -457,15 +457,15 @@ local function ietf_softwire_br_translator () then local leaf = path[#path].name if leaf == 'generate-icmpv6-errors' then - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/internal-interface/generate-icmp-errors', config=tostring(arg)}}} elseif leaf == 'icmpv6-rate' then local head = 
'/softwire-config/internal-interface/error-rate-limiting' return { - {'set', {schema='snabb-softwire-v2', path=head..'/packets', + {'set', {schema='snabb-softwire-v3', path=head..'/packets', config=tostring(arg * 2)}}, - {'set', {schema='snabb-softwire-v2', path=head..'/period', + {'set', {schema='snabb-softwire-v3', path=head..'/period', config='2'}}} else error('unrecognized leaf: '..leaf) @@ -480,7 +480,7 @@ local function ietf_softwire_br_translator () not path_has_query(path, #path) then local bt = native_binding_table_from_ietf(arg) - return {{'set', {schema='snabb-softwire-v2', + return {{'set', {schema='snabb-softwire-v3', path='/softwire-config/binding-table', config=serialize_binding_table(bt)}}} else @@ -536,7 +536,7 @@ local function ietf_softwire_br_translator () -- to add a check here that the IPv4/PSID is not present in the -- binding table. table.insert(updates, - {'remove', {schema='snabb-softwire-v2', + {'remove', {schema='snabb-softwire-v3', path=softwire_path..old_query}}) local config_str = string.format([[{ @@ -553,7 +553,7 @@ local function ietf_softwire_br_translator () path[entry_path_len].query['binding-ipv6info'], new.port_set.psid_len, new.port_set.psid_offset) table.insert(updates, - {'add', {schema='snabb-softwire-v2', + {'add', {schema='snabb-softwire-v3', path=softwire_path, config=config_str}}) return updates @@ -613,7 +613,7 @@ local function ietf_softwire_br_translator () table.insert(additions, config_str) end table.insert(updates, - {'add', {schema='snabb-softwire-v2', + {'add', {schema='snabb-softwire-v3', path=softwire_path, config=table.concat(additions, '\n')}}) return updates @@ -642,7 +642,7 @@ local function ietf_softwire_br_translator () return string.format('[ipv4=%s][psid=%s]', ipv4_ntop(ipv4), psid) end local query = q(entry.binding_ipv4_addr, entry.port_set.psid) - return {{'remove', {schema='snabb-softwire-v2', + return {{'remove', {schema='snabb-softwire-v3', path=softwire_path..query}}} else return 
error('unsupported path: '..path_str) diff --git a/src/lib/yang/path_data.lua b/src/lib/yang/path_data.lua index c85bd352f4..ac7d11a90d 100644 --- a/src/lib/yang/path_data.lua +++ b/src/lib/yang/path_data.lua @@ -834,7 +834,7 @@ function selftest() local checker = consistency_checker_from_schema_by_name('ietf-alarms', false) assert(checker) - local scm = schema.load_schema_by_name('snabb-softwire-v2') + local scm = schema.load_schema_by_name('snabb-softwire-v3') local grammar = data.config_grammar_from_schema(scm) setter_for_grammar(grammar, "/softwire-config/instance[device=test]/".. "queue[id=0]/external-interface/ip 208.118.235.148") diff --git a/src/lib/yang/schema.lua b/src/lib/yang/schema.lua index 21f5ad1af4..c3b121cb65 100644 --- a/src/lib/yang/schema.lua +++ b/src/lib/yang/schema.lua @@ -1303,7 +1303,7 @@ function selftest() load_schema_by_name('ietf-softwire-common') load_schema_by_name('ietf-softwire-br') - load_schema_by_name('snabb-softwire-v2') + load_schema_by_name('snabb-softwire-v3') local br = load_schema_by_name('ietf-softwire-br') local binding = br.body['br-instances'].body['br-type'].body['binding'] diff --git a/src/program/alarms/set_operator_state/README b/src/program/alarms/set_operator_state/README index f88e83eb13..b890c0a1f1 100644 --- a/src/program/alarms/set_operator_state/README +++ b/src/program/alarms/set_operator_state/README @@ -21,7 +21,7 @@ An OPERATOR-STATE can take the following values: 'none', 'ack', 'closed', Typical usage: -$ snabb alarms set-operator-state --schema snabb-softwire-v2 lwaftr resource arp-resolution ack +$ snabb alarms set-operator-state --schema snabb-softwire-v3 lwaftr resource arp-resolution ack See https://github.com/Igalia/snabb/blob/lwaftr/src/program/alarms/README.md for full documentation. 
diff --git a/src/program/lwaftr/compile_configuration/README b/src/program/lwaftr/compile_configuration/README index 7f2e11bd0a..941da374eb 100644 --- a/src/program/lwaftr/compile_configuration/README +++ b/src/program/lwaftr/compile_configuration/README @@ -6,7 +6,7 @@ Usage: compile-configuration LWAFTR.CONF [LWAFTR.O] Validate and compile a configuration file. A lwAFTR configuration file follows the schema described in -`lib/yang/snabb-softwire-v2.yang`. It consists of several containers +`lib/yang/snabb-softwire-v3.yang`. It consists of several containers such as `binding-table`, `external-interface` and `internal-interface` and `instance`. diff --git a/src/program/lwaftr/compile_configuration/compile_configuration.lua b/src/program/lwaftr/compile_configuration/compile_configuration.lua index 04fe1e1259..62f3e37bfc 100644 --- a/src/program/lwaftr/compile_configuration/compile_configuration.lua +++ b/src/program/lwaftr/compile_configuration/compile_configuration.lua @@ -19,7 +19,7 @@ end function run(args) local filein, fileout = parse_args(args) local success, err = pcall(yang.load_configuration, filein, - {schema_name='snabb-softwire-v2', compiled_filename=fileout}) + {schema_name='snabb-softwire-v3', compiled_filename=fileout}) if not success then print(tostring(err)) main.exit(1) diff --git a/src/program/lwaftr/counters.lua b/src/program/lwaftr/counters.lua index ac04964023..1dd6f01d4d 100644 --- a/src/program/lwaftr/counters.lua +++ b/src/program/lwaftr/counters.lua @@ -8,7 +8,7 @@ local S = require('syscall') function counter_names () local names = {} - local schema = schema.load_schema_by_name('snabb-softwire-v2') + local schema = schema.load_schema_by_name('snabb-softwire-v3') for k, node in pairs(schema.body['softwire-state'].body) do if node.kind == 'leaf' then names[k] = data.normalize_id(k) @@ -18,7 +18,7 @@ function counter_names () end function read_counters (pid) - local reader = state.state_reader_from_schema_by_name('snabb-softwire-v2') + local 
reader = state.state_reader_from_schema_by_name('snabb-softwire-v3') local s = reader(state.counters_for_pid(pid or S.getpid())) local ret = {} for k, id in pairs(counter_names()) do diff --git a/src/program/lwaftr/doc/configuration.md b/src/program/lwaftr/doc/configuration.md index 4a01d68a30..b929f25a26 100644 --- a/src/program/lwaftr/doc/configuration.md +++ b/src/program/lwaftr/doc/configuration.md @@ -2,7 +2,7 @@ The lwAFTR's configuration is modelled by a [YANG](https://tools.ietf.org/html/rfc6020) schema, -[snabb-softwire-v2](../../../lib/yang/snabb-softwire-v2.yang). +[snabb-softwire-v3](../../../lib/yang/snabb-softwire-v3.yang). The lwAFTR takes its configuration from the user in the form of a text file. That file's grammar is derived from the YANG schema; see the @@ -120,11 +120,10 @@ softwire-config { The lwaftr will spawn a number of worker processes that perform packet forwarding. Each `queue` statement in the configuration corresponds to one process servicing one RSS queue on one or two network devices. For -on-a-stick operation, only the `device` leaf that is part of the -`instance` leaf will be specified. For bump-in-the-wire operation, the -`instance` device will handle IPv6 traffic, and the `device` specified -in the `external-interface` that's part of the `queue` will handle IPv4 -traffic. +on-a-stick operation, only the `device` leaf will be specified. +For bump-in-the-wire operation, `device` will handle IPv6 traffic, and +IPv4 traffic will be handled on the device specified in the +`external-device` leaf. The `external-interface` define parameters around the IPv4 interface that communicates with the internet and the `internal-interface` section @@ -151,7 +150,7 @@ the given *PID* to reload its configuration from the given file. 
## In-depth configuration explanation See the embedded descriptions in the -[snabb-softwire-v2](../../../lib/yang/snabb-softwire-v2.yang) schema +[snabb-softwire-v3](../../../lib/yang/snabb-softwire-v3.yang) schema file. ## Binding tables @@ -309,10 +308,10 @@ example, here's a bump-in-the-wire configuration with two RSS workers: ``` instance { device 83:00.0; + external-device 83:00.1; queue { id 0; external-interface { - device 83:00.1; ip 10.10.10.10; mac 56:56:56:56:56:56; next-hop { mac 02:68:68:68:68:68; } @@ -326,7 +325,6 @@ example, here's a bump-in-the-wire configuration with two RSS workers: queue { id 1; external-interface { - device 83:00.1; ip 10.10.10.10; mac 56:56:56:56:56:56; next-hop { mac 02:68:68:68:68:68; } @@ -341,8 +339,7 @@ example, here's a bump-in-the-wire configuration with two RSS workers: ``` These queues are configured on the `83:00.0` instance, and because the -queues have a different device configured on the `external-interface` -containers, that makes this configuration a bump-in-the-wire +instance specifies an `external-device` this is a bump-in-the-wire configuration. The two queues are identical with the exception of their `id` fields. Incoming IPv6 traffic on `83:00.0` and IPv4 traffic on `83:00.1` will be evenly split between these two worker processes using @@ -384,7 +381,7 @@ lwAFTR is addressable using the [`ietf-softwire-br`](../../../lib/yang/ietf-softwire-br.yang) YANG schema. The lwAFTR also has a "native" schema that exposes more configuration information, -[`snabb-softwire-v2`](../../../lib/yang/snabb-softwire-v2.yang). Pass +[`snabb-softwire-v3`](../../../lib/yang/snabb-softwire-v3.yang). Pass the `-s` argument to the `snabb config` tools to specify a non-default YANG schema. 
@@ -393,7 +390,7 @@ next-hop address of the external interface on lwaftr instance `lwaftr`'s queue `0` on device `83:00.0`: ``` -$ snabb config set -s snabb-softwire-v2 lwaftr \ +$ snabb config set -s snabb-softwire-v3 lwaftr \ /softwire-config/instance[device=83:00.0]/queue[id=0]/external-interface/next-hop/mac \ 02:02:02:02:02:02 ``` @@ -402,7 +399,7 @@ $ snabb config set -s snabb-softwire-v2 lwaftr \ Firstly, we suggest getting a lwAFTR configuration working that runs on only one interface and one queue. Once you have that working, do a -`snabb config get -s snabb-softwire-v2 lwaftr /softwire-config/instance` +`snabb config get -s snabb-softwire-v3 lwaftr /softwire-config/instance` to get the `instance` configuration for the `lwaftr` instance. You'll get something like this: @@ -421,7 +418,7 @@ So to add another device, you can just paste that into a file, change the devices, and then do: ``` -$ snabb config add -s snabb-softwire-v2 lwaftr \ +$ snabb config add -s snabb-softwire-v3 lwaftr \ /softwire-config/instance < my-instance.file.conf ``` @@ -442,14 +439,14 @@ like you think they should be. To remove a queue, use `snabb config remove`: ``` -$ snabb config remove -s snabb-softwire-v2 lwaftr \ +$ snabb config remove -s snabb-softwire-v3 lwaftr \ /softwire-config/instance[device=XX:XX.X]/queue[id=ID] ``` Likewise you can remove instances this way: ``` -$ snabb config remove -s snabb-softwire-v2 lwaftr \ +$ snabb config remove -s snabb-softwire-v3 lwaftr \ /softwire-config/instance[device=XX:XX.X] ``` diff --git a/src/program/lwaftr/generate_configuration/README b/src/program/lwaftr/generate_configuration/README index f5020e88a2..8be6bcc328 100644 --- a/src/program/lwaftr/generate_configuration/README +++ b/src/program/lwaftr/generate_configuration/README @@ -5,7 +5,7 @@ snabb lwaftr generate-configuration Output filename (snabb-softwire-v2 configuration file). + --output Output filename (snabb-softwire-v3 configuration file). 
Examples: diff --git a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua index af33e23f5a..8afdba5cf6 100644 --- a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua +++ b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua @@ -442,7 +442,7 @@ local function multiprocess_migration(src, conf_file) -- We should build up a hybrid schema from parts of v1 and v2. local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") - local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") + local hybridscm = yang.load_schema_by_name("snabb-softwire-v3") local v1_external = v1_schema.body["softwire-config"].body["external-interface"] local v1_internal = v1_schema.body["softwire-config"].body["internal-interface"] local external = hybridscm.body["softwire-config"].body["external-interface"] @@ -521,7 +521,7 @@ local function multiprocess_migration(src, conf_file) conf.softwire_config.external_interface.next_hop = nil conf.softwire_config.external_interface.vlan_tag = nil - return config_to_string('snabb-softwire-v2', conf) + return config_to_string('snabb-softwire-v3', conf) end local function v2_migration(src, conf_file) @@ -529,7 +529,7 @@ local function v2_migration(src, conf_file) -- switch over to v2 of snabb-softwire config. local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") local v1_binding_table = v1_schema.body["softwire-config"].body["binding-table"] - local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") + local hybridscm = yang.load_schema_by_name("snabb-softwire-v3") local binding_table = hybridscm.body["softwire-config"].body["binding-table"] -- Add the schema from v1 that we need to convert them. 
diff --git a/src/program/lwaftr/run/README b/src/program/lwaftr/run/README index bb8dafeb1d..7396df7db9 100644 --- a/src/program/lwaftr/run/README +++ b/src/program/lwaftr/run/README @@ -3,7 +3,7 @@ Usage: run --help Required arguments: -c CONF, --conf CONF Use configuration from the file CONF. - See the snabb-softwire-v2 YANG module + See the snabb-softwire-v3 YANG module for full documentation. Optional arguments: diff --git a/src/program/lwaftr/run/run.lua b/src/program/lwaftr/run/run.lua index 97c04420f4..e571448859 100644 --- a/src/program/lwaftr/run/run.lua +++ b/src/program/lwaftr/run/run.lua @@ -39,9 +39,7 @@ local function migrate_device_on_config(config, v4, v6) end if v6 then - for id, queue in pairs(instance.queue) do - queue.external_interface.device = v6 - end + instance.external_device = v6 end end @@ -169,11 +167,12 @@ function run(args) opts.ring_buffer_size) end - -- If instance has external-interface.device configure as bump-in-the-wire + -- If instance has external-device configure as bump-in-the-wire -- otherwise configure it in on-a-stick mode. - local device, id, queue = lwutil.parse_instance(lwconfig) - if not lwutil.is_on_a_stick(device, queue) then - if lib.is_iface(queue.external_interface.device) then + local device = lwutil.parse_instance(lwconfig) + local instance = lwconfig.softwire_config.instance[device] + if not lwutil.is_on_a_stick(lwconfig, device) then + if lib.is_iface(instance.external_device) then return setup.load_kernel_iface(graph, lwconfig, 'inetNic', 'b4sideNic') else return setup.load_phy(graph, lwconfig, 'inetNic', 'b4sideNic', diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 697e0b6297..24f99b900b 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -317,7 +317,7 @@ function config_intel_avf_pf(c, name, opt, lwconfig) local numvf = 1 -- how many vfs do we need to create on the pf? 
local vfmac = {} -- MACs to assign to vfs local device, _, queue = lwutil.parse_instance(lwconfig) - if lwutil.is_on_a_stick(device, queue) then + if lwutil.is_on_a_stick(lwconfig, device) then numvf = 2 vfmac[0] = queue.external_interface.mac vfmac[1] = queue.internal_interface.mac @@ -364,7 +364,7 @@ end function load_phy(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) local v6_pci, id, queue = lwutil.parse_instance(conf) - local v4_pci = queue.external_interface.device + local v4_pci = conf.softwire_config.instance[v6_pci].external_device local v4_info = pci.device_info(v4_pci) local v6_info = pci.device_info(v6_pci) validate_pci_devices({v4_pci, v6_pci}) @@ -396,10 +396,10 @@ end function load_xdp(c, conf, v4_nic_name, v6_nic_name, ring_buffer_size) local v6_device, id, queue = lwutil.parse_instance(conf) - local v4_device = queue.external_interface.device + local v4_device = conf.softwire_config.instance[v6_device].external_device assert(lib.is_iface(v4_device), v4_nic_name..": "..v4_device.." is not a Linux interface") assert(lib.is_iface(v6_device), v6_nic_name..": "..v6_device.." 
is not a Linux interface") - assert(not lwutil.is_on_a_stick(v6_device, queue), + assert(not lwutil.is_on_a_stick(conf, v6_device), "--xdp does not support on-a-stick configuration") lwaftr_app(c, conf) @@ -434,6 +434,7 @@ end function xdp_ifsetup(conf) for idevice, instance in pairs(conf.softwire_config.instance) do + local edevice = instance.external_device local icfg, ecfg local nqueues = 0 for _, queue in pairs(instance.queue) do @@ -462,7 +463,7 @@ function xdp_ifsetup(conf) ifsetup(idevice, icfg, conf.softwire_config.internal_interface, function (ip) return ipv6:ntop(ip) end) print("Configuring external interface for XDP...") - ifsetup(ecfg.device, ecfg, conf.softwire_config.external_interface, + ifsetup(edevice, ecfg, conf.softwire_config.external_interface, ipv4_ntop) end end @@ -577,7 +578,7 @@ end function load_virt(c, conf, v4_nic_name, v6_nic_name) local v6_pci, id, queue = lwutil.parse_instance(conf) - local v4_pci = queue.external_device.device + local v4_pci = conf.softwire_config.instance[v6_pci].external_device lwaftr_app(c, conf, device) validate_pci_devices({v4_pci, v6_pci}) @@ -821,7 +822,7 @@ end -- will get its own worker process. 
local function compute_worker_configs(conf) local ret = {} - local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v2') + local copier = binary.config_copier_for_schema_by_name('snabb-softwire-v3') local make_copy = copier(conf) for device, queues in pairs(conf.softwire_config.instance) do for id, _ in pairs(queues.queue) do @@ -866,7 +867,7 @@ function ptree_manager(f, conf, manager_opts) local initargs = { setup_fn = setup_fn, initial_configuration = conf, - schema_name = 'snabb-softwire-v2', + schema_name = 'snabb-softwire-v3', default_schema = 'ietf-softwire-br', -- log_level="DEBUG" } diff --git a/src/program/lwaftr/tests/propbased/genyang.lua b/src/program/lwaftr/tests/propbased/genyang.lua index ec22103cdb..7459516456 100644 --- a/src/program/lwaftr/tests/propbased/genyang.lua +++ b/src/program/lwaftr/tests/propbased/genyang.lua @@ -12,7 +12,7 @@ local util = require("lib.yang.util") local capabilities = {['ietf-softwire-br']={feature={'binding'}},} require('lib.yang.schema').set_default_capabilities(capabilities) -local schemas = { "ietf-softwire-br", "snabb-softwire-v2" } +local schemas = { "ietf-softwire-br", "snabb-softwire-v3" } -- choose an element of an array randomly local function choose(choices) @@ -501,7 +501,7 @@ end function selftest() print('selftest: program.lwaftr.tests.propbased.genyang') - local schema = schema.load_schema_by_name("snabb-softwire-v2") + local schema = schema.load_schema_by_name("snabb-softwire-v3") local grammar = data.config_grammar_from_schema(schema) for i=1,1000 do generate_xpath_and_val(schema, true) end diff --git a/src/program/lwaftr/tests/subcommands/config_test.py b/src/program/lwaftr/tests/subcommands/config_test.py index 8c7d999cfb..93f6fcd148 100644 --- a/src/program/lwaftr/tests/subcommands/config_test.py +++ b/src/program/lwaftr/tests/subcommands/config_test.py @@ -68,7 +68,7 @@ class TestConfigGet(BaseTestCase): """ daemon_args = DAEMON_ARGS - config_args = (str(SNABB_CMD), 'config', 'get', 
'--schema=snabb-softwire-v2', DAEMON_PROC_NAME) + config_args = (str(SNABB_CMD), 'config', 'get', '--schema=snabb-softwire-v3', DAEMON_PROC_NAME) @classmethod def setUpClass(cls): @@ -130,7 +130,7 @@ class TestConfigMultiproc(BaseTestCase): daemon = None daemon_args = DAEMON_ARGS ps_args = (str(SNABB_CMD), 'ps') - config_args = (str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v2', DAEMON_PROC_NAME) + config_args = (str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v3', DAEMON_PROC_NAME) @classmethod def setUpClass(cls): @@ -396,7 +396,7 @@ def setUpClass(cls): cls.reportAndFail('Config manager socket not present', None) def get_cmd_args(self, action): - cmd_args = list((str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v2', DAEMON_PROC_NAME)) + cmd_args = list((str(SNABB_CMD), 'config', 'XXX', '--schema=snabb-softwire-v3', DAEMON_PROC_NAME)) cmd_args[2] = action return cmd_args From e06c198aa9560abfc2d88c7844b8219a52c5d521 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 16:01:09 +0000 Subject: [PATCH 169/209] apps.ipv4.arp: add request/reply counters --- src/apps/ipv4/arp.lua | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/src/apps/ipv4/arp.lua b/src/apps/ipv4/arp.lua index 6c992db150..9a26739868 100644 --- a/src/apps/ipv4/arp.lua +++ b/src/apps/ipv4/arp.lua @@ -135,6 +135,14 @@ end ARP = {} ARP.shm = { ["next-hop-macaddr-v4"] = {counter}, + ["in-arp-request-bytes"] = {counter}, + ["in-arp-request-packets"] = {counter}, + ["out-arp-request-bytes"] = {counter}, + ["out-arp-request-packets"] = {counter}, + ["in-arp-reply-bytes"] = {counter}, + ["in-arp-reply-packets"] = {counter}, + ["out-arp-reply-bytes"] = {counter}, + ["out-arp-reply-packets"] = {counter}, } local arp_config_params = { -- Source MAC address will default to a random address. 
@@ -186,6 +194,8 @@ function ARP:maybe_send_arp_request (output) end function ARP:send_arp_request (output) + counter.add(self.shm["out-arp-request-bytes"], self.arp_request_pkt.length) + counter.add(self.shm["out-arp-request-packets"]) transmit(output, packet.clone(self.arp_request_pkt)) end @@ -239,11 +249,18 @@ function ARP:push() h.arp.hlen ~= 6 or h.arp.plen ~= 4) then -- Ignore invalid packet. elseif ntohs(h.arp.oper) == arp_oper_request then + counter.add(self.shm["in-arp-request-bytes"], p.length) + counter.add(self.shm["in-arp-request-packets"]) if self.self_ip and ipv4_eq(h.arp.tpa, self.self_ip) then - transmit(osouth, make_arp_reply(self.self_mac, self.self_ip, - h.arp.sha, h.arp.spa)) + local reply = make_arp_reply(self.self_mac, self.self_ip, + h.arp.sha, h.arp.spa) + counter.add(self.shm["out-arp-reply-bytes"], reply.length) + counter.add(self.shm["out-arp-reply-packets"]) + transmit(osouth, reply) end elseif ntohs(h.arp.oper) == arp_oper_reply then + counter.add(self.shm["in-arp-reply-bytes"], p.length) + counter.add(self.shm["in-arp-reply-packets"]) if self.next_ip and ipv4_eq(h.arp.spa, self.next_ip) then self:arp_resolved(self.next_ip, copy_mac(h.arp.sha), 'remote') end From e3b400ac1bcbc268835e815eae3cc8138cdf300b Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 16:01:53 +0000 Subject: [PATCH 170/209] apps.lwaftr.ndp: add NS/NA counters --- src/apps/lwaftr/ndp.lua | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/apps/lwaftr/ndp.lua b/src/apps/lwaftr/ndp.lua index e35bc86485..94617738a6 100644 --- a/src/apps/lwaftr/ndp.lua +++ b/src/apps/lwaftr/ndp.lua @@ -267,6 +267,14 @@ end NDP = {} NDP.shm = { ["next-hop-macaddr-v6"] = {counter}, + ["in-ndp-ns-bytes"] = {counter}, + ["in-ndp-ns-packets"] = {counter}, + ["out-ndp-ns-bytes"] = {counter}, + ["out-ndp-ns-packets"] = {counter}, + ["in-ndp-na-bytes"] = {counter}, + ["in-ndp-na-packets"] = {counter}, + ["out-ndp-na-bytes"] = 
{counter}, + ["out-ndp-na-packets"] = {counter}, } local ndp_config_params = { -- Source MAC address will default to a random address. @@ -327,10 +335,12 @@ function NDP:maybe_send_ns_request (output) self.next_ns_time = self.next_ns_time or engine.now() if self.next_ns_time <= engine.now() then self:ndp_resolving(self.next_ip) - transmit(self.output.south, - make_ns_packet(self.self_mac, self.self_ip, - self.mac_mcast, self.solicited_node_mcast, - self.next_ip)) + local ns = make_ns_packet(self.self_mac, self.self_ip, + self.mac_mcast, self.solicited_node_mcast, + self.next_ip) + counter.add(self.shm["out-ndp-ns-bytes"], ns.length) + counter.add(self.shm["out-ndp-ns-packets"]) + transmit(self.output.south, ns) self.next_ns_time = engine.now() + self.ns_interval end end @@ -390,6 +400,8 @@ function NDP:handle_ndp (pkt) if not verify_icmp_checksum(pkt) then return end if h.icmpv6.type == icmpv6_na then + counter.add(self.shm["in-ndp-na-bytes"], pkt.length) + counter.add(self.shm["in-ndp-na-packets"]) -- Only process advertisements when we are looking for a -- next-hop MAC. if self.next_mac then return end @@ -422,6 +434,8 @@ function NDP:handle_ndp (pkt) -- Advertisement Message Format. 
self:resolve_next_hop(copy_mac(h.ether.shost)) elseif h.icmpv6.type == icmpv6_ns then + counter.add(self.shm["in-ndp-ns-bytes"], pkt.length) + counter.add(self.shm["in-ndp-ns-packets"]) if pkt.length < ndp_header_len + ffi.sizeof(ns_header_t) then return end local ns = ffi.cast(ns_header_ptr_t, h.body) if is_address_multicast(ns.target_ip) then return end @@ -449,9 +463,11 @@ function NDP:handle_ndp (pkt) end end end - link.transmit(self.output.south, - make_na_packet(self.self_mac, h.ether.shost, - self.self_ip, dst_ip, self.is_router)) + local na = make_na_packet(self.self_mac, h.ether.shost, + self.self_ip, dst_ip, self.is_router) + counter.add(self.shm["out-ndp-na-bytes"], na.length) + counter.add(self.shm["out-ndp-na-packets"]) + link.transmit(self.output.south, na) else -- Unhandled NDP packet; silently drop. return From 6f5e905778b84bf8a27f3519287cfc858107a792 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 16:02:39 +0000 Subject: [PATCH 171/209] apps.ipv{4,6}.echo: add echo I/O counters --- src/apps/ipv4/echo.lua | 15 ++++++++++++++- src/apps/ipv6/echo.lua | 15 ++++++++++++++- 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/apps/ipv4/echo.lua b/src/apps/ipv4/echo.lua index 9d422a81ea..e025c6ffec 100644 --- a/src/apps/ipv4/echo.lua +++ b/src/apps/ipv4/echo.lua @@ -73,7 +73,14 @@ local function ipv4_header_length(h) return bit.band(h.version_and_ihl, ipv4_ihl_mask) * 4 end -ICMPEcho = {} +ICMPEcho = { + shm = { + ['in-icmpv4-echo-bytes'] = {counter}, + ['in-icmpv4-echo-packets'] = {counter}, + ['out-icmpv4-echo-bytes'] = {counter}, + ['out-icmpv4-echo-packets'] = {counter}, + } +} function ICMPEcho:new(conf) local addresses = {} @@ -139,6 +146,12 @@ function ICMPEcho:respond_to_echo_request(pkt) ipsum(out.data + ether_header_len + ipv4_header_len, out.length - ether_header_len - ipv4_header_len, 0)) + -- Update counters + counter.add(self.shm['in-icmpv4-echo-bytes'], pkt.length) + 
counter.add(self.shm['in-icmpv4-echo-packets']) + counter.add(self.shm['out-icmpv4-echo-bytes'], out.length) + counter.add(self.shm['out-icmpv4-echo-packets']) + link.transmit(self.output.south, out) return true diff --git a/src/apps/ipv6/echo.lua b/src/apps/ipv6/echo.lua index 4817b63f18..bfc6262114 100644 --- a/src/apps/ipv6/echo.lua +++ b/src/apps/ipv6/echo.lua @@ -66,7 +66,14 @@ local icmp_header_ptr_t = ffi.typeof('$*', icmp_header_t) local function ipv6_equals(a, b) return ffi.C.memcmp(a, b, 16) == 0 end -ICMPEcho = {} +ICMPEcho = { + shm = { + ['in-icmpv6-echo-bytes'] = {counter}, + ['in-icmpv6-echo-packets'] = {counter}, + ['out-icmpv6-echo-bytes'] = {counter}, + ['out-icmpv6-echo-packets'] = {counter}, + } +} function ICMPEcho:new(conf) local addresses = {} @@ -131,6 +138,12 @@ function ICMPEcho:respond_to_echo_request(pkt) ffi.sizeof(ipv6_pseudo_header_t), 0)))) + -- Update counters + counter.add(self.shm['in-icmpv6-echo-bytes'], pkt.length) + counter.add(self.shm['in-icmpv6-echo-packets']) + counter.add(self.shm['out-icmpv6-echo-bytes'], out.length) + counter.add(self.shm['out-icmpv6-echo-packets']) + link.transmit(self.output.south, out) return true From bd914c12169821478eb31e1ef4b84c2e8bf7aa95 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 8 Nov 2021 16:09:45 +0000 Subject: [PATCH 172/209] lwaftr/snabb-softwire-v3: state additions and fixes --- src/apps/lwaftr/lwaftr.lua | 16 +-- src/lib/yang/snabb-softwire-v3.yang | 131 ++++++++++++++++-- ...pv4-in-binding-big-packet-df-set-allow.lua | 4 +- .../counters/in-1p-ipv4-out-1p-icmpv4.lua | 4 +- .../counters/in-1p-ipv6-out-1p-icmpv4-1.lua | 4 +- .../counters/in-1p-ipv6-out-1p-icmpv6-1.lua | 4 +- .../counters/in-1p-ipv6-out-1p-icmpv6-2.lua | 4 +- .../in-1p-ipv6-out-1p-ipv4-hoplimhair.lua | 4 +- ...in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua | 4 +- .../data/counters/tcp-frominet-bound-ttl1.lua | 4 +- src/program/snabbvmx/query/example1.xml | 8 +- src/program/snabbvmx/query/example2.xml | 16 +-- 12 
files changed, 152 insertions(+), 51 deletions(-) diff --git a/src/apps/lwaftr/lwaftr.lua b/src/apps/lwaftr/lwaftr.lua index 405423dc5f..e76b8faec3 100644 --- a/src/apps/lwaftr/lwaftr.lua +++ b/src/apps/lwaftr/lwaftr.lua @@ -392,10 +392,10 @@ LwAftr.shm = { ["in-ipv4-packets"] = {counter}, ["in-ipv6-bytes"] = {counter}, ["in-ipv6-packets"] = {counter}, - ["out-icmpv4-bytes"] = {counter}, - ["out-icmpv4-packets"] = {counter}, - ["out-icmpv6-bytes"] = {counter}, - ["out-icmpv6-packets"] = {counter}, + ["out-icmpv4-error-bytes"] = {counter}, + ["out-icmpv4-error-packets"] = {counter}, + ["out-icmpv6-error-bytes"] = {counter}, + ["out-icmpv6-error-packets"] = {counter}, ["out-ipv4-bytes"] = {counter}, ["out-ipv4-packets"] = {counter}, ["out-ipv6-bytes"] = {counter}, @@ -492,8 +492,8 @@ function LwAftr:transmit_icmpv6_reply (pkt) -- Send packet if limit not reached. if self.icmpv6_error_count < rate_limiting.packets then self.icmpv6_error_count = self.icmpv6_error_count + 1 - counter.add(self.shm["out-icmpv6-bytes"], pkt.length) - counter.add(self.shm["out-icmpv6-packets"]) + counter.add(self.shm["out-icmpv6-error-bytes"], pkt.length) + counter.add(self.shm["out-icmpv6-error-packets"]) counter.add(self.shm["out-ipv6-bytes"], pkt.length) counter.add(self.shm["out-ipv6-packets"]) return transmit(self.o6, pkt) @@ -536,8 +536,8 @@ function LwAftr:transmit_icmpv4_reply(pkt, orig_pkt, orig_pkt_link) -- Send packet if limit not reached. if self.icmpv4_error_count < rate_limiting.packets then self.icmpv4_error_count = self.icmpv4_error_count + 1 - counter.add(self.shm["out-icmpv4-bytes"], pkt.length) - counter.add(self.shm["out-icmpv4-packets"]) + counter.add(self.shm["out-icmpv4-error-bytes"], pkt.length) + counter.add(self.shm["out-icmpv4-error-packets"]) -- Only locally generated error packets are handled here. We transmit -- them right away, instead of calling transmit_ipv4, because they are -- never hairpinned and should not be counted by the "out-ipv4" counter. 
diff --git a/src/lib/yang/snabb-softwire-v3.yang b/src/lib/yang/snabb-softwire-v3.yang index a146c0f0cb..c4b4eb5cdc 100644 --- a/src/lib/yang/snabb-softwire-v3.yang +++ b/src/lib/yang/snabb-softwire-v3.yang @@ -17,7 +17,12 @@ module snabb-softwire-v3 { Fix mistakes in leaf descriptions. Add default value for error-rate-limiting/packets. Allow more than two queues (lift id leaf range restriction). - Move leaf external-interface/device up as external-device."; + Move leaf external-interface/device up as external-device. + Add softwire-state/{in,out}-icmpv{4,6}-echo-{bytes,packets}, counters. + Add softwire-state/{in,out}-arp-{request,reply}-{bytes,packets}, counters. + Add softwire-state/{in,out}-ndp-{ns,na}-{bytes,packets}, counters. + Renamed softwire-state/{in,out}-icmpv{4,6}-{bytes,packets} + to softwire-state/{in,out}-icmpv{4,6}-error-{bytes,packets}."; } revision 2019-09-17 { @@ -314,7 +319,7 @@ module snabb-softwire-v3 { } leaf in-ipv4-bytes { type yang:zero-based-counter64; - description "All valid outgoing IPv4 packets."; + description "Valid incoming IPv4 bytes."; } leaf in-ipv4-frag-needs-reassembly { type yang:zero-based-counter64; @@ -332,11 +337,11 @@ module snabb-softwire-v3 { } leaf in-ipv4-packets { type yang:zero-based-counter64; - description "All valid outgoing IPv4 packets."; + description "Valid incoming IPv4 packets."; } leaf in-ipv6-bytes { type yang:zero-based-counter64; - description "All valid outgoing IPv4 packets."; + description "Valid incoming IPv6 bytes."; } leaf in-ipv6-frag-needs-reassembly { type yang:zero-based-counter64; @@ -354,7 +359,7 @@ module snabb-softwire-v3 { } leaf in-ipv6-packets { type yang:zero-based-counter64; - description "All valid outgoing IPv4 packets."; + description "Valid incoming IPv6 packets."; } leaf ingress-packet-drops { type yang:zero-based-counter64; @@ -374,25 +379,121 @@ module snabb-softwire-v3 { for reassembling IPv6 fragments.
This is directly proportional to the setting max_ipv6_reassembly_packets."; } - leaf out-icmpv4-bytes { + leaf in-arp-request-bytes { type yang:zero-based-counter64; - description "Internally generated ICMPv4 packets."; + description "Incoming ARP request bytes."; } - leaf out-icmpv4-packets { + leaf in-arp-request-packets { type yang:zero-based-counter64; - description "Internally generated ICMPv4 packets."; + description "Incoming ARP request packets."; } - leaf out-icmpv6-bytes { + leaf out-arp-request-bytes { type yang:zero-based-counter64; - description "Internally generted ICMPv6 error packets."; + description "Internally generated ARP request bytes."; + } + leaf out-arp-request-packets { + type yang:zero-based-counter64; + description "Internally generated ARP request packets."; + } + leaf in-arp-reply-bytes { + type yang:zero-based-counter64; + description "Incoming ARP reply bytes."; + } + leaf in-arp-reply-packets { + type yang:zero-based-counter64; + description "Incoming ARP reply packets."; + } + leaf out-arp-reply-bytes { + type yang:zero-based-counter64; + description "Internally generated ARP reply bytes."; + } + leaf out-arp-reply-packets { + type yang:zero-based-counter64; + description "Internally generated ARP reply packets."; + } + leaf in-ndp-ns-bytes { + type yang:zero-based-counter64; + description "Incoming NDP neighbor solicitation bytes."; + } + leaf in-ndp-ns-packets { + type yang:zero-based-counter64; + description "Incoming NDP neighbor solicitation packets."; + } + leaf out-ndp-ns-bytes { + type yang:zero-based-counter64; + description "Internally generated NDP neighbor solicitation bytes."; + } + leaf out-ndp-ns-packets { + type yang:zero-based-counter64; + description "Internally generated NDP neighbor solicitation packets."; + } + leaf in-ndp-na-bytes { + type yang:zero-based-counter64; + description "Incoming NDP neighbor advertisement bytes."; + } + leaf in-ndp-na-packets { + type yang:zero-based-counter64; + description
"Incoming NDP neighbor advertisement packets."; + } + leaf out-ndp-na-bytes { + type yang:zero-based-counter64; + description "Internally generated NDP neighbor advertisement bytes."; } - leaf out-icmpv6-packets { + leaf out-ndp-na-packets { + type yang:zero-based-counter64; + description "Internally generated NDP neighbor advertisement packets."; + } + leaf out-icmpv4-error-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 error bytes."; + } + leaf out-icmpv4-error-packets { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 error packets."; + } + leaf out-icmpv6-error-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv6 error bytes."; + } + leaf out-icmpv6-error-packets { type yang:zero-based-counter64; description "Internally generted ICMPv6 error packets."; } + leaf in-icmpv4-echo-bytes { + type yang:zero-based-counter64; + description "Valid incoming ICMPv4 echo request bytes."; + } + leaf in-icmpv4-echo-packets { + type yang:zero-based-counter64; + description "Valid incoming ICMPv4 echo request packets."; + } + leaf out-icmpv4-echo-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 echo reply bytes."; + } + leaf out-icmpv4-echo-packets { + type yang:zero-based-counter64; + description "Internally generated ICMPv4 echo reply packets."; + } + leaf in-icmpv6-echo-bytes { + type yang:zero-based-counter64; + description "Valid incoming ICMPv6 echo request bytes."; + } + leaf in-icmpv6-echo-packets { + type yang:zero-based-counter64; + description "Valid incoming ICMPv6 echo request packets."; + } + leaf out-icmpv6-echo-bytes { + type yang:zero-based-counter64; + description "Internally generated ICMPv6 echo reply bytes."; + } + leaf out-icmpv6-echo-packets { + type yang:zero-based-counter64; + description "Internally generated ICMPv6 echo reply packets."; + } leaf out-ipv4-bytes { type yang:zero-based-counter64; - description "Valid outgoing
IPv4 packets."; + description "Valid outgoing IPv4 bytes."; } leaf out-ipv4-frag { type yang:zero-based-counter64; @@ -412,7 +513,7 @@ module snabb-softwire-v3 { } leaf out-ipv6-bytes { type yang:zero-based-counter64; - description "All valid outgoing IPv6 packets."; + description "Valid outgoing IPv6 bytes."; } leaf out-ipv6-frag { type yang:zero-based-counter64; @@ -428,7 +529,7 @@ module snabb-softwire-v3 { } leaf out-ipv6-packets { type yang:zero-based-counter64; - description "All valid outgoing IPv6 packets."; + description "Valid outgoing IPv6 packets."; } } } diff --git a/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua b/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua index bd28c5fd2a..a710f73ef3 100644 --- a/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua +++ b/src/program/lwaftr/tests/data/counters/from-inet-ipv4-in-binding-big-packet-df-set-allow.lua @@ -8,8 +8,8 @@ return { ["in-ipv4-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 590, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 590, + ["out-icmpv4-error-packets"] = 1, ["out-ipv4-bytes"] = 590, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua index 39a5560d16..5d25e667a3 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-icmpv4.lua @@ -8,8 +8,8 @@ return { ["in-ipv4-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, 
["out-ipv4-bytes"] = 94, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua index c9191edd5b..b9eb814f37 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv4-1.lua @@ -4,8 +4,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv4-bytes"] = 94, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua index 16190036a3..9adebfee0f 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-1.lua @@ -8,8 +8,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv6-bytes"] = 154, - ["out-icmpv6-packets"] = 1, + ["out-icmpv6-error-bytes"] = 154, + ["out-icmpv6-error-packets"] = 1, ["out-ipv6-bytes"] = 154, ["out-ipv6-packets"] = 1, ["out-ipv6-frag-not"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua index 2c41896cac..22195784b1 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-icmpv6-2.lua @@ -8,8 +8,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - 
["out-icmpv6-bytes"] = 186, - ["out-icmpv6-packets"] = 1, + ["out-icmpv6-error-bytes"] = 186, + ["out-icmpv6-error-packets"] = 1, ["out-ipv6-bytes"] = 186, ["out-ipv6-packets"] = 1, ["out-ipv6-frag-not"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua index de987128b6..7ebbc90158 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-hoplimhair.lua @@ -4,8 +4,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv6-bytes"] = 134, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua b/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua index c1da09ef34..cd7abb7e60 100644 --- a/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua +++ b/src/program/lwaftr/tests/data/counters/in-ipv4-ipv6-out-icmpv4-ipv6-hairpin-1.lua @@ -10,8 +10,8 @@ return { ["in-ipv6-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv6-bytes"] = 134, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, diff --git a/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua b/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua index 55ea25db4b..6e67dbfde1 100644 --- a/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua +++ 
b/src/program/lwaftr/tests/data/counters/tcp-frominet-bound-ttl1.lua @@ -8,8 +8,8 @@ return { ["in-ipv4-packets"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, - ["out-icmpv4-bytes"] = 94, - ["out-icmpv4-packets"] = 1, + ["out-icmpv4-error-bytes"] = 94, + ["out-icmpv4-error-packets"] = 1, ["out-ipv4-bytes"] = 94, ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, diff --git a/src/program/snabbvmx/query/example1.xml b/src/program/snabbvmx/query/example1.xml index 3e19ca584a..796896f612 100644 --- a/src/program/snabbvmx/query/example1.xml +++ b/src/program/snabbvmx/query/example1.xml @@ -92,10 +92,10 @@ 114681497770 0 0 - 0 - 0 - 0 - 0 + 0 + 0 + 0 + 0 5119140314 0 0 diff --git a/src/program/snabbvmx/query/example2.xml b/src/program/snabbvmx/query/example2.xml index 9ac3997759..8b42e0dcfb 100644 --- a/src/program/snabbvmx/query/example2.xml +++ b/src/program/snabbvmx/query/example2.xml @@ -83,10 +83,10 @@ 0 0 0 - 0 - 0 - 0 - 0 + 0 + 0 + 0 + 0 0 0 0 @@ -320,10 +320,10 @@ 0 0 0 - 0 - 0 - 0 - 0 + 0 + 0 + 0 + 0 0 0 0 From 37cfb3625262c64a5706a5556b776f9184f8d47e Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 9 Nov 2021 10:14:56 +0000 Subject: [PATCH 173/209] apps.intel_avf: add option to add additional macs --- src/apps/intel_avf/intel_avf.lua | 35 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 144167fff1..d45fd4f375 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -27,6 +27,7 @@ Intel_avf = { pciaddr = { required=true }, nqueues = {}, vlan = {}, + macs = {default={}}, ring_buffer_size = {default=2048} } } @@ -156,15 +157,20 @@ local virtchnl_queue_config_info_t = ffi.typeof([[ local virtchnl_queue_config_info_ptr_t = ffi.typeof("$ *", virtchnl_queue_config_info_t) -local virtchnl_ether_addr_t = ffi.typeof([[ +local virtchnl_ether_addr_t 
= ffi.typeof[[ struct { - uint16_t vsi; - uint16_t num_elements; uint8_t addr[6]; // MAC_ADDR_BYTE_LEN uint8_t pad[2]; } __attribute__((packed)) -]]) -local virtchnl_ether_addr_ptr_t = ffi.typeof("$ *", virtchnl_ether_addr_t) +]] +local virtchnl_ether_addr_list_t = ffi.typeof([[ + struct { + uint16_t vsi; + uint16_t num_elements; + $ list[1]; + } __attribute__((packed)) +]], virtchnl_ether_addr_t) +local virtchnl_ether_addr_list_ptr_t = ffi.typeof("$ *", virtchnl_ether_addr_list_t) local eth_stats_t = ffi.typeof([[ struct { @@ -757,7 +763,7 @@ function Intel_avf:mbox_setup() VIRTCHNL_OP_CONFIG_IRQ_MAP = 7, VIRTCHNL_OP_ENABLE_QUEUES = 8, VIRTCHNL_OP_DISABLE_QUEUES = 9, - -- VIRTCHNL_OP_ADD_ETH_ADDR = 10, + VIRTCHNL_OP_ADD_ETH_ADDR = 10, -- VIRTCHNL_OP_DEL_ETH_ADDR = 11, VIRTCHNL_OP_ADD_VLAN = 12, -- VIRTCHNL_OP_DEL_VLAN = 13, @@ -990,6 +996,9 @@ function Intel_avf:new(conf) self:mbox_sr_version() self:mbox_sr_caps() self:mbox_sr_rss(conf.nqueues or 1) + if #conf.macs > 0 then + self:mbox_sr_add_mac(conf.macs) + end if self.vlan then self:mbox_sr_vlan() end @@ -1130,13 +1139,17 @@ function Intel_avf:mbox_sr_irq(nqueues) self:mbox_sr("VIRTCHNL_OP_CONFIG_IRQ_MAP", ffi.sizeof(virtchnl_irq_map_info_t) + 12) end -function Intel_avf:mbox_sr_add_mac() +function Intel_avf:mbox_sr_add_mac(macs) -- pg81 - local tt = self:mbox_send_buf(virtchnl_ether_addr_ptr_t) + local tt = self:mbox_send_buf(virtchnl_ether_addr_list_ptr_t) tt.vsi = self.vsi_id - tt.num_elements = 1 - ffi.copy(tt.addr, self.mac, MAC_ADDR_BYTE_LEN) - self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', ffi.sizeof(virtchnl_ether_addr_t) + 8) + tt.num_elements = #macs + for i, mac in ipairs(macs) do + ffi.copy(tt.list[i-1].addr, mac, MAC_ADDR_BYTE_LEN) + end + self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', + ffi.sizeof(virtchnl_ether_addr_list_t) + + ffi.sizeof(virtchnl_ether_addr_t) * #macs) end function Intel_avf:mbox_sr_rss(nqueues) From 2bfba71328fb0a33dec9c895f42e872e599dd444 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber 
Date: Tue, 9 Nov 2021 10:15:12 +0000 Subject: [PATCH 174/209] apps.intel_avf: update README.md --- src/apps/intel_avf/README.md | 43 ++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/src/apps/intel_avf/README.md b/src/apps/intel_avf/README.md index 72a49b1193..9eccd78f36 100644 --- a/src/apps/intel_avf/README.md +++ b/src/apps/intel_avf/README.md @@ -18,20 +18,55 @@ The links are named `input` and `output`. *Required*. The PCI address of the NIC as a string. +— Key **vlan** + +*Optional*. VLAN id used for filtering packets. If specified, VLAN tags are +stripped for incoming packets and inserted for outgoing packets. + +— Key **macs** + +*Optional*. Additional unicast or multicast MACs to listen to. +The default is the empty array `{}`. + +— Key **nqueues** + +*Optional*. Number of RSS queues to configure. If specified you need to use +the `intel_avf.IO` app to attach for I/O for each respective queue. + — Key **ring_buffer_size** *Optional*. Number of DMA descriptors to use i.e. size of the DMA transmit and receive queues. Must be a multiple of 128. Default is not specified but assumed to be broadly applicable. +## IO app + +The `intel_avf.IO` app provides a driver for a single RSS queue of a +Virtual Function (see *nqueues*). + +The links are named `input` and `output`. + + DIAGRAM: Intel_avf_IO + +-----------+ + | | + input ---->* IO *----> output + | | + +-----------+ +### Configuration + +— Key **pciaddr** + +*Required*. The PCI address of the NIC as a string. + +— Key **queue** + +*Required*. The queue number of the respective RSS queue, starting from zero. + ## Supported Hardware Ethernet controller [0200]: Intel Corporation Ethernet Virtual Function 700 Series [8086:154c] (rev 02) ## Unsupported features -* Multiple queues per VF. This driver supports a single queue. The spec allows for up to 4 queues. -* RSS with only 1 queue RSS doesn't make sense.
-* Multiple vlans are unsupported, `ip link` can be used to map all traffic to a single vlan. -* Multiple MAC addresses are unsupported, `ip link` can be used to set the mac before snabb startup. +* Multiple vlans are unsupported, `vlan` can be used to strip/insert a single vlan ID. * All of the advanced offload features are unsupported. * 16 byte RX descriptors are unsupported. From b112c94ea5dc8a375acb6bb13d386ea8616c0f56 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 9 Nov 2021 10:16:04 +0000 Subject: [PATCH 175/209] lwaftr: configure internal-interface multicast MAC address for intel_avf --- src/program/lwaftr/setup.lua | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/program/lwaftr/setup.lua b/src/program/lwaftr/setup.lua index 6b5654bbad..51d416f12f 100644 --- a/src/program/lwaftr/setup.lua +++ b/src/program/lwaftr/setup.lua @@ -295,10 +295,14 @@ end function config_intel_avf(c, name, opt, lwconfig) local nqueues = lwutil.num_queues(lwconfig) if lwutil.is_lowest_queue(lwconfig) then + local _, _, queue = lwutil.parse_instance(lwconfig) + local v6_mcast = ipv6:solicited_node_mcast(queue.internal_interface.ip) + local mac_mcast = ethernet:ipv6_mcast(v6_mcast) config.app(c, "IntelAVF_"..opt.pci:gsub("[%.:]", "_"), intel_avf.Intel_avf, { pciaddr = opt.pci, vlan = opt.vlan, - nqueues = nqueues + nqueues = nqueues, + macs = {mac_mcast} }) end config.app(c, name, intel_avf.IO, { @@ -345,7 +349,6 @@ function config_intel_avf_pf(c, name, opt, lwconfig) local avf_opt = { pci = vfpci, queue = opt.queue, - mac = opt.mac, vlan = opt.vlan, ring_buffer_size = opt.ring_buffer_size } From d3a30c6bfc96c8d81afa55f4259a7e90d4a285a8 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 12 Nov 2021 09:47:21 +0000 Subject: [PATCH 176/209] lib.protocol.ethernet: remove ptoi apps.connectx: use lib.macaddress instead of ptoi --- src/apps/mellanox/connectx.lua | 23 +++++++++++------------ src/lib/protocol/ethernet.lua | 12 ------------ 2 
files changed, 11 insertions(+), 24 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index bd633a8a29..ec917da863 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -442,7 +442,7 @@ function ConnectX:new (conf) for mac, rqlist in pairs(macvlan_rqlist[vlan]) do local tid = setup_rss_rxtable(rqlist, tdomain, 1) hca:set_flow_table_entry_macvlan(rxtable, NIC_RX, flow_group_macvlan, index, - FLOW_TABLE, tid, ethernet:ptoi(mac), vlan) + FLOW_TABLE, tid, macaddress:new(mac), vlan) index = index + 1 end end @@ -450,7 +450,7 @@ function ConnectX:new (conf) local flow_group_mcast = hca:create_flow_group_macvlan( rxtable, NIC_RX, index, index + mcast_size - 1, usevlan, 'mcast' ) - local mac_mcast = ethernet:ptoi("01:00:00:00:00:00") + local mac_mcast = macaddress:new("01:00:00:00:00:00") for vlan in pairs(macvlan_rqlist) do local mcast_tirs = {} for mac, rqlist in pairs(macvlan_rqlist[vlan]) do @@ -486,8 +486,7 @@ function ConnectX:new (conf) status = {counter, 2}, -- Link down type = {counter, 0x1000}, -- ethernetCsmacd promisc = {counter, vport_context.promisc_all}, - macaddr = {counter, - macaddress:new(vport_context.permanent_address).bits}, + macaddr = {counter, vport_context.permanent_address.bits}, rxbytes = {counter}, rxpackets = {counter}, rxmcast = {counter}, @@ -967,13 +966,13 @@ function HCA:query_nic_vport_context () :execute() local mac_hi = self:output(0x10+0xF4, 31, 0) local mac_lo = self:output(0x10+0xF8, 31, 0) - local mac_hex = bit.tohex(mac_hi, 4) .. bit.tohex(mac_lo, 8) + local mac = macaddress:new(bit.tohex(mac_hi, 4) .. 
bit.tohex(mac_lo, 8)) return { min_wqe_inline_mode = self:output(0x10+0x00, 26, 24), mtu = self:output(0x10+0x24, 15, 0), promisc_uc = self:output(0x10+0xf0, 31, 31) == 1, promisc_mc = self:output(0x10+0xf0, 30, 30) == 1, promisc_all = self:output(0x10+0xf0, 29, 29) == 1, - permanent_address = mac_hex } + permanent_address = mac } end function HCA:modify_nic_vport_context (mtu, promisc_uc, promisc_mc, promisc_all) @@ -1574,8 +1573,8 @@ end -- Create a DMAC+VLAN flow group. function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, usevlan, mcast) - local dmac = (mcast and ethernet:ptoi("01:00:00:00:00:00")) - or ethernet:ptoi("ff:ff:ff:ff:ff:ff") + local dmac = (mcast and macaddress:new("01:00:00:00:00:00")) + or macaddress:new("ff:ff:ff:ff:ff:ff") self:command("CREATE_FLOW_GROUP", 0x3FC, 0x0C) :input("opcode", 0x00, 31, 16, 0x933) :input("table_type", 0x10, 31, 24, table_type) @@ -1583,8 +1582,8 @@ function HCA:create_flow_group_macvlan (table_id, table_type, start_ix, end_ix, :input("start_ix", 0x1C, 31, 0, start_ix) :input("end_ix", 0x24, 31, 0, end_ix) -- (inclusive) :input("match_criteria", 0x3C, 7, 0, 1) -- match outer headers - :input("dmac0", 0x40 + 0x08, 31, 0, shr(dmac, 16)) - :input("dmac1", 0x40 + 0x0C, 31, 16, band(dmac, 0xFFFF)) + :input("dmac0", 0x40 + 0x08, 31, 0, bswap(dmac:subbits(0,32))) + :input("dmac1", 0x40 + 0x0C, 31, 16, shr(bswap(dmac:subbits(32,48)), 16)) if usevlan then self:input("vlanid", 0x40 + 0x0C, 11, 0, 0xFFF) end @@ -1606,8 +1605,8 @@ function HCA:set_flow_table_entry_macvlan (table_id, table_type, group_id, :input("group_id", 0x40 + 0x04, 31, 0, group_id) :input("action", 0x40 + 0x0C, 15, 0, 4) -- action = FWD_DST :input("dest_list_sz", 0x40 + 0x10, 23, 0, #dest_ids) -- destination list size - :input("dmac0", 0x40 + 0x48, 31, 0, shr(dmac, 16)) - :input("dmac1", 0x40 + 0x4C, 31, 16, band(dmac, 0xFFFF)) + :input("dmac0", 0x40 + 0x48, 31, 0, bswap(dmac:subbits(0,32))) + :input("dmac1", 0x40 + 0x4C, 31, 16, 
shr(bswap(dmac:subbits(32,48)), 16)) :input("vlan", 0x40 + 0x4C, 11, 0, vlanid or 0) for i, dest_id in ipairs(dest_ids) do self:input("dest_type", 0x40 + 0x300 + 0x8*(i-1), 31, 24, dest_type) diff --git a/src/lib/protocol/ethernet.lua b/src/lib/protocol/ethernet.lua index ad8e4dd2db..c874f35acb 100644 --- a/src/lib/protocol/ethernet.lua +++ b/src/lib/protocol/ethernet.lua @@ -66,18 +66,6 @@ function ethernet:ntop (n) return table.concat(p, ":") end --- Convert printable address to integer -function ethernet:ptoi (p) - local n = ethernet:pton(p) - assert(ffi.abi("le")) - return bit.bor(bit.lshift(n[0], 40), - bit.lshift(n[1], 32), - bit.lshift(n[2], 24), - bit.lshift(n[3], 16), - bit.lshift(n[4], 8), - bit.lshift(n[5], 0)) -end - -- Mapping of an IPv6 multicast address to a MAC address per RFC2464, -- section 7 function ethernet:ipv6_mcast(ip) From ee37d77d0613911b5a4fff26d5b2b28bbff3251f Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 6 Sep 2021 13:42:23 +0000 Subject: [PATCH 177/209] apps.intel_avf: add multiqueue/rss support Design based on separate Manager/IO apps design from ConnectX driver. 
(cherry picked from commit 4c84dd0ee09826a273eecdc0974919a0ff3fd1cf) --- src/apps/intel_avf/intel_avf.lua | 618 +++++++++++++----- src/apps/intel_avf/tests/back2back/test.snabb | 36 + 2 files changed, 500 insertions(+), 154 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 52716a282f..79d5a82c00 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -7,6 +7,7 @@ module(..., package.seeall) local ffi = require("ffi") local lib = require("core.lib") +local sync = require("core.sync") local macaddress = require("lib.macaddress") local pci = require("lib.hardware.pci") local register = require("lib.hardware.register") @@ -24,6 +25,7 @@ local MAC_ADDR_BYTE_LEN = 6 Intel_avf = { config = { pciaddr = { required=true }, + nqueues = {}, ring_buffer_size = {default=2048} } } @@ -107,33 +109,51 @@ local virtchnl_msg_t = ffi.typeof([[ ]]) local virtchnl_msg_ptr_t = ffi.typeof("$ *", virtchnl_msg_t) -local virtchnl_q_pair_t = ffi.typeof([[ +local virtchnl_txq_info_t = ffi.typeof([[ struct { uint16_t vsi_id; - uint16_t num_queue_pairs; - uint32_t pad; + uint16_t queue_id; + uint16_t ring_len; + uint16_t deprecated0; + uint64_t dma_ring_addr; + uint64_t deprecated1; + } __attribute__((packed)) +]]) - uint16_t tx_vsi_id; - uint16_t tx_queue_id; - uint16_t tx_ring_len; - uint16_t tx_deprecated0; - uint64_t tx_dma_ring_addr; - uint64_t tx_deprecated1; - - uint16_t rx_vsi_id; - uint16_t rx_queue_id; - uint32_t rx_ring_len; - uint16_t rx_hdr_size; - uint16_t rx_deprecated0; - uint32_t rx_databuffer_size; - uint32_t rx_max_pkt_size; - uint32_t rx_pad0; - uint64_t rx_dma_ring_addr; - uint32_t rx_deprecated1; - uint32_t rx_pad1; +local virtchnl_rxq_info_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t queue_id; + uint32_t ring_len; + uint16_t hdr_size; + uint16_t deprecated0; + uint32_t databuffer_size; + uint32_t max_pkt_size; + uint32_t pad0; + uint64_t dma_ring_addr; + uint32_t deprecated1; + 
uint32_t pad1; } __attribute__((packed)) ]]) -local virtchnl_q_pair_ptr_t = ffi.typeof("$ *", virtchnl_q_pair_t) + +local virtchnl_queue_pair_info_t = ffi.typeof([[ + struct { + /* NOTE: vsi_id and queue_id should be indentical for both queues. */ + $ txq; + $ rxq; + } __attribute__((packed)) +]], virtchnl_txq_info_t, virtchnl_rxq_info_t) + +local virtchnl_queue_config_info_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t num_queue_pairs; + uint32_t pad; + $ qpair[1]; + } __attribute__((packed)) +]], virtchnl_queue_pair_info_t) + +local virtchnl_queue_config_info_ptr_t = ffi.typeof("$ *", virtchnl_queue_config_info_t) local virtchnl_ether_addr_t = ffi.typeof([[ struct { @@ -209,6 +229,7 @@ local virtchnl_rss_key_t = ffi.typeof([[ uint16_t vsi_id; uint16_t key_len; uint8_t key[1]; /* RSS hash key, packed bytes */ + uint8_t pad; } __attribute__((packed)) ]]) local virtchnl_rss_key_ptr_t = ffi.typeof('$*', virtchnl_rss_key_t) @@ -218,6 +239,7 @@ local virtchnl_rss_lut_t = ffi.typeof([[ uint16_t vsi_id; uint16_t lut_entries; uint8_t lut[1]; /* RSS lookup table*/ + uint8_t pad; } __attribute__((packed)) ]]) local virtchnl_rss_lut_ptr_t = ffi.typeof('$*', virtchnl_rss_lut_t) @@ -246,28 +268,161 @@ local mbox_q_t = ffi.typeof([[ ]]) local mbox_q_ptr_t = ffi.typeof('$*', mbox_q_t) -function Intel_avf:init_tx_q() - self.txdesc = ffi.cast(txdesc_ptr_t, - memory.dma_alloc(ffi.sizeof(txdesc_t) * self.ring_buffer_size)) - ffi.fill(self.txdesc, ffi.sizeof(txdesc_t) * self.ring_buffer_size) - self.txqueue = ffi.new("struct packet *[?]", self.ring_buffer_size) - for i=0, self.ring_buffer_size - 1 do - self.txqueue[i] = nil - self.txdesc[i].cmd_type_offset_bsz = 0 +--------------------------------------------------------------- +-- CXQ (Queue pair control object): +-- +-- A "CXQ" is an object that we define to represent a transmit/receive pair. 
+-- +-- CXQs are created and deleted by a "Control" app (Intel_avf) and, +-- in between, they are used by "IO" apps to send and receive packets. +-- +-- The lifecycle of a CXQ is managed using a state machine. This is +-- necessary because we allow Control and IO apps to start in any +-- order, for Control and IO apps to start/stop/restart independently, +-- for multiple IO apps to attempt to attach to the same CXQ, and even +-- for apps to stop in one Snabb process and be started in another +-- one. +-- +-- (This design is lifted from the apps.mellanox.connectx driver.) +-- +--------------------------------------------------------------- + +-- CXQs can be in one of five states: +-- INIT: CXQ is being initialized by the control app +-- FREE: CXQ is ready and available for use by an IO app. +-- IDLE: CXQ is owned by an app, but not actively processing right now. +-- BUSY: CXQ is owned by an app and is currently processing (e.g. push/pull). +-- DEAD: CXQ has been deallocated; IO app must try to open a new one. +-- +-- Once a CXQ is closed it stays in the DEAD state forever. However, a +-- replacement CXQ with the same name can be created and existing IO +-- apps can reattach to that instead. This will rerun the state machine. +-- +-- Here are the valid state transitions & when they occur: +-- +-- App Change Why +-- ---- ----------- -------------------------------------------------------- +-- CTRL none->INIT: Control app starts initialization. +-- CTRL INIT->FREE: Control app completes initialization. +-- IO FREE->IDLE: IO app starts and becomes owner of the CXQ. +-- IO IDLE->FREE: IO app stops and releases the CXQ for future use. +-- IO IDLE->BUSY: IO app starts running a pull/push method. +-- IO BUSY->IDLE: IO app stops running a pull/push method. +-- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) 
+-- +-- These state transitions are *PROHIBITED* for important reasons: +-- +-- App Change Why *PROHIBITED* +-- ------ ----------- -------------------------------------------------------- +-- CTRL BUSY->DEAD Cannot close a CXQ while it is busy (must wait.) +-- IO DEAD->BUSY Cannot use a CXQ that is closed (must check.) +-- * DEAD->* Cannot transition from DEAD (must create new CXQ.) +-- +-- Further notes: +-- +-- Packet buffers for pending DMA (transmit or receive) are freed by +-- the Control app (which can disable DMA first) rather than by the IO +-- app (which shuts down with DMA still active.) +-- +-- Abnormal shutdown of the process hosting the Control app is *not* +-- supported. We just don’t have anywhere to free packets to in that +-- case. + +-- A CXQ is represented by one struct allocated in shared memory. +-- +-- The struct defines the fields in very specific terms so that it can +-- be used directly by the driver code (rather than copying back and +-- forth between the shared memory object and a separate native +-- format.) +local cxq_t = ffi.typeof([[ + struct { + int state[1]; // current state / availability + + // configuration information: + uint32_t qno; // queue number + uint32_t ring_size; // size of rx/tx rings + + // Transmit state + uint32_t tx_next; + uint32_t tx_cand; + uint32_t tx_desc_free; + $ txdesc; + struct packet *txqueue[64*1024]; + + // Receive state + uint32_t rx_tail; + $ rxdesc; + struct packet *rxqueue[64*1024]; + } __attribute((packed)) +]], txdesc_ptr_t, rxdesc_ptr_t) + +-- CXQ states: +local INIT = 0 -- Implicit initial state due to 0 value. +local BUSY = 1 +local IDLE = 2 +local FREE = 3 +local DEAD = 4 + +-- Release CXQ from IO apps after process termination. 
+-- Called from core.main.shutdown +function shutdown(pid) + for _, pciaddr in ipairs(shm.children("/"..pid.."/intel_avf")) do + for _, queue in ipairs(shm.children("/"..pid.."/intel_avf/"..pciaddr)) do + local backlink = "/"..pid.."/intel_avf/"..pciaddr.."/"..queue + local ok, cxq = pcall(shm.open, backlink, cxq_t) + if ok then + -- Allow reclaimation of CXQ + sync.cas(cxq.state, IDLE, FREE) + sync.cas(cxq.state, BUSY, FREE) + shm.unlink(backlink) + end + end end end -function Intel_avf:init_rx_q() - self.rxqueue = ffi.new("struct packet *[?]", self.ring_buffer_size) - self.rxdesc = ffi.cast(rxdesc_ptr_t, - memory.dma_alloc(ffi.sizeof(rxdesc_t) * self.ring_buffer_size), 128) +function Intel_avf:init_cxq (qno) + -- Create a shared memory object for controlling the queue pair + local cxq = shm.create("group/pci/"..self.pciaddress.."/"..qno, cxq_t) + cxq.qno = qno + cxq.ring_size = self.ring_buffer_size + self:init_tx_q(cxq) + self:init_rx_q(cxq) + return cxq +end + +function Intel_avf:free_cxq (cxq) + -- Free packets remaining in TX/RX queues. 
+ for i = 0, cxq.ring_size-1 do + if cxq.txqueue[i] ~= nil then + packet.free(cxq.txqueue[i]) + end + packet.free(cxq.rxqueue[i]) + end + shm.unlink("group/pci/"..self.pciaddress.."/"..cxq.qno) + shm.unmap(cxq) +end + +function Intel_avf:init_tx_q(cxq) + cxq.txdesc = ffi.cast(txdesc_ptr_t, memory.dma_alloc(ffi.sizeof(txdesc_t) * self.ring_buffer_size)) + ffi.fill(cxq.txdesc, ffi.sizeof(txdesc_t) * self.ring_buffer_size) + for i=0, self.ring_buffer_size - 1 do + cxq.txqueue[i] = nil + cxq.txdesc[i].cmd_type_offset_bsz = 0 + end + cxq.tx_next = 0 + cxq.tx_cand = 0 + cxq.tx_desc_free = self.ring_buffer_size - 1 +end +function Intel_avf:init_rx_q(cxq) + cxq.rxdesc = ffi.cast(rxdesc_ptr_t, memory.dma_alloc(ffi.sizeof(rxdesc_t) * self.ring_buffer_size)) for i = 0, self.ring_buffer_size-1 do local p = packet.allocate() - self.rxqueue[i] = p - self.rxdesc[i].read.address = tophysical(p.data) - self.rxdesc[i].write.status_err_type_len = 0 + cxq.rxqueue[i] = p + cxq.rxdesc[i].read.address = tophysical(p.data) + cxq.rxdesc[i].write.status_err_type_len = 0 end + cxq.rx_tail = 0 end function Intel_avf:supported_hardware() @@ -362,135 +517,202 @@ function Intel_avf:mbox_setup_txq() self.r.VF_ATQLEN(bits({ ENABLE = 31 }) + self.mbox.q_len) end -function Intel_avf:mbox_sr_q() - local tt = self:mbox_send_buf(virtchnl_q_pair_ptr_t) +function Intel_avf:mbox_sr_q(cxqs) + local tt = self:mbox_send_buf(virtchnl_queue_config_info_ptr_t) tt.vsi_id = self.vsi_id - tt.num_queue_pairs = 1 - - tt.tx_vsi_id = self.vsi_id - tt.tx_queue_id = self.qno - tt.tx_ring_len = self.ring_buffer_size - tt.tx_dma_ring_addr = tophysical(self.txdesc) - - tt.rx_vsi_id = self.vsi_id - tt.rx_queue_id = self.qno - tt.rx_ring_len = self.ring_buffer_size - -- Only 32 byte rxdescs are supported, at least by the PF driver in - -- centos 7 3.10.0-957.1.3.el7.x86_64 - tt.rx_hdr_size = 32 - tt.rx_databuffer_size = packet.max_payload - tt.rx_max_pkt_size = packet.max_payload - tt.rx_dma_ring_addr = 
tophysical(self.rxdesc) - - self:mbox_sr('VIRTCHNL_OP_CONFIG_VSI_QUEUES', ffi.sizeof(virtchnl_q_pair_t) + 64) + tt.num_queue_pairs = #cxqs + + for i, cxq in ipairs(cxqs) do + tt.qpair[i-1].txq.vsi_id = self.vsi_id + tt.qpair[i-1].txq.queue_id = cxq.qno + tt.qpair[i-1].txq.ring_len = cxq.ring_size + tt.qpair[i-1].txq.dma_ring_addr = tophysical(cxq.txdesc) + + tt.qpair[i-1].rxq.vsi_id = self.vsi_id + tt.qpair[i-1].rxq.queue_id = cxq.qno + tt.qpair[i-1].rxq.ring_len = cxq.ring_size + -- Only 32 byte rxdescs are supported, at least by the PF driver in + -- centos 7 3.10.0-957.1.3.el7.x86_64 + tt.qpair[i-1].rxq.hdr_size = 32 + tt.qpair[i-1].rxq.databuffer_size = packet.max_payload + tt.qpair[i-1].rxq.max_pkt_size = packet.max_payload + tt.qpair[i-1].rxq.dma_ring_addr = tophysical(cxq.rxdesc) + end - self.r.rx_tail = self.r.QRX_TAIL[self.qno] - self.r.tx_tail = self.r.QTX_TAIL[self.qno] - self.rx_tail = 0 - self.r.rx_tail(self.ring_buffer_size - 1) + self:mbox_sr('VIRTCHNL_OP_CONFIG_VSI_QUEUES', + ffi.sizeof(virtchnl_queue_config_info_t) + + ffi.sizeof(virtchnl_queue_pair_info_t) * #cxqs) end -function Intel_avf:mbox_sr_enable_q () +function Intel_avf:mbox_sr_enable_q (nqueues) local tt = self:mbox_send_buf(queue_select_ptr_t) tt.vsi_id = self.vsi_id tt.pad = 0 - tt.rx_queues = bits({ ENABLE = self.qno }) - tt.tx_queues = bits({ ENABLE = self.qno }) + local q_enable_mask = lshift(1, nqueues) - 1 + tt.rx_queues = q_enable_mask + tt.tx_queues = q_enable_mask self:mbox_sr('VIRTCHNL_OP_ENABLE_QUEUES', ffi.sizeof(queue_select_t)) end -function Intel_avf:ringnext (index) - return band(index+1, self.ring_buffer_size - 1) +IO = { + config = { + pciaddr = {required=true}, + queue = {required=true} + } +} + +function IO:new (conf) + local self = setmetatable({}, { __index = IO }) + self.pciaddr = pci.qualified(conf.pciaddr) + self.qno = conf.queue + + -- This is also done in Intel_avf:new() but might not have + -- happened yet. 
+ pci.unbind_device_from_linux(self.pciaddr) + + self.fd = pci.open_pci_resource_unlocked(self.pciaddr, 0) + self.base = pci.map_pci_memory(self.fd) + self.r = {} + Intel_avf.load_registers(self) -- Initialize registers at (self.r.*) + + self.online = false -- True when queue is up and running + self.cxq = nil -- shm object containing queue control information + self.open_throttle = -- Timer to throttle shm open attempts (10ms) + lib.throttle(0.25) + + return self +end + +function IO:stop() + if self.cxq then + assert(sync.cas(self.cxq.state, IDLE, FREE) or + self.cxq.state[0] == DEAD, + "illegal state detected") + self:close() + end +end + +-- Close the queue mapping. +function IO:close () + shm.unlink(self.backlink) + shm.unmap(self.cxq) + self.cxq = nil end -function Intel_avf:reclaim_txdesc () +-- Open the queue mapping. +function IO:open () + local shmpath = "group/pci/"..self.pciaddr.."/"..self.qno + self.backlink = "intel_avf/"..self.pciaddr.."/"..self.qno + if shm.exists(shmpath) then + shm.alias(self.backlink, shmpath) + self.cxq = shm.open(shmpath, cxq_t) + if sync.cas(self.cxq.state, FREE, IDLE) then + -- Select queue tail registers + self.r.rx_tail = self.r.QRX_TAIL[self.cxq.qno] + self.r.tx_tail = self.r.QTX_TAIL[self.cxq.qno] + else + close() -- Queue was not FREE. + end + end +end + +-- Return true on successful activation of the queue. +function IO:activate () + -- If not open then make a request on a regular schedule. + if self.cxq == nil and self.open_throttle() then + self:open() + end + if self.cxq then + -- Careful: Control app may have closed the CXQ. + if sync.cas(self.cxq.state, IDLE, BUSY) then + return true + else + assert(self.cxq.state[0] == DEAD, "illegal state detected") + self:close() + end + end +end + +-- Enter the idle state. 
+function IO:deactivate () + assert(sync.cas(self.cxq.state, BUSY, IDLE)) +end + +function IO:reclaim_txdesc () local RS = bits({ RS = 5 }) local COMPLETE = 15 - while band(self.txdesc[ self:ringnext(self.tx_cand) ].cmd_type_offset_bsz, COMPLETE) == COMPLETE - and self.tx_desc_free < self.ring_buffer_size - 1 do - local c = self.tx_cand - packet.free(self.txqueue[c]) - self.txqueue[c] = nil - self.tx_cand = self:ringnext(self.tx_cand) - self.tx_desc_free = self.tx_desc_free + 1 + local cxq = self.cxq + while band(cxq.txdesc[band(cxq.tx_cand+1, cxq.ring_size-1)].cmd_type_offset_bsz, COMPLETE) == COMPLETE + and cxq.tx_desc_free < cxq.ring_size - 1 do + local c = cxq.tx_cand + packet.free(cxq.txqueue[c]) + cxq.txqueue[c] = nil + cxq.tx_cand = band(cxq.tx_cand+1, cxq.ring_size-1) + cxq.tx_desc_free = cxq.tx_desc_free + 1 end end -function Intel_avf:push () - local li = self.input.input +function IO:transmit (li) if li == nil then return end local RS_EOP = bits({ EOP = 4, RS = 5 }) local SIZE_SHIFT = 34 self:reclaim_txdesc() - while not empty(li) and self.tx_desc_free > 0 do + local cxq = self.cxq + while not empty(li) and cxq.tx_desc_free > 0 do local p = receive(li) -- NB: need to extend size for 4 byte CRC (not clear from the spec.) 
local size = lshift(4ULL+p.length, SIZE_SHIFT) - self.txdesc[ self.tx_next ].address = tophysical(p.data) - self.txqueue[ self.tx_next ] = p - self.txdesc[ self.tx_next ].cmd_type_offset_bsz = RS_EOP + size - self.tx_next = self:ringnext(self.tx_next) - self.tx_desc_free = self.tx_desc_free - 1 + cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) + cxq.txqueue[ cxq.tx_next ] = p + cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = RS_EOP + size + cxq.tx_next = band(cxq.tx_next+1, cxq.ring_size-1) + cxq.tx_desc_free = cxq.tx_desc_free - 1 end C.full_memory_barrier() - self.r.tx_tail(band(self.tx_next, self.ring_buffer_size - 1)) - - if self.sync_stats_throttle() then - self:sync_stats() - end + self.r.tx_tail(band(cxq.tx_next, cxq.ring_size - 1)) end -function Intel_avf:pull() - local lo = self.output.output +function IO:receive (lo) if lo == nil then return end local pkts = 0 - while band(self.rxdesc[self.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do - local p = self.rxqueue[self.rx_tail] - p.length = rshift(self.rxdesc[self.rx_tail].write.status_err_type_len, 38) + local cxq = self.cxq + while band(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do + local p = cxq.rxqueue[cxq.rx_tail] + p.length = rshift(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 38) transmit(lo, p) local np = packet.allocate() - self.rxqueue[self.rx_tail] = np - self.rxdesc[self.rx_tail].read.address = tophysical(np.data) - self.rxdesc[self.rx_tail].write.status_err_type_len = 0 - self.rx_tail = band(self.rx_tail + 1, self.ring_buffer_size-1) + cxq.rxqueue[cxq.rx_tail] = np + cxq.rxdesc[cxq.rx_tail].read.address = tophysical(np.data) + cxq.rxdesc[cxq.rx_tail].write.status_err_type_len = 0 + cxq.rx_tail = band(cxq.rx_tail+1, cxq.ring_size-1) pkts = pkts + 1 end -- This avoids the queue being full / empty when HEAD=TAIL C.full_memory_barrier() - self.r.rx_tail(band(self.rx_tail - 1, self.ring_buffer_size - 
1)) - - if self.sync_stats_throttle() then - self:sync_stats() - end + self.r.rx_tail(band(cxq.rx_tail-1, cxq.ring_size-1)) end -function Intel_avf:sync_stats () - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then - self:mbox_r_stats('async') - end - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF'] then - self:mbox_s_stats() +function IO:push () + if self:activate() then + self:transmit(self.input.input) + self:deactivate() end end -function Intel_avf:flush_stats () - if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then - self:mbox_r_stats() +function IO:pull () + if self:activate() then + self:receive(self.output.output) + self:deactivate() end - self:mbox_s_stats() - self:mbox_r_stats() end -function Intel_avf:rxdrop () return counter.read(self.shm.rxdrop) end -function Intel_avf:txdrop () return counter.read(self.shm.txdrop) end - function Intel_avf:mbox_setup() local dlen = 4096 self.mbox = { @@ -548,7 +770,7 @@ function Intel_avf:mbox_sr(opcode, datalen) return self:mbox_recv(opcode) end -function Intel_avf:mbox_send(opcode, datalen) +function Intel_avf:mbox_send(opcode, datalen, timeout) assert(opcode == 'VIRTCHNL_OP_RESET_VF' or self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF']) @@ -582,9 +804,11 @@ function Intel_avf:mbox_send(opcode, datalen) self.r.VF_ATQT(self.mbox.next_send_idx) lib.waitfor(function() + assert(not (timeout and timeout()), "timeout") return self.r.VF_ATQT() == self.mbox.next_send_idx end) lib.waitfor(function() + assert(not (timeout and timeout()), "timeout") -- 1 == bits({ DescriptorDone = 0 }) -- 2 == bits({ Complete = 1 }) @@ -685,12 +909,13 @@ function Intel_avf:mbox_recv(opcode, async) return ptr end -function Intel_avf:wait_for_vfgen_rstat() +function Intel_avf:wait_for_vfgen_rstat(timeout) -- Constant names stolen from DPDK drivers/net/avf/base/virtchnl.h -- Section 6.1 on page 51 local mask0 = bits( { VIRTCHNL_VFR_COMPLETED = 1 }) local mask1 = bits( { 
VIRTCHNL_VFR_VFACTIVE = 2 }) lib.waitfor(function () + assert(not (timeout and timeout()), "timeout") local v = self.r.VFGEN_RSTAT() return bit.band(mask0, v) == mask0 or bit.band(mask1, v) == mask1 end) @@ -698,15 +923,10 @@ end function Intel_avf:new(conf) local self = { - pciaddress = conf.pciaddr, + pciaddress = pci.qualified(conf.pciaddr), path = pci.path(conf.pciaddr), r = {}, ring_buffer_size = conf.ring_buffer_size, - - tx_next = 0, - tx_cand = 0, - tx_desc_free = conf.ring_buffer_size - 1, - qno = 0, shm = { rxbytes = {counter}, rxpackets = {counter}, @@ -730,7 +950,7 @@ function Intel_avf:new(conf) self = setmetatable(self, { __index = Intel_avf }) self:supported_hardware() - self.fd = pci.open_pci_resource_unlocked(self.pciaddress, 0) + self.fd = pci.open_pci_resource_locked(self.pciaddress, 0) pci.unbind_device_from_linux(self.pciaddress) pci.set_bus_master(self.pciaddress, true) self.base = pci.map_pci_memory(self.fd) @@ -738,29 +958,44 @@ function Intel_avf:new(conf) -- wait for the nic to be ready, setup the mailbox and then reset it -- that way it doesn't matter what state you where given the card - self:wait_for_vfgen_rstat() - self:mbox_setup() - self:reset() - - -- FIXME - -- I haven't worked out why the sleep is required but without it - -- self_mbox_set_version hangs indefinitely - --C.sleep(1) - -- See elaboration in Intel_avf:reset() + lib.waitfor(function () + return pcall(function () + self:wait_for_vfgen_rstat() + self:mbox_setup() + self:reset() -- reset can timeout + end) + end) -- setup the nic for real self:mbox_setup() self:mbox_sr_version() self:mbox_sr_caps() - self:mbox_s_rss() - self:init_tx_q() - self:init_rx_q() + self:mbox_s_rss(conf.nqueues or 1) + + -- Queue setup + self.cxqs = {} + for qno=0, (conf.nqueues or 1) - 1 do + self.cxqs[#self.cxqs+1] = self:init_cxq(qno) + end self:init_irq() self:mbox_sr_irq() - self:mbox_sr_q() - self:mbox_sr_enable_q() + self:mbox_sr_q(self.cxqs) + self:mbox_sr_enable_q(#self.cxqs) + + for _, 
cxq in ipairs(self.cxqs) do + -- CXQ is now fully initialized & ready for attach. + assert(sync.cas(cxq.state, INIT, FREE)) + end + + if not conf.nqueues then + -- If number of queues it not explicitly configured default to + -- old behavior and configure this app to do I/O on a single queue. + self.io = IO:new{pciaddr=self.pciaddress, queue=0} + self.io.input, self.io.output = {}, {} + end + return self end @@ -769,39 +1004,95 @@ function Intel_avf:link() if not shm.exists("pci/"..self.pciaddress) then shm.alias("pci/"..self.pciaddress, "apps/"..self.appname) end + + if self.io then + self.io.input, self.io.output = self.input, self.output + end +end + +function Intel_avf:push () + if self.io then + self.io:push() + end + if self.sync_stats_throttle() then + self:sync_stats() + end end +function Intel_avf:pull () + if self.io then + self.io:pull() + end + if self.sync_stats_throttle() then + self:sync_stats() + end +end + +function Intel_avf:sync_stats () + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then + self:mbox_r_stats('async') + end + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_RESET_VF'] then + self:mbox_s_stats() + end +end + +function Intel_avf:flush_stats () + if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then + self:mbox_r_stats() + end + self:mbox_s_stats() + self:mbox_r_stats() +end + +function Intel_avf:rxdrop () return counter.read(self.shm.rxdrop) end +function Intel_avf:txdrop () return counter.read(self.shm.txdrop) end + function Intel_avf:reset() -- From "Appendix A Virtual Channel Protocol": -- VF sends this request to PF with no parameters PF does NOT respond! VF -- driver must delay then poll VFGEN_RSTAT register until reset completion -- is indicated. The admin queue must be reinitialized after this operation. - self:mbox_send('VIRTCHNL_OP_RESET_VF', 0) + self:mbox_send('VIRTCHNL_OP_RESET_VF', 0, lib.timeout(1)) -- As per the above we (the VF driver) must "delay". 
Sadly, the spec does -- (as of this time / to my knowledge) not give further clues as to how to -- detect that the delay is sufficient. One second turned out to be not -- enough in some cases, two seconds has always worked so far. C.usleep(2e6) - self:wait_for_vfgen_rstat() + self:wait_for_vfgen_rstat(lib.timeout(1)) end function Intel_avf:stop() self:reset() pci.set_bus_master(self.pciaddress, false) pci.close_pci_resource(self.fd, self.base) - -- Free packets remaining in TX/RX queues. - for i = 0, self.ring_buffer_size-1 do - if self.txqueue[i] ~= nil then - packet.free(self.txqueue[i]) - end + -- If we have an embedded IO app, stop it. + if self.io then + self.io:stop() end - for i = 0, self.ring_buffer_size-1 do - packet.free(self.rxqueue[i]) + -- Free packets remaining in TX/RX queues. + for _, cxq in ipairs(self.cxqs) do + local timeout = lib.timeout(3) + lib.waitfor(function () + assert(not timeout(), "Intel_avf: failed to free queue "..tonumber(cxq.qno)) + return sync.cas(cxq.state, FREE, DEAD) or sync.cas(cxq.state, IDLE, DEAD) + end) + self:free_cxq(cxq) end -- Unlink SHM alias. shm.unlink("pci/"..self.pciaddress) end +function Intel_avf:report () + self:flush_stats() + for _, c in ipairs{ + 'rxbytes', 'rxpackets', 'rxmcast', 'rxbcast', 'rxdrop', 'rx_unknown_protocol', + 'txbytes', 'txpackets', 'txmcast', 'txbcast', 'txdrop', 'txerrors' + } do + print((" %-20s %20s"):format(c, lib.comma_value(counter.read(self.shm[c])))) + end +end + function Intel_avf:init_irq() local intv = bit.lshift(20, 5) local v = bit.bor(bits({ ENABLE = 0, CLEARPBA = 1, ITR0 = 3, ITR1 = 4}), intv) @@ -828,13 +1119,32 @@ function Intel_avf:mbox_sr_add_mac() self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', ffi.sizeof(virtchnl_ether_addr_t) + 8) end -function Intel_avf:mbox_s_rss() - -- pg83 - -- Forcefully disable the NICs RSS features. Contrary to the spec, RSS - -- capabilites are turned on by default and need to be disabled (as least - -- under Linux/some NICs.) 
- local tt = self:mbox_send_buf(virtchnl_rss_hena_ptr_t) - self:mbox_sr('VIRTCHNL_OP_SET_RSS_HENA', ffi.sizeof(virtchnl_rss_hena_t)) +function Intel_avf:mbox_s_rss(nqueues) + if nqueues == 1 then + -- pg83 + -- Forcefully disable the NICs RSS features. Contrary to the spec, RSS + -- capabilites are turned on by default and need to be disabled (as least + -- under Linux/some NICs.) + local tt = self:mbox_send_buf(virtchnl_rss_hena_ptr_t) + self:mbox_sr('VIRTCHNL_OP_SET_RSS_HENA', ffi.sizeof(virtchnl_rss_hena_t)) + end + -- Set random RSS key + local tt = self:mbox_send_buf(virtchnl_rss_key_ptr_t) + tt.vsi_id = self.vsi_id + tt.key_len = self.rss_key_size + ffi.copy(tt.key, lib.random_bytes(self.rss_key_size), self.rss_key_size) + self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_KEY', + ffi.sizeof(virtchnl_rss_key_t) + self.rss_key_size-1) + -- Setup LUT + local tt = self:mbox_send_buf(virtchnl_rss_lut_ptr_t) + tt.vsi_id = self.vsi_id + tt.lut_entries = self.rss_lut_size + for i=0, self.rss_lut_size-1 do + tt.lut[i] = i % nqueues -- fill LUT with configured queues + end + self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_LUT', + ffi.sizeof(virtchnl_rss_lut_t) + self.rss_lut_size-1) + end function Intel_avf:mbox_s_stats() diff --git a/src/apps/intel_avf/tests/back2back/test.snabb b/src/apps/intel_avf/tests/back2back/test.snabb index f4ec12c519..67fa2d6979 100755 --- a/src/apps/intel_avf/tests/back2back/test.snabb +++ b/src/apps/intel_avf/tests/back2back/test.snabb @@ -96,7 +96,43 @@ while true do engine.main({ duration = 1, no_report = true }) end engine.report_links() +engine.report_apps() assert(rx("nic1.output", "sink.input") >= tosend, "packets received do not match packets sent") + +-- Test RSS queues +local nqueues = 4 +local c = config.new() +local sizes = {64,128,192,256,384,512,1024,1500} +local packets = {} +for _=1,1000 do packets[#packets+1] = sizes[(#packets%#sizes)+1] end +config.app(c, "synth0", synth.Synth, { + sizes=packets, + src=src, + dst=dst, + random_payload=true +}) 
+config.app(c, "synth1", synth.Synth, { + sizes=packets, + src=dst, + dst=src, + random_payload=true +}) +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, nqueues = nqueues }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, nqueues = nqueues }) +config.app(c, "sink", basic.Sink) +for qno=0, nqueues-1 do + config.app(c, "nic0_io"..qno, intel_avf.IO, {pciaddr = vf0, queue = qno}) + config.link(c, "synth0.output"..qno.. " -> nic0_io"..qno..".input") + config.link(c, "nic0_io"..qno..".output -> sink.input_nic0_io"..qno) + config.app(c, "nic1_io"..qno, intel_avf.IO, {pciaddr = vf1, queue = qno}) + config.link(c, "synth1.output"..qno.. " -> nic1_io"..qno..".input") + config.link(c, "nic1_io"..qno..".output -> sink.input_nic1_io"..qno) +end +engine.configure(c) +engine.main({ duration = 1, no_report = true }) +engine.report_links() +engine.report_apps() + engine.stop() main.exit(0) From 451aeecf60638687c0c5c69ae37fe225a9698cc3 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 8 Sep 2021 12:38:27 +0000 Subject: [PATCH 178/209] apps.intel_avf: add VLAN filter/stripping/insertion support Also: set a required reserved bit in txdesc (not sure if it had an effect) (cherry picked from commit e3dd48ff4a3a5a272fba4ffad611c153a486e195) --- src/apps/intel_avf/intel_avf.lua | 41 +++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 79d5a82c00..54869ef021 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -12,7 +12,7 @@ local macaddress = require("lib.macaddress") local pci = require("lib.hardware.pci") local register = require("lib.hardware.register") local tophysical = core.memory.virtual_to_physical -local band, lshift, rshift = bit.band, bit.lshift, bit.rshift +local band, lshift, rshift, bor = bit.band, bit.lshift, bit.rshift, bit.bor local transmit, receive, empty = link.transmit, link.receive, 
link.empty local counter = require("core.counter") local shm = require("core.shm") @@ -26,6 +26,7 @@ Intel_avf = { config = { pciaddr = { required=true }, nqueues = {}, + vlan = {}, ring_buffer_size = {default=2048} } } @@ -251,6 +252,15 @@ local virtchnl_rss_hena_t = ffi.typeof([[ ]]) local virtchnl_rss_hena_ptr_t = ffi.typeof('$*', virtchnl_rss_hena_t) +local virtchnl_vlan_filter_list_t = ffi.typeof([[ + struct { + uint16_t vsi_id; + uint16_t num_elements; + uint16_t vlan_id[1]; + } __attribute__((packed)) +]]) +local virtchnl_vlan_filter_list_ptr_t = ffi.typeof('$*', virtchnl_vlan_filter_list_t) + local mbox_q_t = ffi.typeof([[ struct { uint8_t flags0; @@ -340,6 +350,7 @@ local cxq_t = ffi.typeof([[ // configuration information: uint32_t qno; // queue number + uint16_t vlan; // 802.1Q vlan tag uint32_t ring_size; // size of rx/tx rings // Transmit state @@ -384,6 +395,7 @@ function Intel_avf:init_cxq (qno) -- Create a shared memory object for controlling the queue pair local cxq = shm.create("group/pci/"..self.pciaddress.."/"..qno, cxq_t) cxq.qno = qno + cxq.vlan = self.vlan or 0 cxq.ring_size = self.ring_buffer_size self:init_tx_q(cxq) self:init_rx_q(cxq) @@ -658,18 +670,19 @@ end function IO:transmit (li) if li == nil then return end - local RS_EOP = bits({ EOP = 4, RS = 5 }) + local cxq = self.cxq + local RS_EOP = bor(bits({ EOP = 4, RS = 5, RSV = 6 }), (cxq.vlan>0 and bits{ IL2TAG1 = 7}) or 0) + local L2TAG1 = lshift(0ULL+cxq.vlan, 48) local SIZE_SHIFT = 34 self:reclaim_txdesc() - local cxq = self.cxq while not empty(li) and cxq.tx_desc_free > 0 do local p = receive(li) -- NB: need to extend size for 4 byte CRC (not clear from the spec.) 
local size = lshift(4ULL+p.length, SIZE_SHIFT) cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) cxq.txqueue[ cxq.tx_next ] = p - cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = RS_EOP + size + cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP, size, L2TAG1) cxq.tx_next = band(cxq.tx_next+1, cxq.ring_size-1) cxq.tx_desc_free = cxq.tx_desc_free - 1 end @@ -744,7 +757,7 @@ function Intel_avf:mbox_setup() VIRTCHNL_OP_DISABLE_QUEUES = 9, -- VIRTCHNL_OP_ADD_ETH_ADDR = 10, -- VIRTCHNL_OP_DEL_ETH_ADDR = 11, - -- VIRTCHNL_OP_ADD_VLAN = 12, + VIRTCHNL_OP_ADD_VLAN = 12, -- VIRTCHNL_OP_DEL_VLAN = 13, -- VIRTCHNL_OP_CONFIG_PROMISCUOUS_MODE = 14, VIRTCHNL_OP_GET_STATS = 15, @@ -841,7 +854,7 @@ function Intel_avf:mbox_sr_caps() -- dpdk/drivers/net/avf/avf_vchnl.c local supported_caps = bits({ VIRTCHNL_VF_OFFLOAD_L2 = 0, - VIRTCHNL_VF_OFFLOAD_VLAN = 16, + VIRTCHNL_VF_OFFLOAD_VLAN = 16, -- NB: Could leave this bit off and let PF handle VLANs VIRTCHNL_VF_OFFLOAD_RX_POLLING = 17, VIRTCHNL_VF_OFFLOAD_RSS_PF = 19 }) @@ -925,6 +938,7 @@ function Intel_avf:new(conf) local self = { pciaddress = pci.qualified(conf.pciaddr), path = pci.path(conf.pciaddr), + vlan = conf.vlan, r = {}, ring_buffer_size = conf.ring_buffer_size, shm = { @@ -970,7 +984,10 @@ function Intel_avf:new(conf) self:mbox_setup() self:mbox_sr_version() self:mbox_sr_caps() - self:mbox_s_rss(conf.nqueues or 1) + self:mbox_sr_rss(conf.nqueues or 1) + if self.vlan then + self:mbox_sr_vlan() + end -- Queue setup self.cxqs = {} @@ -1119,7 +1136,7 @@ function Intel_avf:mbox_sr_add_mac() self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', ffi.sizeof(virtchnl_ether_addr_t) + 8) end -function Intel_avf:mbox_s_rss(nqueues) +function Intel_avf:mbox_sr_rss(nqueues) if nqueues == 1 then -- pg83 -- Forcefully disable the NICs RSS features. 
Contrary to the spec, RSS @@ -1144,7 +1161,15 @@ function Intel_avf:mbox_s_rss(nqueues) end self:mbox_sr('VIRTCHNL_OP_CONFIG_RSS_LUT', ffi.sizeof(virtchnl_rss_lut_t) + self.rss_lut_size-1) +end +function Intel_avf:mbox_sr_vlan() + local tt = self:mbox_send_buf(virtchnl_vlan_filter_list_ptr_t) + tt.vsi_id = self.vsi_id + tt.num_elements = 1 + tt.vlan_id[0] = self.vlan + self:mbox_sr('VIRTCHNL_OP_ADD_VLAN', + ffi.sizeof(virtchnl_vlan_filter_list_t) + ffi.sizeof("uint16_t")*1) end function Intel_avf:mbox_s_stats() From ce533cf73dfe03205b564d732cb5b4af39447ceb Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 10 Sep 2021 15:44:17 +0000 Subject: [PATCH 179/209] lib.hardware.pci: add Intel X710 device info (cherry picked from commit 448d626aad725b8f5f5f4c729c0fe0b6edc3120a) --- src/lib/hardware/pci.lua | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index bfe91ede56..6ad59cf7bf 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -67,6 +67,7 @@ model = { ["X520"] = 'Intel X520', ["i350"] = 'Intel 350', ["i210"] = 'Intel 210', + ["X710"] = 'Intel X710', ["XL710_VF"] = 'Intel XL710/X710 Virtual Function', ["AVF"] = 'Intel AVF' } @@ -85,6 +86,7 @@ local cards = { ["0x157b"] = {model = model["i210"], driver = 'apps.intel_mp.intel_mp'}, ["0x154c"] = {model = model["XL710_VF"], driver = 'apps.intel_avf.intel_avf'}, ["0x1889"] = {model = model["AVF"], driver = 'apps.intel_avf.intel_avf'}, + ["0x1572"] = {model = model["X710"], driver = nil}, }, ["0x1924"] = { ["0x0903"] = {model = 'SFN7122F', driver = 'apps.solarflare.solarflare'} From 3a39748b6619c13cc7dbade5bbf4be9805baba39 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 16 Sep 2021 10:19:36 +0000 Subject: [PATCH 180/209] intel_avf: move bits() constructor out of fast-path lib.bits uses pairs which is a JIT NYI, leading to split traces and GC activity due to snapshotting. 
(cherry picked from commit 80557cfe891d6c8a7b8ad49264cfc3d1b09808e2) --- src/apps/intel_avf/intel_avf.lua | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 54869ef021..108a0c4a4e 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -652,9 +652,9 @@ function IO:deactivate () assert(sync.cas(self.cxq.state, BUSY, IDLE)) end +local RS = bits({ RS = 5 }) +local COMPLETE = 15 function IO:reclaim_txdesc () - local RS = bits({ RS = 5 }) - local COMPLETE = 15 local cxq = self.cxq while band(cxq.txdesc[band(cxq.tx_cand+1, cxq.ring_size-1)].cmd_type_offset_bsz, COMPLETE) == COMPLETE @@ -667,11 +667,13 @@ function IO:reclaim_txdesc () end end +local RS_EOP = bits{ EOP = 4, RS = 5, RSV = 6 } +local IL2TAG1 = bits{ IL2TAG1 = 7} function IO:transmit (li) if li == nil then return end local cxq = self.cxq - local RS_EOP = bor(bits({ EOP = 4, RS = 5, RSV = 6 }), (cxq.vlan>0 and bits{ IL2TAG1 = 7}) or 0) + local RS_EOP_IL2TAG1 = bor(RS_EOP, (cxq.vlan>0 and IL2TAG1) or 0) local L2TAG1 = lshift(0ULL+cxq.vlan, 48) local SIZE_SHIFT = 34 @@ -682,7 +684,7 @@ function IO:transmit (li) local size = lshift(4ULL+p.length, SIZE_SHIFT) cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) cxq.txqueue[ cxq.tx_next ] = p - cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP, size, L2TAG1) + cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP_IL2TAG1, size, L2TAG1) cxq.tx_next = band(cxq.tx_next+1, cxq.ring_size-1) cxq.tx_desc_free = cxq.tx_desc_free - 1 end From b2ac1c3a9976ad2e99354425dea618af0d49bce3 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 22 Sep 2021 12:47:05 +0000 Subject: [PATCH 181/209] apps.intel_avf: move device stats to pci/ (cherry picked from commit fa553a6e824c4f2591791f4a0e0f8cc50a026241) --- src/apps/intel_avf/intel_avf.lua | 77 ++++++++++++++++---------------- 1 file changed, 39 insertions(+), 38 deletions(-) diff 
--git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 108a0c4a4e..c41ea28ccf 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -943,22 +943,25 @@ function Intel_avf:new(conf) vlan = conf.vlan, r = {}, ring_buffer_size = conf.ring_buffer_size, - shm = { - rxbytes = {counter}, - rxpackets = {counter}, - rxmcast = {counter}, - rxbcast = {counter}, - rxdrop = {counter}, - rx_unknown_protocol = {counter}, - txbytes = {counter}, - txpackets = {counter}, - txmcast = {counter}, - txbcast = {counter}, - txdrop = {counter}, - txerrors = {counter} - }, sync_stats_throttle = lib.throttle(1) } + -- PCI device statistics + local frame = { + macaddr = {counter}, + rxbytes = {counter}, + rxpackets = {counter}, + rxmcast = {counter}, + rxbcast = {counter}, + rxdrop = {counter}, + rxerrors = {counter}, + txbytes = {counter}, + txpackets = {counter}, + txmcast = {counter}, + txbcast = {counter}, + txdrop = {counter}, + txerrors = {counter} + } + self.stats = shm.create_frame("pci/"..self.pciaddress, frame) -- pg79 /* number of descriptors, multiple of 32 */ assert(self.ring_buffer_size % 32 == 0, @@ -990,6 +993,9 @@ function Intel_avf:new(conf) if self.vlan then self:mbox_sr_vlan() end + + -- publish device MAC address to SHM + counter.set(self.stats.macaddr, self.mac.bits) -- Queue setup self.cxqs = {} @@ -1019,11 +1025,6 @@ function Intel_avf:new(conf) end function Intel_avf:link() - -- Alias SHM frame to canonical location. 
- if not shm.exists("pci/"..self.pciaddress) then - shm.alias("pci/"..self.pciaddress, "apps/"..self.appname) - end - if self.io then self.io.input, self.io.output = self.input, self.output end @@ -1056,7 +1057,7 @@ function Intel_avf:sync_stats () end end -function Intel_avf:flush_stats () +function Intel_avf:flush_stats () if self.mbox.state == self.mbox.opcodes['VIRTCHNL_OP_GET_STATS'] then self:mbox_r_stats() end @@ -1064,8 +1065,8 @@ function Intel_avf:flush_stats () self:mbox_r_stats() end -function Intel_avf:rxdrop () return counter.read(self.shm.rxdrop) end -function Intel_avf:txdrop () return counter.read(self.shm.txdrop) end +function Intel_avf:rxdrop () return counter.read(self.stats.rxdrop) end +function Intel_avf:txdrop () return counter.read(self.stats.txdrop) end function Intel_avf:reset() -- From "Appendix A Virtual Channel Protocol": @@ -1098,17 +1099,17 @@ function Intel_avf:stop() end) self:free_cxq(cxq) end - -- Unlink SHM alias. + -- Unlink stats frame. shm.unlink("pci/"..self.pciaddress) end function Intel_avf:report () self:flush_stats() for _, c in ipairs{ - 'rxbytes', 'rxpackets', 'rxmcast', 'rxbcast', 'rxdrop', 'rx_unknown_protocol', + 'rxbytes', 'rxpackets', 'rxmcast', 'rxbcast', 'rxdrop', 'rxdrop', 'txbytes', 'txpackets', 'txmcast', 'txbcast', 'txdrop', 'txerrors' } do - print((" %-20s %20s"):format(c, lib.comma_value(counter.read(self.shm[c])))) + print((" %-20s %20s"):format(c, lib.comma_value(counter.read(self.stats[c])))) end end @@ -1187,18 +1188,18 @@ function Intel_avf:mbox_r_stats(async) local stats = ffi.cast(eth_stats_ptr_t, ret) local set = counter.set - set(self.shm.rxbytes, stats.rx_bytes) - set(self.shm.rxpackets, stats.rx_unicast) - set(self.shm.rxmcast, stats.rx_multicast) - set(self.shm.rxbcast, stats.rx_broadcast) - set(self.shm.rxdrop, stats.rx_discards) - set(self.shm.rx_unknown_protocol, stats.rx_unknown_protocol) - - set(self.shm.txbytes, stats.tx_bytes) - set(self.shm.txpackets, stats.tx_unicast) - 
set(self.shm.txmcast, stats.tx_multicast) - set(self.shm.txbcast, stats.tx_broadcast) - set(self.shm.txdrop, stats.tx_discards) - set(self.shm.txerrors, stats.tx_errors) + set(self.stats.rxbytes, stats.rx_bytes) + set(self.stats.rxpackets, stats.rx_unicast) + set(self.stats.rxmcast, stats.rx_multicast) + set(self.stats.rxbcast, stats.rx_broadcast) + set(self.stats.rxdrop, stats.rx_discards) + set(self.stats.rxdrop, stats.rx_unknown_protocol) + + set(self.stats.txbytes, stats.tx_bytes) + set(self.stats.txpackets, stats.tx_unicast) + set(self.stats.txmcast, stats.tx_multicast) + set(self.stats.txbcast, stats.tx_broadcast) + set(self.stats.txdrop, stats.tx_discards) + set(self.stats.txerrors, stats.tx_errors) end From 7566bcf3c8f3f62031eb316ad9ad3105d09af6dc Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 22 Sep 2021 12:48:02 +0000 Subject: [PATCH 182/209] snabb top: handle devices that do not specify speed (cherry picked from commit 85428eab2214562db3b7b96dec4c71e3eb678ca5) --- src/program/top/top.lua | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/program/top/top.lua b/src/program/top/top.lua index 38c6d2ea20..23e4a99d4e 100644 --- a/src/program/top/top.lua +++ b/src/program/top/top.lua @@ -628,7 +628,7 @@ function compute_display_tree.interface(tree, prev, dt, t) rchars('%s:', tag:upper()), lchars('%.3f %sPPS', scale(pps)), lchars('%.3f %sbps', scale(bps)), - lchars('%.2f%%', bps/max*100), + max > 0 and lchars('%.2f%%', bps/max*100) or nil, drops > 0 and rchars('%.3f %sPPS dropped', scale(drops)) or nil) end local function show_pci(addr, pci, prev) @@ -636,7 +636,8 @@ function compute_display_tree.interface(tree, prev, dt, t) gridrow(rchars('| '), lchars('')) gridrow(rchars('\\-'), rchars('%s:', addr), - lchars('%d %sbE, MAC: %s', bps, tag, + lchars('%sMAC: %s', + (bps > 0 and ("%d %sbE, "):format(bps, tag)) or '', macaddr_string(tonumber(pci.macaddr and pci.macaddr.value) or 0))) show_traffic('rx', pci, prev) 
show_traffic('tx', pci, prev) From dae106165f22d07709f235c2d9bc1225114238c6 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 5 Nov 2021 13:32:21 +0000 Subject: [PATCH 183/209] apps.intel_avf: fix IRQ setup for additional queues (cherry picked from commit 8c44e614d805173f7fa081edf49f9ae0c557bac0) --- src/apps/intel_avf/intel_avf.lua | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index c41ea28ccf..144167fff1 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -1004,7 +1004,7 @@ function Intel_avf:new(conf) end self:init_irq() - self:mbox_sr_irq() + self:mbox_sr_irq(conf.nqueues or 1) self:mbox_sr_q(self.cxqs) self:mbox_sr_enable_q(#self.cxqs) @@ -1121,12 +1121,12 @@ function Intel_avf:init_irq() self.r.VFINT_DYN_CTLN[0](v) end -function Intel_avf:mbox_sr_irq() +function Intel_avf:mbox_sr_irq(nqueues) local tt = self:mbox_send_buf(virtchnl_irq_map_info_ptr_t) tt.num_vectors = 1 tt.vsi_id = self.vsi_id tt.vector_id = 0 - tt.rxq_map = 1 + tt.rxq_map = 2^nqueues-1 -- disable interrupts for all queues self:mbox_sr("VIRTCHNL_OP_CONFIG_IRQ_MAP", ffi.sizeof(virtchnl_irq_map_info_t) + 12) end From 5ccf479820ae0cd1daa224f9ac4e27c3e08eb4bd Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 9 Nov 2021 10:14:56 +0000 Subject: [PATCH 184/209] apps.intel_avf: add option to add additional macs (cherry picked from commit 37cfb3625262c64a5706a5556b776f9184f8d47e) --- src/apps/intel_avf/intel_avf.lua | 35 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 144167fff1..d45fd4f375 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -27,6 +27,7 @@ Intel_avf = { pciaddr = { required=true }, nqueues = {}, vlan = {}, + macs = {default={}}, ring_buffer_size = {default=2048} } } @@ -156,15 +157,20 
@@ local virtchnl_queue_config_info_t = ffi.typeof([[ local virtchnl_queue_config_info_ptr_t = ffi.typeof("$ *", virtchnl_queue_config_info_t) -local virtchnl_ether_addr_t = ffi.typeof([[ +local virtchnl_ether_addr_t = ffi.typeof[[ struct { - uint16_t vsi; - uint16_t num_elements; uint8_t addr[6]; // MAC_ADDR_BYTE_LEN uint8_t pad[2]; } __attribute__((packed)) -]]) -local virtchnl_ether_addr_ptr_t = ffi.typeof("$ *", virtchnl_ether_addr_t) +]] +local virtchnl_ether_addr_list_t = ffi.typeof([[ + struct { + uint16_t vsi; + uint16_t num_elements; + $ list[1]; + } __attribute__((packed)) +]], virtchnl_ether_addr_t) +local virtchnl_ether_addr_list_ptr_t = ffi.typeof("$ *", virtchnl_ether_addr_list_t) local eth_stats_t = ffi.typeof([[ struct { @@ -757,7 +763,7 @@ function Intel_avf:mbox_setup() VIRTCHNL_OP_CONFIG_IRQ_MAP = 7, VIRTCHNL_OP_ENABLE_QUEUES = 8, VIRTCHNL_OP_DISABLE_QUEUES = 9, - -- VIRTCHNL_OP_ADD_ETH_ADDR = 10, + VIRTCHNL_OP_ADD_ETH_ADDR = 10, -- VIRTCHNL_OP_DEL_ETH_ADDR = 11, VIRTCHNL_OP_ADD_VLAN = 12, -- VIRTCHNL_OP_DEL_VLAN = 13, @@ -990,6 +996,9 @@ function Intel_avf:new(conf) self:mbox_sr_version() self:mbox_sr_caps() self:mbox_sr_rss(conf.nqueues or 1) + if #conf.macs > 0 then + self:mbox_sr_add_mac(conf.macs) + end if self.vlan then self:mbox_sr_vlan() end @@ -1130,13 +1139,17 @@ function Intel_avf:mbox_sr_irq(nqueues) self:mbox_sr("VIRTCHNL_OP_CONFIG_IRQ_MAP", ffi.sizeof(virtchnl_irq_map_info_t) + 12) end -function Intel_avf:mbox_sr_add_mac() +function Intel_avf:mbox_sr_add_mac(macs) -- pg81 - local tt = self:mbox_send_buf(virtchnl_ether_addr_ptr_t) + local tt = self:mbox_send_buf(virtchnl_ether_addr_list_ptr_t) tt.vsi = self.vsi_id - tt.num_elements = 1 - ffi.copy(tt.addr, self.mac, MAC_ADDR_BYTE_LEN) - self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', ffi.sizeof(virtchnl_ether_addr_t) + 8) + tt.num_elements = #macs + for i, mac in ipairs(macs) do + ffi.copy(tt.list[i-1].addr, mac, MAC_ADDR_BYTE_LEN) + end + self:mbox_sr('VIRTCHNL_OP_ADD_ETH_ADDR', + 
ffi.sizeof(virtchnl_ether_addr_list_t) + + ffi.sizeof(virtchnl_ether_addr_t) * #macs) end function Intel_avf:mbox_sr_rss(nqueues) From d22af4f19e5a7a58d9b60b3b4660d8307ff21d0d Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 9 Nov 2021 10:15:12 +0000 Subject: [PATCH 185/209] apps.intel_avf: update README.md (cherry picked from commit 2bfba71328fb0a33dec9c895f42e872e599dd444) --- src/apps/intel_avf/README.md | 43 ++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/src/apps/intel_avf/README.md b/src/apps/intel_avf/README.md index 72a49b1193..9eccd78f36 100644 --- a/src/apps/intel_avf/README.md +++ b/src/apps/intel_avf/README.md @@ -18,20 +18,55 @@ The links are named `input` and `output`. *Required*. The PCI address of the NIC as a string. +— Key **vlan** + +*Optional*. VLAN id used for filtering packets. If specified, VLAN tags are +stripped for incoming packets and inserted for outgoing packets. + +— Key **macs** + +*Optional*. Additional unicast or multicast MACs to listen to. +The default is the empty array `{}`. + +— Key **nqueues** + +*Optional*. Number of RSS queues to configure. If specified you need to use +the `intel_avf.IO` app to attach for I/O for each respective queue. + — Key **ring_buffer_size** *Optional*. Number of DMA descriptors to use i.e. size of the DMA transmit and receive queues. Must be a multiple of 128. Default is not specified but assumed to be broadly applicable. +## IO app + +The `intel_avf.IO` app provides a driver for a single RSS queue of a +Virtual Function (see *nqueues*). + +The links are names `input` and `output`. + + DIAGRAM: Intel_avf_IO + +-----------+ + | | + input ---->* IO *----> output + | | + +-----------+ +### Configuration + +— Key **pciaddr** + +*Required*. The PCI address of the NIC as a string. + +— Key **queue** + +*Required*. The queue number of the respective RSS queue, starting from zero. 
+ ## Supported Hardware Ethernet controller [0200]: Intel Corporation Ethernet Virtual Function 700 Series [8086:154c] (rev 02) ## Unsupported features -* Multiple queues per VF. This driver supports a single queue. The spec allows for up to 4 queues. -* RSS with only 1 queue RSS doesn't make sense. -* Multiple vlans are unsupported, `ip link` can be used to map all traffic to a single vlan. -* Multiple MAC addresses are unsupported, `ip link` can be used to set the mac before snabb startup. +* Multiple vlans are unsupported, `vlan` can be used to strip/insert a single vlan ID. * All of the advanced offload features are unsupported. * 16 byte RX descriptors are unsupported. From dc40426510e3c89aa50725454750acdbbcf6a83a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 14 Aug 2017 22:47:50 +0200 Subject: [PATCH 186/209] apps.test.synth: add packets option --- src/apps/test/README.md | 5 +++++ src/apps/test/synth.lua | 36 ++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/apps/test/README.md b/src/apps/test/README.md index 6d4ef5bb31..28dda5b634 100644 --- a/src/apps/test/README.md +++ b/src/apps/test/README.md @@ -79,6 +79,11 @@ Generate a random payload for each packet in `sizes`. Insert the packet number (32bit uint) directly after the ethertype. The packet number starts at 0 and is sequential on each output link. +— Key **packets** + +Emit *packets* (an array of *packets*) instead of synthesizing packets. When +this option is used *src*, *dst*, *sizes*, and *random_payload* are ignored. + ## Npackets (apps.test.npackets) The `Npackets` app allows are most N packets to flow through it. 
Any further diff --git a/src/apps/test/synth.lua b/src/apps/test/synth.lua index c75ebd2e93..6bf18ed1c9 100644 --- a/src/apps/test/synth.lua +++ b/src/apps/test/synth.lua @@ -15,28 +15,32 @@ Synth = { dst = {default='00:00:00:00:00:00'}, random_payload = { default = false }, packet_id = { default = false }, + packets = {} } } function Synth:new (conf) assert(#conf.sizes >= 1, "Needs at least one size.") - local packets = {} - for i, size in ipairs(conf.sizes) do - local payload_size = size - ethernet:sizeof() - assert(payload_size >= 0 and payload_size <= 1536, - "Invalid payload size: "..payload_size) - local data - if conf.random_payload then - data = lib.random_bytes(payload_size) - else - data = ffi.new("char[?]", payload_size) + local packets = conf.packets + if not packets then + packets = {} + for i, size in ipairs(conf.sizes) do + local payload_size = size - ethernet:sizeof() + assert(payload_size >= 0 and payload_size <= 1536, + "Invalid payload size: "..payload_size) + local data + if conf.random_payload then + data = lib.random_bytes(payload_size) + else + data = ffi.new("char[?]", payload_size) + end + local dgram = datagram:new(packet.from_pointer(data, payload_size)) + local ether = ethernet:new({ src = ethernet:pton(conf.src), + dst = ethernet:pton(conf.dst), + type = payload_size }) + dgram:push(ether) + packets[i] = dgram:packet() end - local dgram = datagram:new(packet.from_pointer(data, payload_size)) - local ether = ethernet:new({ src = ethernet:pton(conf.src), - dst = ethernet:pton(conf.dst), - type = payload_size }) - dgram:push(ether) - packets[i] = dgram:packet() end return setmetatable({packets=packets}, {__index=Synth}) end From 355051b03a24e5fdf4887b2b35938a4f8deca330 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 30 Nov 2021 15:09:45 +0000 Subject: [PATCH 187/209] intel_avf: truncate CRC from received packets --- src/apps/intel_avf/intel_avf.lua | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index d45fd4f375..c74c2c1eb3 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -705,7 +705,8 @@ function IO:receive (lo) local cxq = self.cxq while band(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do local p = cxq.rxqueue[cxq.rx_tail] - p.length = rshift(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 38) + -- NB: truncate 4 byte CRC + p.length = rshift(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 38) - 4 transmit(lo, p) local np = packet.allocate() From d3fd3e722d9dd7f8243b24b836456de8e5e20a8a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 30 Nov 2021 15:11:04 +0000 Subject: [PATCH 188/209] =?UTF-8?q?intel=5Favf:=20more=20robust=20HENA?= =?UTF-8?q?=C2=A0configuration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/apps/intel_avf/intel_avf.lua | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index c74c2c1eb3..3fff291bc6 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -1154,14 +1154,19 @@ function Intel_avf:mbox_sr_add_mac(macs) end function Intel_avf:mbox_sr_rss(nqueues) + -- Setup HENA + local tt = self:mbox_send_buf(virtchnl_rss_hena_ptr_t) if nqueues == 1 then -- pg83 -- Forcefully disable the NICs RSS features. Contrary to the spec, RSS -- capabilites are turned on by default and need to be disabled (as least -- under Linux/some NICs.) 
- local tt = self:mbox_send_buf(virtchnl_rss_hena_ptr_t) - self:mbox_sr('VIRTCHNL_OP_SET_RSS_HENA', ffi.sizeof(virtchnl_rss_hena_t)) + tt.hena = 0 + else + -- Enable all + tt.hena = 0xffffffffffffffffULL end + self:mbox_sr('VIRTCHNL_OP_SET_RSS_HENA', ffi.sizeof(virtchnl_rss_hena_t)) -- Set random RSS key local tt = self:mbox_send_buf(virtchnl_rss_key_ptr_t) tt.vsi_id = self.vsi_id From 991acd98e521647789e33f706bed38c7b3782d13 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 30 Nov 2021 15:14:37 +0000 Subject: [PATCH 189/209] intel_avf: fix tests, separate RSS tests, add VLAN, multicast tests --- .../intel_avf/tests/back2back/multicast.snabb | 116 ++++++++++++++++ src/apps/intel_avf/tests/back2back/rss.snabb | 125 ++++++++++++++++++ src/apps/intel_avf/tests/back2back/test.snabb | 38 +----- 3 files changed, 243 insertions(+), 36 deletions(-) create mode 100755 src/apps/intel_avf/tests/back2back/multicast.snabb create mode 100755 src/apps/intel_avf/tests/back2back/rss.snabb diff --git a/src/apps/intel_avf/tests/back2back/multicast.snabb b/src/apps/intel_avf/tests/back2back/multicast.snabb new file mode 100755 index 0000000000..42054c1831 --- /dev/null +++ b/src/apps/intel_avf/tests/back2back/multicast.snabb @@ -0,0 +1,116 @@ +#!../../../../snabb snsh +local vf0 = os.getenv("SNABB_AVF_PF0_VF0") +local vf1 = os.getenv("SNABB_AVF_PF1_VF0") or os.getenv("SNABB_AVF_PF0_VF1") + +assert(vf0 ~= nil, "SNABB_AVF_PF0_VF0 is nil") +assert(vf1 ~= nil, "SNABB_AVF_PF1_VF0 is nil") + +local src = os.getenv("SNABB_AVF_PF0_SRC0") +local dst = os.getenv("SNABB_AVF_PF1_DST0") or os.getenv("SNABB_AVF_PF0_DST1") + +assert(src ~= nil, "SNABB_AVF_SRC0 is nil") +assert(dst ~= nil, "SNABB_AVF_DST1 is nil") + +local packet_count = 1001 + +local basic = require("apps.basic.basic_apps") +local intel_avf = require("apps.intel_avf.intel_avf") +local match = require("apps.test.match") +local npackets = require("apps.test.npackets") +local synth = require("apps.test.synth") +local counter = 
require("core.counter") + +-- Broadcast + +local c = config.new() +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst="ff:ff:ff:ff:ff:ff", + random_payload = true +} ) +config.app(c, "tee", basic.Tee) +config.app(c, "match", match.Match) + +config.app(c, "npackets", npackets.Npackets, { npackets = packet_count }) +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1 }) + +config.link(c, "synth.output -> npackets.input") +config.link(c, "npackets.output -> tee.input") +config.link(c, "tee.output1 -> nic0.input") +config.link(c, "nic1.output -> match.rx") +config.link(c, "tee.output2 -> match.comparator") + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +function rx(l1, l2) + return counter.read(engine.link_table[l1 .. " -> " .. l2].stats.rxpackets) +end +function assert_eq(a,b,msg) + local an = tonumber(a) + local bn = tonumber(b) + assert(an == bn, msg .. " " .. an .. " ~= " .. 
bn) +end + +local s = rx("tee.output1", "nic0.input") +local r = rx("nic1.output", "match.rx") +assert_eq(s, r, "packets_sr_1") + +n0:flush_stats() +n1:flush_stats() +assert_eq(counter.read(n0.stats.txbcast), counter.read(n1.stats.rxbcast), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txbcast), packet_count, "mbox_sr_stats_2") + +local m = engine.app_table['match'] +assert(#m:errors() == 0, "Corrupt packets.") + +engine.configure(config.new()) + +-- Multicast + +local ethernet = require("lib.protocol.ethernet") +local ipv6 = require("lib.protocol.ipv6") + +local v6_mcast = ipv6:solicited_node_mcast(ipv6:pton("fd10::1")) +local mac_mcast = ethernet:ipv6_mcast(v6_mcast) + +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=ethernet:ntop(mac_mcast), + random_payload = true +} ) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, macs = {mac_mcast} }) + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +local s = rx("tee.output1", "nic0.input") +local r = rx("nic1.output", "match.rx") +assert_eq(s, r, "packets_sr_1") + +n0:flush_stats() +n1:flush_stats() +assert_eq(counter.read(n0.stats.txmcast), counter.read(n1.stats.rxmcast), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txmcast), packet_count, "mbox_sr_stats_2") + +local m = engine.app_table['match'] +assert(#m:errors() == 0, "Corrupt packets.") \ No newline at end of file diff --git a/src/apps/intel_avf/tests/back2back/rss.snabb b/src/apps/intel_avf/tests/back2back/rss.snabb new file mode 100755 index 0000000000..e2ef6b7c06 --- /dev/null +++ b/src/apps/intel_avf/tests/back2back/rss.snabb @@ -0,0 +1,125 @@ +#!../../../../snabb snsh +local vf0 = os.getenv("SNABB_AVF_PF0_VF0") +local vf1 = os.getenv("SNABB_AVF_PF1_VF0") or os.getenv("SNABB_AVF_PF0_VF1") + 
+assert(vf0 ~= nil, "SNABB_AVF_PF0_VF0 is nil") +assert(vf1 ~= nil, "SNABB_AVF_PF1_VF0 is nil") + +local src = os.getenv("SNABB_AVF_PF0_SRC0") +local dst = os.getenv("SNABB_AVF_PF1_DST0") or os.getenv("SNABB_AVF_PF0_DST1") + +assert(src ~= nil, "SNABB_AVF_SRC0 is nil") +assert(dst ~= nil, "SNABB_AVF_DST1 is nil") + +local basic = require("apps.basic.basic_apps") +local intel_avf = require("apps.intel_avf.intel_avf") +local synth = require("apps.test.synth") +local counter = require("core.counter") + +-- Test RSS queues +local nqueues = 4 +local c = config.new() +local sizes = {64,128,192,256,384,512,1024,1500} +local function random_v4_packets (conf) + local lib = require("core.lib") + local ethernet = require("lib.protocol.ethernet") + local ipv4 = require("lib.protocol.ipv4") + local eth = ethernet:new{src = ethernet:pton(conf.src), + dst = ethernet:pton(conf.dst), + type = 0x0800} + local packets = {} + for _, size in ipairs(conf.sizes) do + for _=1,10 do + local ip = ipv4:new{src=lib.random_bytes(4), + dst=lib.random_bytes(4)} + ip:total_length(size - eth:sizeof()) + local payload_length = ip:total_length() - ip:sizeof() + local p = packet.allocate() + packet.append(p, eth:header(), eth:sizeof()) + packet.append(p, ip:header(), ip:sizeof()) + packet.append(p, lib.random_bytes(payload_length), payload_length) + table.insert(packets, p) + end + end + return packets +end + +config.app(c, "synth0", synth.Synth, { + packets = random_v4_packets{ + sizes=sizes, + src=src, + dst=dst + } +}) +config.app(c, "synth1", synth.Synth, { + packets = random_v4_packets{ + sizes=sizes, + src=dst, + dst=src + } +}) +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, nqueues = nqueues }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, nqueues = nqueues }) +config.app(c, "sink", basic.Sink) +for qno=0, nqueues-1 do + config.app(c, "nic0_io"..qno, intel_avf.IO, {pciaddr = vf0, queue = qno}) + config.link(c, "synth0.output"..qno.. 
" -> nic0_io"..qno..".input") + config.link(c, "nic0_io"..qno..".output -> sink.input_nic0_io"..qno) + config.app(c, "nic1_io"..qno, intel_avf.IO, {pciaddr = vf1, queue = qno}) + config.link(c, "synth1.output"..qno.. " -> nic1_io"..qno..".input") + config.link(c, "nic1_io"..qno..".output -> sink.input_nic1_io"..qno) +end +engine.configure(c) +engine.main({ duration = 1, no_report = true }) +engine.report_links() +engine.report_apps() + +local received = {} +for nic=0,1 do + for qno=0, nqueues-1 do + local output = engine.app_table["nic"..nic.."_io"..qno].output.output + received[#received+1] = tonumber(counter.read(output.stats.txpackets)) + end +end + +local function mean (values) + local sum = 0 + for _, value in ipairs(values) do + sum = sum + value + end + return sum / #values +end + +local function stdev (values) + local avg = mean(values) + local var = {} + for _, value in ipairs(values) do + var[#var+1] = (value-avg)^2 + end + return math.sqrt(mean(var)) +end + +local rx_mean, rx_sd = mean(received), stdev(received) +print("RX AVG", rx_mean, "SD", rx_sd) +assert(rx_sd/rx_mean <= 0.1, "SD exceeds 10% of mean (queues should receive roughly equal numbers of packets)") + +-- Exercise VLANs + +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, nqueues = nqueues, vlan = 42 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, nqueues = nqueues, vlan = 42 }) +engine.configure(c) +engine.main({ duration = 1, no_report = true }) +engine.report_links() +engine.report_apps() + +local received = {} +for nic=0,1 do + for qno=0, nqueues-1 do + local output = engine.app_table["nic"..nic.."_io"..qno].output.output + received[#received+1] = tonumber(counter.read(output.stats.txpackets)) + end +end + +local rx_mean, rx_sd = mean(received), stdev(received) +print("RX AVG", rx_mean, "SD", rx_sd) +assert(rx_sd/rx_mean <= 0.1, "SD exceeds 10% of mean (queues should receive roughly equal numbers of packets)") \ No newline at end of file diff --git 
a/src/apps/intel_avf/tests/back2back/test.snabb b/src/apps/intel_avf/tests/back2back/test.snabb index 67fa2d6979..19014c239e 100755 --- a/src/apps/intel_avf/tests/back2back/test.snabb +++ b/src/apps/intel_avf/tests/back2back/test.snabb @@ -68,8 +68,8 @@ assert_eq(s, r, "packets_sr_1") n0:flush_stats() n1:flush_stats() -assert_eq(counter.read(n0.shm.txpackets), counter.read(n1.shm.rxpackets), "mxbox_sr_stats_1") -assert_eq(counter.read(n0.shm.txpackets), packet_count, "mbox_sr_stats_2") +assert_eq(counter.read(n0.stats.txpackets), counter.read(n1.stats.rxpackets), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txpackets), packet_count, "mbox_sr_stats_2") local m = engine.app_table['match'] assert(#m:errors() == 0, "Corrupt packets.") @@ -100,39 +100,5 @@ engine.report_apps() assert(rx("nic1.output", "sink.input") >= tosend, "packets received do not match packets sent") --- Test RSS queues -local nqueues = 4 -local c = config.new() -local sizes = {64,128,192,256,384,512,1024,1500} -local packets = {} -for _=1,1000 do packets[#packets+1] = sizes[(#packets%#sizes)+1] end -config.app(c, "synth0", synth.Synth, { - sizes=packets, - src=src, - dst=dst, - random_payload=true -}) -config.app(c, "synth1", synth.Synth, { - sizes=packets, - src=dst, - dst=src, - random_payload=true -}) -config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, nqueues = nqueues }) -config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, nqueues = nqueues }) -config.app(c, "sink", basic.Sink) -for qno=0, nqueues-1 do - config.app(c, "nic0_io"..qno, intel_avf.IO, {pciaddr = vf0, queue = qno}) - config.link(c, "synth0.output"..qno.. " -> nic0_io"..qno..".input") - config.link(c, "nic0_io"..qno..".output -> sink.input_nic0_io"..qno) - config.app(c, "nic1_io"..qno, intel_avf.IO, {pciaddr = vf1, queue = qno}) - config.link(c, "synth1.output"..qno.. 
" -> nic1_io"..qno..".input") - config.link(c, "nic1_io"..qno..".output -> sink.input_nic1_io"..qno) -end -engine.configure(c) -engine.main({ duration = 1, no_report = true }) -engine.report_links() -engine.report_apps() - engine.stop() main.exit(0) From 6845091b222252ea9692c0c6014605cba9443554 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 1 Dec 2021 11:23:37 +0000 Subject: [PATCH 190/209] intel_avf: add explicit VLAN test case --- src/apps/intel_avf/tests/back2back/vlan.snabb | 107 ++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100755 src/apps/intel_avf/tests/back2back/vlan.snabb diff --git a/src/apps/intel_avf/tests/back2back/vlan.snabb b/src/apps/intel_avf/tests/back2back/vlan.snabb new file mode 100755 index 0000000000..b27b68d2c6 --- /dev/null +++ b/src/apps/intel_avf/tests/back2back/vlan.snabb @@ -0,0 +1,107 @@ +#!../../../../snabb snsh +local vf0 = os.getenv("SNABB_AVF_PF0_VF0") +local vf1 = os.getenv("SNABB_AVF_PF1_VF0") or os.getenv("SNABB_AVF_PF0_VF1") + +assert(vf0 ~= nil, "SNABB_AVF_PF0_VF0 is nil") +assert(vf1 ~= nil, "SNABB_AVF_PF1_VF0 is nil") + +local src = os.getenv("SNABB_AVF_PF0_SRC0") +local dst = os.getenv("SNABB_AVF_PF1_DST0") or os.getenv("SNABB_AVF_PF0_DST1") + +assert(src ~= nil, "SNABB_AVF_SRC0 is nil") +assert(dst ~= nil, "SNABB_AVF_DST1 is nil") + +local packet_count = 1001 + +local basic = require("apps.basic.basic_apps") +local intel_avf = require("apps.intel_avf.intel_avf") +local match = require("apps.test.match") +local npackets = require("apps.test.npackets") +local synth = require("apps.test.synth") +local counter = require("core.counter") + +local c = config.new() +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=dst, + random_payload = true +} ) +config.app(c, "tee", basic.Tee) +config.app(c, "match", match.Match) + +config.app(c, "npackets", npackets.Npackets, { npackets = packet_count }) +config.app(c, "nic0", intel_avf.Intel_avf, { 
pciaddr = vf0, vlan = 1 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, vlan = 1 }) + +config.link(c, "synth.output -> npackets.input") +config.link(c, "npackets.output -> tee.input") +config.link(c, "tee.output1 -> nic0.input") +config.link(c, "nic1.output -> match.rx") +config.link(c, "tee.output2 -> match.comparator") + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +function rx(l1, l2) + return counter.read(engine.link_table[l1 .. " -> " .. l2].stats.rxpackets) +end +function assert_eq(a,b,msg) + local an = tonumber(a) + local bn = tonumber(b) + assert(an == bn, msg .. " " .. an .. " ~= " .. bn) +end + +local s = rx("tee.output1", "nic0.input") +local r = rx("nic1.output", "match.rx") +assert_eq(s, r, "packets_sr_1") + +n0:flush_stats() +n1:flush_stats() +assert_eq(counter.read(n0.stats.txpackets), counter.read(n1.stats.rxpackets), "mxbox_sr_stats_1") +assert_eq(counter.read(n0.stats.txpackets), packet_count, "mbox_sr_stats_2") + +local m = engine.app_table['match'] +assert(#m:errors() == 0, "Corrupt packets.") + +-- Check VLAN filtering + +local c = config.new() +config.app(c, "synth", synth.Synth, { + sizes = {64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=dst, + random_payload = true +} ) + +config.app(c, "nic0", intel_avf.Intel_avf, { pciaddr = vf0, vlan = 1 }) +config.app(c, "nic1", intel_avf.Intel_avf, { pciaddr = vf1, vlan = 2 }) +config.app(c, "sink", basic.Sink) + +config.link(c, "synth.output -> nic0.input") +config.link(c, "nic1.output -> sink.input") + +engine.configure(c) + +local n0 = engine.app_table['nic0'] +local n1 = engine.app_table['nic1'] +n0:flush_stats() +n1:flush_stats() + +engine.main({duration = 1, report = false}) +engine.report_links() +engine.report_apps() + +n0:flush_stats() +n1:flush_stats() + 
+assert(counter.read(n0.stats.txpackets) > 0, "No packets sent") +assert(counter.read(n1.stats.rxpackets) == 0, "Should not receive from other VLAN") From 9c83c2561cb43d46bc88cb81e8cdd7386cba851d Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 2 Dec 2021 12:29:28 +0000 Subject: [PATCH 191/209] intel_avf: fix rx/tx packet lengths This fixes a bug originally introduced in fece51f0 where packets were sent with 4 bytes too many as a trailer. Reverts the hence bogus 355051b0. The original bug I believe comes from a misunderstood shift truncation which is fixed by a 0ULL+ before shift. The original 4ULL+ "fix" was likely a fever dream of mine where I thought AVF expects us to specify the packet size+4 (i.e., include space for the CRC?), and only fixed the real issue (shift truncation) by accident. --- src/apps/intel_avf/intel_avf.lua | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/apps/intel_avf/intel_avf.lua b/src/apps/intel_avf/intel_avf.lua index 3fff291bc6..eacadcc407 100644 --- a/src/apps/intel_avf/intel_avf.lua +++ b/src/apps/intel_avf/intel_avf.lua @@ -686,8 +686,7 @@ function IO:transmit (li) self:reclaim_txdesc() while not empty(li) and cxq.tx_desc_free > 0 do local p = receive(li) - -- NB: need to extend size for 4 byte CRC (not clear from the spec.) 
- local size = lshift(4ULL+p.length, SIZE_SHIFT) + local size = lshift(0ULL+p.length, SIZE_SHIFT) -- NB: extend to 64 bit before shift cxq.txdesc[ cxq.tx_next ].address = tophysical(p.data) cxq.txqueue[ cxq.tx_next ] = p cxq.txdesc[ cxq.tx_next ].cmd_type_offset_bsz = bor(RS_EOP_IL2TAG1, size, L2TAG1) @@ -705,8 +704,7 @@ function IO:receive (lo) local cxq = self.cxq while band(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 0x01) == 1 and pkts < engine.pull_npackets do local p = cxq.rxqueue[cxq.rx_tail] - -- NB: truncate 4 byte CRC - p.length = rshift(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 38) - 4 + p.length = rshift(cxq.rxdesc[cxq.rx_tail].write.status_err_type_len, 38) transmit(lo, p) local np = packet.allocate() From c1220bed5f720d3facad1a4373d9bec5a1474a01 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 3 Dec 2021 13:56:28 +0000 Subject: [PATCH 192/209] apps.mellanox: add ConnectX 6 type --- src/apps/mellanox/connectx.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index ec917da863..480e682b89 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -199,6 +199,7 @@ local mlx_types = { ["0x1013" ] = 4, -- ConnectX4 ["0x1017" ] = 5, -- ConnectX5 ["0x1019" ] = 5, -- ConnectX5 + ["0x101d" ] = 6, -- ConnectX6 } function ConnectX:new (conf) From 5b68b01f3d6d9e5e69491cc1abf9b0b416f68026 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 3 Dec 2021 13:58:09 +0000 Subject: [PATCH 193/209] apps.mellanox.connectx_test: add multicast and checks --- src/apps/mellanox/connectx_test.lua | 63 ++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 42b8d6f36e..36b754b328 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -67,8 +67,10 @@ function switch (pci0, pci1, npackets, ncores, minlen, 
maxlen, minburst, maxburs -- MAC destination local r = math.random() - if r < 0.10 then -- 10% of packets are broadcast + if r < 0.05 then -- 5% of packets are broadcast ffi.fill(p.data, 6, 0xFF) + elseif r < 0.10 then -- 5% of packets are multicast + p.data[0], p.data[1] = 0x33, 0x33 -- "locally administered" multicast elseif r < 0.20 then -- 10% are unicast to random destinations for i = 1, 5 do p.data[i] = math.random(256) - 1 end else -- rest are unicast to known mac @@ -185,22 +187,50 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs print(("%-16s %20s %20s"):format(k, lib.comma_value(stat0[k]), lib.comma_value(stat1[k]))) end + local received = {[pci0]={}, [pci1]={}} print(("@@ %16s; %12s; %12s; %12s; %12s; %12s; %12s; %12s"):format( "nic", "link", "txpkt", "txbyte", "txdrop", "rxpkt", "rxbyte", "rxdrop")) - for id in pairs(io0) do + -- Sort into key order + local t = {} + for k in pairs(io0) do table.insert(t, k) end + table.sort(t) + for _, id in pairs(t) do local function prlink (nic, id, app) local function count (cnt) return tonumber(counter.read(cnt)) end - local srx = app.input.input.stats - local stx = app.output.output.stats + local stx = app.input.input.stats + local srx = app.output.output.stats print(("@@ %16s; %12s; %12d; %12d; %12d; %12d; %12d; %12d"):format( nic, id, - count(srx.txpackets), count(srx.txbytes), count(srx.txdrop), - count(stx.txpackets), count(stx.txbytes), count(stx.txdrop))) + count(stx.txpackets), count(stx.txbytes), count(stx.txdrop), + count(srx.txpackets), count(srx.txbytes), count(srx.txdrop))) + received[nic][#received[nic]+1] = count(srx.txpackets) end prlink(pci0, id, io0[id]) prlink(pci1, id, io1[id]) end print(("time: %.1fs - Mpps: %.3f per NIC"):format(finish-start, npackets/1e6/(finish-start))) + + print("hardware counter check") + assert(stat0.tx_ucast_packets+stat0.tx_mcast_packets+stat0.tx_bcast_packets == npackets, "0: sent too little") + 
assert(stat1.tx_ucast_packets+stat1.tx_mcast_packets+stat1.tx_bcast_packets == npackets, "1: sent too little") + assert(stat0.tx_ucast_packets == stat1.rx_ucast_packets, "0.tx_ucast != 1.rx_ucast") + assert(stat1.tx_ucast_packets == stat0.rx_ucast_packets, "1.tx_ucast != 0.rx_ucast") + assert(stat0.tx_mcast_packets*macs == stat1.rx_mcast_packets, "0.tx_mcast*macs != 1.rx_mcast") + assert(stat1.tx_mcast_packets*macs == stat0.rx_mcast_packets, "1.tx_mcast*macs != 0.rx_mcast") + assert(stat0.tx_bcast_packets*macs == stat1.rx_bcast_packets, "0.tx_bcast*macs != 1.rx_bcast") + assert(stat1.tx_bcast_packets*macs == stat0.rx_bcast_packets, "1.tx_bcast*macs != 0.rx_bcast") + + for _, nic in pairs{pci0, pci1} do + local sum, avg, sd = sum(received[nic]), mean(received[nic]), stdev(received[nic]) + print(("RX check %s sum=%d avg=%.1f sd=%.1f") + :format(nic, sum, avg, sd)) + -- expect some slack because we send 10% to random MACs + assert(sum >= npackets*.8, "received too little") + -- expect more packets on queues 0 because we send 10% mcast, + -- but mostly even distribution of packets + assert(sd / avg < .2, "uneven packet distribution") + end + print("selftest: done") end @@ -213,6 +243,27 @@ function between (min, max) end end +function sum (values) + local sum = 0 + for _, value in ipairs(values) do + sum = sum + value + end + return sum +end + +function mean (values) + return sum(values) / #values +end + +function stdev (values) + local avg = mean(values) + local var = {} + for _, value in ipairs(values) do + var[#var+1] = (value-avg)^2 + end + return math.sqrt(mean(var)) +end + function selftest () local pci0 = os.getenv("SNABB_PCI_CONNECTX_0") local pci1 = os.getenv("SNABB_PCI_CONNECTX_1") From cc59d6456371a56e03823af39c356f328aed92cb Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 6 Dec 2021 13:18:46 +0000 Subject: [PATCH 194/209] apps.mellanox.connectx: implement stop() --- src/apps/mellanox/connectx.lua | 62 +++++++++++++++++++++++++++++----- 1 file 
changed, 54 insertions(+), 8 deletions(-) diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 480e682b89..9a9b2e1184 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -98,7 +98,8 @@ local rqt_max_size = 128 -- IO IDLE->FREE: IO app stops and releases the CXQ for future use. -- IO IDLE->BUSY: IO app starts running a pull/push method. -- IO BUSY->IDLE: IO app stops running a pull/push method. --- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) NYI +-- CTRL IDLE->DEAD: Control app closes the CXQ. (Replacement can be created.) +-- CTRL FREE->DEAD: Control app closes the CXQ. (Replacement can be created.) -- -- These state transitions are *PROHIBITED* for important reasons: -- @@ -171,6 +172,12 @@ local DEAD = 4 function shutdown(pid) for _, pciaddr in ipairs(shm.children("/"..pid.."/mellanox")) do for _, queue in ipairs(shm.children("/"..pid.."/mellanox/"..pciaddr)) do + -- NB: this iterates the backlinks created by IO apps! + -- Meaning, this cleans up CXQ attachments from dying IO apps. + -- The actual CXQ objects are cleaned up in the process running + -- the Control app (see ConnectX:stop()). + -- The code below is just to make sure crashing IO apps do not block + -- the Control app. 
local backlink = "/"..pid.."/mellanox/"..pciaddr.."/"..queue local shm_name = "/"..pid.."/group/pci/"..pciaddr.."/"..queue if shm.exists(shm_name) then @@ -266,6 +273,9 @@ function ConnectX:new (conf) local tdomain = hca:alloc_transport_domain() local rlkey = hca:query_rlkey() + -- CXQ objects managed by this control app + local cxq_shm = {} + -- List of all receive queues for hashing traffic across local rqlist = {} local rqs = {} @@ -282,7 +292,9 @@ function ConnectX:new (conf) for _, queue in ipairs(conf.queues) do -- Create a shared memory object for controlling the queue pair - local cxq = shm.create("group/pci/"..pciaddress.."/"..queue.id, cxq_t) + local shmpath = "group/pci/"..pciaddress.."/"..queue.id + local cxq = shm.create(shmpath, cxq_t) + cxq_shm[shmpath] = cxq local function check_qsize (type, size) assert(check_pow2(size), @@ -568,11 +580,38 @@ function ConnectX:new (conf) end self.sync_timer = lib.throttle(1) + function free_cxq (cxq) + -- Force CXQ state -> DEAD + local timeout = lib.timeout(2) + lib.waitfor(function () + assert(not timeout(), "ConnectX: failed to close CXQ.") + return sync.cas(cxq.state, IDLE, DEAD) + or sync.cas(cxq.state, FREE, DEAD) + end) + -- Reclaim packets + for idx=0, cxq.rqsize-1 do + if cxq.rx[idx] ~= nil then + packet.free(cxq.rx[idx]) + cxq.rx[idx] = nil + end + end + for idx=0, cxq.sqsize-1 do + if cxq.tx[idx] ~= nil then + packet.free(cxq.tx[idx]) + cxq.tx[idx] = nil + end + end + end + function self:stop () pci.set_bus_master(pciaddress, false) pci.reset_device(pciaddress) pci.close_pci_resource(fd, mmio) mmio, fd = nil + for shmpath, cxq in pairs(cxq_shm) do + free_cxq(cxq) + shm.unlink(shmpath) + end end function self:pull () @@ -1284,6 +1323,11 @@ function IO:new (conf) end end + -- Detach from the NIC. 
+ function self:stop () + close() + end + return self end @@ -1354,6 +1398,10 @@ function RQ:new (cxq) link.transmit(l, p) cxq.rx[idx] = nil elseif opcode == 13 or opcode == 14 then + -- Error on receive + assert(cxq.rx[idx] ~= nil) + packet.free(cxq.rx[idx]) + cxq.rx[idx] = nil local syndromes = { [0x1] = "Local_Length_Error", [0x4] = "Local_Protection_Error", @@ -1366,12 +1414,8 @@ function RQ:new (cxq) [0x14] = "Remote_Operation_Error" } local syndrome = c.u8[0x37] - print(("Got error. opcode=%d syndrome=0x%x message=%s"):format( - opcode, syndrome, syndromes[syndromes])) -- XXX - -- Error on receive - assert(packets[idx] ~= nil) - packet.free(packets[idx]) - packets[idx] = nil + error(("Got error. opcode=%d syndrome=0x%x message=%s") + :format(opcode, syndrome, syndromes[syndromes])) else error(("Unexpected CQE opcode: %d (0x%x)"):format(opcode, opcode)) end @@ -2388,6 +2432,8 @@ function selftest () nic0:stop() nic1:stop() + io0:stop() + io1:stop() if (stat0.tx_ucast_packets == bursts*each and stat0.tx_ucast_octets == bursts*each*octets and stat1.tx_ucast_packets == bursts*each and stat1.tx_ucast_octets == bursts*each*octets) then From ad4c941cfc2638e4bd8b619ff951e59705822930 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 6 Dec 2021 13:19:37 +0000 Subject: [PATCH 195/209] apps.mellanox.connectx_test: exercise stop()... 
...and multiple switch() configs --- src/apps/mellanox/connectx_test.lua | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 36b754b328..35cae77fe3 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -215,10 +215,10 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs assert(stat1.tx_ucast_packets+stat1.tx_mcast_packets+stat1.tx_bcast_packets == npackets, "1: sent too little") assert(stat0.tx_ucast_packets == stat1.rx_ucast_packets, "0.tx_ucast != 1.rx_ucast") assert(stat1.tx_ucast_packets == stat0.rx_ucast_packets, "1.tx_ucast != 0.rx_ucast") - assert(stat0.tx_mcast_packets*macs == stat1.rx_mcast_packets, "0.tx_mcast*macs != 1.rx_mcast") - assert(stat1.tx_mcast_packets*macs == stat0.rx_mcast_packets, "1.tx_mcast*macs != 0.rx_mcast") - assert(stat0.tx_bcast_packets*macs == stat1.rx_bcast_packets, "0.tx_bcast*macs != 1.rx_bcast") - assert(stat1.tx_bcast_packets*macs == stat0.rx_bcast_packets, "1.tx_bcast*macs != 0.rx_bcast") + assert(stat0.tx_mcast_packets*2 == stat1.rx_mcast_packets, "0.tx_mcast*2 != 1.rx_mcast") + assert(stat1.tx_mcast_packets*2 == stat0.rx_mcast_packets, "1.tx_mcast*2 != 0.rx_mcast") + assert(stat0.tx_bcast_packets*2 == stat1.rx_bcast_packets, "0.tx_bcast*2 != 1.rx_bcast") + assert(stat1.tx_bcast_packets*2 == stat0.rx_bcast_packets, "1.tx_bcast*2 != 0.rx_bcast") for _, nic in pairs{pci0, pci1} do local sum, avg, sd = sum(received[nic]), mean(received[nic]), stdev(received[nic]) @@ -231,6 +231,17 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs assert(sd / avg < .2, "uneven packet distribution") end + nic0:stop() + nic1:stop() + for _, queue in ipairs(queues) do + io0[queue.id]:stop() + link.free(io0[queue.id].input.input, ("input-%s-%s" ):format(pci0, queue.id)) + link.free(io0[queue.id].output.output, ("output-%s-%s" 
):format(pci0, queue.id)) + io1[queue.id]:stop() + link.free(io1[queue.id].input.input, ("input-%s-%s" ):format(pci1, queue.id)) + link.free(io1[queue.id].output.output, ("output-%s-%s" ):format(pci1, queue.id)) + end + print("selftest: done") end @@ -272,5 +283,7 @@ function selftest () os.exit(engine.test_skipped_code) end switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 2, 2, 4) + switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 1, 2, 8) + switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 4, 1, 4) end From d36174e00047dbbc6d3f091b620aa3b1fd5980de Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 6 Dec 2021 14:49:36 +0000 Subject: [PATCH 196/209] apps.mellanox.connectx_test: add match test --- src/apps/mellanox/connectx_test.lua | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index 35cae77fe3..cba693ddcd 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -275,6 +275,59 @@ function stdev (values) return math.sqrt(mean(var)) end +function basic_match (pci0, pci1) + print("selftest: connectx_test match") + + local packet_count = 1001 + local src, dst = "00:00:00:00:00:01", "00:00:00:00:00:02" + + local basic = require("apps.basic.basic_apps") + local match = require("apps.test.match") + local npackets = require("apps.test.npackets") + local synth = require("apps.test.synth") + local counter = require("core.counter") + + local c = config.new() + config.app(c, "synth", synth.Synth, { + sizes={64,67,128,133,192,256,384,512,777,1024}, + src=src, + dst=dst, + random_payload=true + }) + config.app(c, "tee", basic.Tee) + config.app(c, "match", match.Match) + config.app(c, "npackets", npackets.Npackets, {npackets=packet_count}) + config.app(c, "nic0", connectx.ConnectX, { + pciaddress=pci0, + queues={{id="io0", mac=src}} + }) + config.app(c, "io0", connectx.IO, {pciaddress=pci0, queue="io0"}) + config.app(c, "nic1", 
connectx.ConnectX, { + pciaddress=pci1, + queues={{id="io1", mac=dst}} + }) + config.app(c, "io1", connectx.IO, {pciaddress=pci1, queue="io1"}) + + config.link(c, "synth.output -> npackets.input") + config.link(c, "npackets.output -> tee.input") + config.link(c, "tee.output1 -> io0.input") + config.link(c, "io1.output -> match.rx") + config.link(c, "tee.output2 -> match.comparator") + + engine.configure(c) + + engine.main({duration = 1, report = false}) + engine.report_links() + engine.report_apps() + + local m = engine.app_table['match'] + assert(#m:errors() == 0, "Corrupt packets.") + + engine.configure(config.new()) + + print("selftest: done") +end + function selftest () local pci0 = os.getenv("SNABB_PCI_CONNECTX_0") local pci1 = os.getenv("SNABB_PCI_CONNECTX_1") @@ -282,6 +335,7 @@ function selftest () print("SNABB_PCI_CONNECTX_0 and SNABB_PCI_CONNECTX_1 must be set. Skipping selftest.") os.exit(engine.test_skipped_code) end + basic_match(pci0, pci1) switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 2, 2, 4) switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 1, 2, 8) switch(pci0, pci1, 10e6, 1, 60, 1500, 100, 100, 4, 1, 4) From bf1322e74578b9be01f74aad01bea1c3cc49e11b Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 6 Dec 2021 15:13:14 +0000 Subject: [PATCH 197/209] apps.mellanox: add README --- src/apps/mellanox/README.md | 79 +++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) create mode 100644 src/apps/mellanox/README.md diff --git a/src/apps/mellanox/README.md b/src/apps/mellanox/README.md new file mode 100644 index 0000000000..f348c9759c --- /dev/null +++ b/src/apps/mellanox/README.md @@ -0,0 +1,79 @@ +# Mellanox Connect-X app (apps.mellanox.connectx) + +The `connectx.ConnectX` app provides a driver for +Mellanox Connect-X 4, 5, and 6 series network cards. + +The links are named `input` and `output`. 
+ + DIAGRAM: ConnectX + +-----------+ + | | + input ---->* ConnectX *----> output + | | + +-----------+ + +## Configuration + +— Key **pciaddress** + +*Required*. The PCI address of the NIC as a string. + +— Key **queues** + +*Required*. Array of RX/TX queue specifications. +You need to use the `connectx.IO` app to attach for I/O on each respective queue. +A queue specification is a table with the following keys: + + * `id`—a unique queue identifier string + * `vlan`—an optional VLAN identifier + * `mac`—an optional MAC address as a string + (either none or all queues must specify a MAC) + +Multiple queues with matching `vlan`/`mac` identifiers will have incoming traffic +distributed between them via 3-tuple or 5-tuple RSS. +Multicast and broadcast traffic arrives on the first queue of each RSS group. + +— Key **mtu** + +*Optional.* MTU configured for the device. The default is 9500. + +— Key **sendq_size** + +— Key **recvq_size** + +*Optional*. Sizes of the send and receive queues. The default is 1024. + + +## IO app + +The `connectx.IO` app provides a driver for a single queue of a +Mellanox Connect-X network card (see *queues*). + +The links are named `input` and `output`. + + DIAGRAM: connectx_IO + +-----------+ + | | + input ---->* IO *----> output + | | + +-----------+ +### Configuration + +— Key **pciaddress** + +*Required*. The PCI address of the NIC as a string. + +— Key **queue** + +*Required*. The queue identifier of the respective queue. + +## Supported Hardware + +This driver has been confirmed to work with +Mellanox Connect-X 4, 5, and 6 series cards. 
+ +## Unsupported features + +* VLAN promiscuous mode is not supported + (i.e., queues that specify `vlan` but no `mac`) +* Local-loopback between queues is not implemented \ No newline at end of file From 22f950cd96434e1041a63d96d117f197007847da Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Fri, 10 Dec 2021 14:08:43 +0000 Subject: [PATCH 198/209] lib.hardware.pci: add mellanox MT2892 Family [ConnectX-6 Dx] model --- src/lib/hardware/pci.lua | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index 6618b793b1..7f3fa20c94 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -93,6 +93,7 @@ local cards = { ["0x1013" ] = {model = 'MT27700', driver = 'apps.mellanox.connectx'}, ["0x1017" ] = {model = 'MT27800', driver = 'apps.mellanox.connectx'}, ["0x1019" ] = {model = 'MT28800', driver = 'apps.mellanox.connectx'}, + ["0x101d" ] = {model = 'MT2892', driver = 'apps.mellanox.connectx'}, }, } From ffb0ba950b0c5cf1a0a4b76b9341e920263c0284 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Mon, 12 Jul 2021 11:32:05 +0000 Subject: [PATCH 199/209] apps.xdp: only run rxtx_match selftest if SNABB_XDP_NQUEUES=1 --- src/apps/xdp/xdp.lua | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index 52c9a5606c..d74738cc53 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -746,8 +746,10 @@ function selftest () selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) print("test: duplex") selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) - print("test: rxtx_match") - selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + if nqueues == 1 then + print("test: rxtx_match") + selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) + end if nqueues > 1 then print("test: share_interface") selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) From 1a5f7c47fcc2560e1d1d21be85a9d763b7aa35cc Mon Sep 17 00:00:00 
2001 From: Max Rottenkolber Date: Mon, 12 Jul 2021 11:48:28 +0000 Subject: [PATCH 200/209] apps.xdp: mark stop() NYI / not supported --- src/apps/xdp/xdp.lua | 61 ++++++++------------------------------------ 1 file changed, 10 insertions(+), 51 deletions(-) diff --git a/src/apps/xdp/xdp.lua b/src/apps/xdp/xdp.lua index d74738cc53..bbd6829313 100644 --- a/src/apps/xdp/xdp.lua +++ b/src/apps/xdp/xdp.lua @@ -595,49 +595,8 @@ end -- Instance methods function XDP:stop () - -- Close socket. - self.sock:close() - -- Reclaim packet buffers left on rings. - -- - -- Problem: we need a way to tell apart which packets buffers on the - -- (write-only) tx and fill rings need to be freed, and which packet buffers - -- were already enqueued to the (read-only) rx and completions rings. - -- Otherwise, we might cause memory corruption by double-freeing packets. - -- - -- We can not however reliably inspect the kernel's internal read cursors - -- for the tx and fill rings. Instead we solve this with a *hack* based on - -- the assumptions that 1) the kernel does not modify the rings after - -- closing the XDP socket; 2) the kernel moves packets from fill to rx rings - -- and tx to completion rings *in-order*; 3) the kernel does not clobber - -- descriptors that have not yet moved to an rx or completion ring. - -- - -- First we flush the rx and completion rings, freeing any dequeued packets, - -- while updating the rxq and txq tallies (see XDP:create_xsk()). - while not empty(self.rx) do - packet.free_internal(receive(self.rx)) - self.rxq = self.rxq - 1 - end - while not empty(self.cr) do - packet.free_internal(reclaim(self.cr)) - self.txq = self.txq - 1 - end - -- Then, we use the final rxq/txq tallies to infer how many packets on the - -- transmit and fill rings are left dangling, and free those amounts of - -- packets (starting from the most recently enqueued, going backwards) from - -- each ring individually. 
- for _ = 1, self.txq do - packet.free_internal(rewind_transmit(self.tx)) - end - for _ = 1, self.rxq do - packet.free_internal(rewind_fill(self.fr)) - end - -- Unmap rings. - assert(S.munmap(self.rx.map, self.rx.maplen)) - assert(S.munmap(self.tx.map, self.tx.maplen)) - assert(S.munmap(self.fr.map, self.fr.maplen)) - assert(S.munmap(self.cr.map, self.cr.maplen)) - -- Close interface lockfd. See XDP:open_interface(). - self.lockfd:close() + -- XXX - previous shutdown sequence was broken (see git history for details.) + error("Can not stop XDP driver (operation not supported)") end function XDP:pull () @@ -742,14 +701,14 @@ function selftest () end snabb_enable_xdp() engine.report_load() - print("test: rxtx") - selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) - print("test: duplex") - selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) if nqueues == 1 then print("test: rxtx_match") selftest_rxtx_match(xdpdeva, xdpmaca, xdpdevb, xdpmacb) end + print("test: rxtx") + selftest_rxtx(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) + print("test: duplex") + selftest_duplex(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) if nqueues > 1 then print("test: share_interface") selftest_share_interface(xdpdeva, xdpmaca, xdpdevb, xdpmacb, nqueues) @@ -890,10 +849,10 @@ function selftest_rxtx_match (xdpdeva, xdpmaca, xdpdevb, xdpmacb) }) config.app(c, "npackets", npackets.Npackets, {npackets=1000}) config.app(c, "match", match.Match) - config.app(c, xdpdeva, XDP, {ifname=xdpdeva}) - config.app(c, xdpdevb, XDP, {ifname=xdpdevb}) - config.link(c, "source.output -> "..xdpdeva..".input") - config.link(c, xdpdevb..".output -> match.rx") + config.app(c, xdpdeva.."_q0", XDP, {ifname=xdpdeva}) + config.app(c, xdpdevb.."_q0", XDP, {ifname=xdpdevb}) + config.link(c, "source.output -> "..xdpdeva.."_q0.input") + config.link(c, xdpdevb.."_q0.output -> match.rx") config.link(c, "source.copy -> npackets.input") config.link(c, "npackets.output -> match.comparator") engine.configure(c) 
From 7b0d35bcecdc256049185d7e9cc69869ed2e6bc8 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 13 Jan 2022 12:12:49 +0100 Subject: [PATCH 201/209] lwAFTR 2022.01.13 --- .version | 2 +- src/program/lwaftr/doc/CHANGELOG.md | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/.version b/.version index 73ea2eb7dc..1dd823ea3f 100644 --- a/.version +++ b/.version @@ -1 +1 @@ -2019.06.02 +2022.01.13 diff --git a/src/program/lwaftr/doc/CHANGELOG.md b/src/program/lwaftr/doc/CHANGELOG.md index 1bc6c6a115..8d4c66cfd6 100644 --- a/src/program/lwaftr/doc/CHANGELOG.md +++ b/src/program/lwaftr/doc/CHANGELOG.md @@ -1,5 +1,26 @@ # Change Log +## [2022.01.13] + +### Notable changes + + * Support for XDP, AVF, and Mellanox drivers + + * Restore support for bump-in-the-wire operation + + * New updated lwAFTR YANG schema: `snabb-softwire-v3.yang`. + lwAFTR can now operate on >2 CPU cores + + * Add statistics counters for ICMP, ARP, and NDP + + * Fragmenter/defragmenter can now handle padded packets (bug fix) + + * NDP app now sends correct neighbor advertisements (bug fix) + + * Fix a parsing bug in `lib.yang` where nested default values of leaves were not set + + * Fix a bug in `lib.numa` where it could not gracefully handle the inability to read a CPU performance governor + ## [2019.06.02] ### Notable changes From 1531f37d959913a3980e9451698601eadba3a162 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 13 Jan 2022 11:41:43 +0000 Subject: [PATCH 202/209] apps.ipv4.arp: fix selftest --- src/apps/ipv4/arp.lua | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/apps/ipv4/arp.lua b/src/apps/ipv4/arp.lua index 9a26739868..50383254b2 100644 --- a/src/apps/ipv4/arp.lua +++ b/src/apps/ipv4/arp.lua @@ -293,9 +293,14 @@ end function selftest() print('selftest: arp') - local arp = ARP:new({ self_ip = ipv4:pton('1.2.3.4'), - next_ip = ipv4:pton('5.6.7.8'), - shared_next_mac_key = "foo" }) + local c = 
config.new() + config.app(c, 'arp', ARP, { + self_ip = ipv4:pton('1.2.3.4'), + next_ip = ipv4:pton('5.6.7.8'), + shared_next_mac_key = "foo" + }) + engine.configure(c) + local arp = engine.app_table.arp arp.input = { south=link.new('south in'), north=link.new('north in') } arp.output = { south=link.new('south out'), north=link.new('north out') } From 3f3c40cf08c05cc74bb64a57e6ecc2e6eb57718a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Thu, 13 Jan 2022 12:49:50 +0000 Subject: [PATCH 203/209] lwaftr: update end-to-end test data (for new ICMP, ARP, and NDP counters) --- .../tests/data/counters/arp-for-next-hop.lua | 2 ++ .../data/counters/icmpv6-ping-and-reply.lua | 4 ++++ .../counters/in-1p-ipv4-out-1p-ipv6-echo.lua | 4 ++++ .../in-1p-ipv6-out-1p-ipv4-4-and-echo.lua | 4 ++++ .../ndp-no-na-next-hop6-mac-not-set-2pkts.lua | 2 ++ .../ndp-no-na-next-hop6-mac-not-set-3pkts.lua | 4 ++++ .../tests/data/counters/ndp-ns-for-next-hop.lua | 2 ++ .../tests/data/counters/ndp-secondary.lua | 2 ++ .../lwaftr/tests/data/counters/nofrag4-echo.lua | 10 ++++++++++ .../lwaftr/tests/data/counters/nofrag4.lua | 4 ++++ .../tests/data/counters/nofrag6-no-icmp.lua | 5 +++++ .../lwaftr/tests/data/counters/nofrag6-sol.lua | 4 ++++ .../lwaftr/tests/data/counters/nofrag6.lua | 2 ++ .../lwaftr/tests/data/ndp_outgoing_ns.pcap | Bin 126 -> 126 bytes .../tests/data/vlan/ndp_ns_and_recap.pcap | Bin 256 -> 256 bytes .../lwaftr/tests/data/vlan/ndp_outgoing_ns.pcap | Bin 130 -> 130 bytes src/program/lwaftr/tests/end-to-end/test_env.sh | 8 ++++---- 17 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 src/program/lwaftr/tests/data/counters/nofrag4-echo.lua create mode 100644 src/program/lwaftr/tests/data/counters/nofrag6-no-icmp.lua diff --git a/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua b/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua index c692e9d73e..10d952517d 100644 --- a/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua +++ 
b/src/program/lwaftr/tests/data/counters/arp-for-next-hop.lua @@ -2,4 +2,6 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv4-frag-not"] = 1, + ["out-arp-request-bytes"] = 42, + ["out-arp-request-packets"] = 1, } diff --git a/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua b/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua index 2e6a730d5a..c0e739a163 100644 --- a/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua +++ b/src/program/lwaftr/tests/data/counters/icmpv6-ping-and-reply.lua @@ -3,4 +3,8 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv6-frag-not"] = 1, + ["in-icmpv6-echo-packets"] = 1, + ["in-icmpv6-echo-bytes"] = 74, + ["out-icmpv6-echo-packets"] = 1, + ["out-icmpv6-echo-bytes"] = 74, } diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua index 418eba156c..9e45c62802 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv4-out-1p-ipv6-echo.lua @@ -8,4 +8,8 @@ return { ["out-ipv6-bytes"] = 106, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, + ["in-icmpv4-echo-packets"] = 1, + ["in-icmpv4-echo-bytes"] = 54, + ["out-icmpv4-echo-packets"] = 1, + ["out-icmpv4-echo-bytes"] = 54, } diff --git a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua index b567ab93b3..f9e6b09f02 100644 --- a/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua +++ b/src/program/lwaftr/tests/data/counters/in-1p-ipv6-out-1p-ipv4-4-and-echo.lua @@ -8,4 +8,8 @@ return { ["out-ipv4-frag-not"] = 1, ["out-ipv4-packets"] = 1, ["out-ipv6-frag-not"] = 1, + ["in-icmpv6-echo-packets"] = 
1, + ["in-icmpv6-echo-bytes"] = 74, + ["out-icmpv6-echo-packets"] = 1, + ["out-icmpv6-echo-bytes"] = 74, } diff --git a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua index 0bfb63a5fd..fdffa2d088 100644 --- a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-2pkts.lua @@ -12,4 +12,6 @@ return { ["out-ipv6-bytes"] = 106, ["out-ipv6-frag-not"] = 1, ["out-ipv6-packets"] = 1, + ["out-ndp-ns-packets"] = 1, + ["out-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua index b85c3e5ded..5ae37831cf 100644 --- a/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-no-na-next-hop6-mac-not-set-3pkts.lua @@ -12,4 +12,8 @@ return { ["out-ipv6-bytes"] = 106, ["out-ipv6-frag-not"] = 2, ["out-ipv6-packets"] = 1, + ["out-ndp-ns-packets"] = 1, + ["out-ndp-ns-bytes"] = 86, + ["in-ndp-na-packets"] = 1, + ["in-ndp-na-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua b/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua index 2a8197e9c0..24f4b3e4f8 100644 --- a/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-ns-for-next-hop.lua @@ -2,4 +2,6 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv6-frag-not"] = 1, + ["out-ndp-ns-packets"] = 1, + ["out-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/ndp-secondary.lua b/src/program/lwaftr/tests/data/counters/ndp-secondary.lua index 3ce9e1bc91..21e9924102 100644 --- 
a/src/program/lwaftr/tests/data/counters/ndp-secondary.lua +++ b/src/program/lwaftr/tests/data/counters/ndp-secondary.lua @@ -2,4 +2,6 @@ return { ["in-ipv6-frag-reassembly-unneeded"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, + ["in-ndp-ns-packets"] = 1, + ["in-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/nofrag4-echo.lua b/src/program/lwaftr/tests/data/counters/nofrag4-echo.lua new file mode 100644 index 0000000000..c65654e0f1 --- /dev/null +++ b/src/program/lwaftr/tests/data/counters/nofrag4-echo.lua @@ -0,0 +1,10 @@ +return { + ["in-ipv4-frag-reassembly-unneeded"] = 1, + ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, + ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, + ["out-ipv4-frag-not"] = 1, + ["out-icmpv4-echo-packets"] = 1, + ["out-icmpv4-echo-bytes"] = 54, + ["in-icmpv4-echo-packets"] = 1, + ["in-icmpv4-echo-bytes"] = 54, +} diff --git a/src/program/lwaftr/tests/data/counters/nofrag4.lua b/src/program/lwaftr/tests/data/counters/nofrag4.lua index 6e95815eba..bc266a4c02 100644 --- a/src/program/lwaftr/tests/data/counters/nofrag4.lua +++ b/src/program/lwaftr/tests/data/counters/nofrag4.lua @@ -3,4 +3,8 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv4-frag-not"] = 1, + ["out-arp-reply-packets"] = 1, + ["out-arp-reply-bytes"] = 42, + ["in-arp-request-bytes"] = 42, + ["in-arp-request-packets"] = 1, } diff --git a/src/program/lwaftr/tests/data/counters/nofrag6-no-icmp.lua b/src/program/lwaftr/tests/data/counters/nofrag6-no-icmp.lua new file mode 100644 index 0000000000..3ce9e1bc91 --- /dev/null +++ b/src/program/lwaftr/tests/data/counters/nofrag6-no-icmp.lua @@ -0,0 +1,5 @@ +return { + ["in-ipv6-frag-reassembly-unneeded"] = 1, + ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, + ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, +} diff --git 
a/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua b/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua index 2e6a730d5a..70f8ca17ab 100644 --- a/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua +++ b/src/program/lwaftr/tests/data/counters/nofrag6-sol.lua @@ -3,4 +3,8 @@ return { ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, ["out-ipv6-frag-not"] = 1, + ["out-ndp-na-packets"] = 1, + ["out-ndp-na-bytes"] = 86, + ["in-ndp-ns-packets"] = 1, + ["in-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/counters/nofrag6.lua b/src/program/lwaftr/tests/data/counters/nofrag6.lua index 3ce9e1bc91..21e9924102 100644 --- a/src/program/lwaftr/tests/data/counters/nofrag6.lua +++ b/src/program/lwaftr/tests/data/counters/nofrag6.lua @@ -2,4 +2,6 @@ return { ["in-ipv6-frag-reassembly-unneeded"] = 1, ["memuse-ipv4-frag-reassembly-buffer"] = 728203264, ["memuse-ipv6-frag-reassembly-buffer"] = 11378176, + ["in-ndp-ns-packets"] = 1, + ["in-ndp-ns-bytes"] = 86, } diff --git a/src/program/lwaftr/tests/data/ndp_outgoing_ns.pcap b/src/program/lwaftr/tests/data/ndp_outgoing_ns.pcap index 1169e042cedb538715fea34f134feff56b4ce800..871ce745b7a0557c67b3cf6108c6a97fe6f17c1e 100644 GIT binary patch delta 63 zcmb=co1npFZ2X^zfpMaqf&q|*0Y;EmJA=?FumA@GCj%D)HvZ@0C8mq AcK`qY delta 63 wcmb=co1nq=9}FhyDHt$tFmN((F>o`mFfcPPF)+3>2sAN(0Gc=>qY@YZ032xx3jhEB diff --git a/src/program/lwaftr/tests/data/vlan/ndp_ns_and_recap.pcap b/src/program/lwaftr/tests/data/vlan/ndp_ns_and_recap.pcap index e8fbd3971fd2925a30f296220fbf15d0f200aded..6d75e56a8d7a07d1f5df4a8af648d89bf13a9aa6 100644 GIT binary patch delta 40 lcmZo*YG9h6!DejypNWBSqMnKfkcI(9kXSo|(5i`PvjDTn2>Jj3 delta 40 scmZo*YG9h6!S){vChDn(FmNz%GH@|)Gq5l)GcYkQwlfGcO-!2w03KNjjsO4v diff --git a/src/program/lwaftr/tests/data/vlan/ndp_outgoing_ns.pcap b/src/program/lwaftr/tests/data/vlan/ndp_outgoing_ns.pcap index 
a829f9b45728c3c1826f2135c4993d5ab7d97388..6320e70cbb47b635f62c1c757365df512b1e29b1 100644 GIT binary patch delta 65 zcmZo-Y+{_C!DejypNWBSqMnKYkcI(9kXSo|&?>M12LmSq7Xvo~3j;F)P@Iub2@C*s CCkTZA delta 65 ycmZo-Y+{_C!S){vChDmeFmNz%GH@|)Gq5l)GcYkQwlfGcF@OM?I3uGH7ytkvD+?I_ diff --git a/src/program/lwaftr/tests/end-to-end/test_env.sh b/src/program/lwaftr/tests/end-to-end/test_env.sh index 632f536a11..5204ac2f71 100755 --- a/src/program/lwaftr/tests/end-to-end/test_env.sh +++ b/src/program/lwaftr/tests/end-to-end/test_env.sh @@ -365,7 +365,7 @@ TEST_DATA=( "ingress-filter: from-b4 (IPv6) packet found in binding table (DROP)" "no_icmp_with_filters_drop.conf" "" "tcp-fromb4-ipv6.pcap" "" "" -"nofrag6.lua" +"nofrag6-no-icmp.lua" # Egress filters @@ -375,7 +375,7 @@ TEST_DATA=( "egress-filter: to-internet (IPv4) (DROP)" "no_icmp_with_filters_drop.conf" "" "tcp-fromb4-ipv6.pcap" "" "" -"nofrag6.lua" +"nofrag6-no-icmp.lua" "egress-filter: to-b4 (IPv4) (ACCEPT)" "no_icmp_with_filters_accept.conf" "tcp-frominet-trafficclass.pcap" "" "" "tcp-afteraftr-ipv6-trafficclass.pcap" @@ -389,11 +389,11 @@ TEST_DATA=( "ICMP Echo to AFTR (IPv4)" "no_icmp.conf" "ping-v4.pcap" "" "ping-v4-reply.pcap" "" -"nofrag4.lua" +"nofrag4-echo.lua" "ICMP Echo to AFTR (IPv4) (ttl=32)" "no_icmp.conf" "ping-v4-ttl-32.pcap" "" "ping-v4-reply.pcap" "" -"nofrag4.lua" +"nofrag4-echo.lua" "ICMP Echo to AFTR (IPv4) + data" "no_icmp.conf" "ping-v4-and-data.pcap" "" "ping-v4-reply.pcap" "tcp-afteraftr-ipv6.pcap" From 247ef95c15225553294bff67a821ad07ab1f5f21 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 18 Jan 2022 13:39:14 +0000 Subject: [PATCH 204/209] core.packet: revert globalization of newly exported variables and functions Recent changes ([1], [2], [3]) exposed some previously internal variables and functions from core.packet. This patch restores local bindings for those within core.packet in order to maintain performance. 
[1] 61745637fb2bc8b7c4e5fce88ac3be9c8f577156 (#1450) [2] ee4e42d23fcdf60c38c31e3a6a53ce19da8c5c70 (#1450) [3] 9f0694fb9a50e5302928ebaa33a1fe468ae15dfc (#1288) --- src/core/packet.lua | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/core/packet.lua b/src/core/packet.lua index c2adb7b9d3..ca71a3e547 100644 --- a/src/core/packet.lua +++ b/src/core/packet.lua @@ -30,6 +30,10 @@ default_headroom = 256 -- things aligned at least this much. minimum_alignment = 2 +-- Copy read-only constants to locals +local max_payload, packet_alignment, default_headroom, minimum_alignment = + max_payload, packet_alignment, default_headroom, minimum_alignment + local function get_alignment (addr, alignment) -- Precondition: alignment is a power of 2. return bit.band(addr, alignment - 1) @@ -273,6 +277,8 @@ function account_free (p) counter.add(engine.freebits, (12 + 8 + math.max(p.length, 60) + 4) * 8) end +local free_internal, account_free = + free_internal, account_free function free (p) account_free(p) free_internal(p) From 5f045e7d6161a5564e415b40c07ba5ab910377cf Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Tue, 18 Jan 2022 18:23:21 +0000 Subject: [PATCH 205/209] core.app: setvmprofile unconditionally lwAFTR introduced a way to run applications with vmprofile disabled, in a way that branched on engine.vmprofile_enabled on the fast path. This patch cleans this up to only disable the profiler timer, and run setvmprofile unconditionally in both cases. Also removes the duplicate/unused/ineffectual profile option from lib.ptree.worker (profiling can be disabled via lib.scheduling). 
--- src/core/app.lua | 16 +++++++--------- src/core/main.lua | 12 ++++++------ src/lib/ptree/worker.lua | 1 - src/lib/scheduling.lua | 4 ++-- 4 files changed, 15 insertions(+), 18 deletions(-) diff --git a/src/core/app.lua b/src/core/app.lua index c9ab6c8292..f4059d6d30 100644 --- a/src/core/app.lua +++ b/src/core/app.lua @@ -95,19 +95,17 @@ local function getvmprofile (name) end function setvmprofile (name) - if vmprofile_enabled then - C.vmprofile_set_profile(getvmprofile(name)) - end + C.vmprofile_set_profile(getvmprofile(name)) end function clearvmprofiles () + jit.vmprofile.stop() + for name, profile in pairs(vmprofiles) do + shm.unmap(profile) + shm.unlink("vmprofile/"..name..".vmprofile") + vmprofiles[name] = nil + end if vmprofile_enabled then - jit.vmprofile.stop() - for name, profile in pairs(vmprofiles) do - shm.unmap(profile) - shm.unlink("vmprofile/"..name..".vmprofile") - vmprofiles[name] = nil - end jit.vmprofile.start() end end diff --git a/src/core/main.lua b/src/core/main.lua index 1626492e0b..433fa3c09a 100644 --- a/src/core/main.lua +++ b/src/core/main.lua @@ -8,7 +8,7 @@ package.path = '' local STP = require("lib.lua.StackTracePlus") local ffi = require("ffi") -local vmprofile = require("jit.vmprofile") +local jit = require("jit") local lib = require("core.lib") local shm = require("core.shm") local C = ffi.C @@ -47,7 +47,10 @@ function main () error("fatal: "..ffi.os.."/"..ffi.arch.." 
is not a supported platform\n") end initialize() - vmprofile.start() + -- Setup audit.log, vmprofile + engine.enable_auditlog() + engine.setvmprofile("program") + jit.vmprofile.start() if lib.getenv("SNABB_PROGRAM_LUACODE") then -- Run the given Lua code instead of the command-line local expr = lib.getenv("SNABB_PROGRAM_LUACODE") @@ -67,7 +70,7 @@ function main () require(modulename(program)).run(args) end end - vmprofile.stop() + jit.vmprofile.stop() end -- Take the program name from the first argument, unless the first @@ -162,9 +165,6 @@ function initialize () _G.packet = require("core.packet"); _G.packet.initialize() _G.timer = require("core.timer") _G.main = getfenv() - -- Setup audit.log, vmprofile - engine.enable_auditlog() - engine.setvmprofile("program") end function handler (reason) diff --git a/src/lib/ptree/worker.lua b/src/lib/ptree/worker.lua index 3925fc642d..8d2128a8f6 100644 --- a/src/lib/ptree/worker.lua +++ b/src/lib/ptree/worker.lua @@ -47,7 +47,6 @@ function new_worker (conf) if conf.measure_memory then timer.activate(memory_info.HeapSizeMonitor.new():timer()) end - engine.vmprofile_enabled = conf.profile return ret end diff --git a/src/lib/scheduling.lua b/src/lib/scheduling.lua index 6607c8aa9c..425f6147e6 100644 --- a/src/lib/scheduling.lua +++ b/src/lib/scheduling.lua @@ -50,8 +50,8 @@ end function sched_apply.profile (profile) engine.vmprofile_enabled = profile - local vmprofile = require('jit.vmprofile') - if profile then vmprofile.start() else vmprofile.stop() end + local jit = require('jit') + if profile then jit.vmprofile.start() else jit.vmprofile.stop() end end function sched_apply.eval (str) From 52048a93add8caafad1c44eb0866ab53952f470a Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 19 Jan 2022 13:04:23 +0100 Subject: [PATCH 206/209] Fixup previous commit: remove unused profile option in lib.ptree.worker --- src/lib/ptree/worker.lua | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib/ptree/worker.lua 
b/src/lib/ptree/worker.lua index 8d2128a8f6..f188807802 100644 --- a/src/lib/ptree/worker.lua +++ b/src/lib/ptree/worker.lua @@ -21,7 +21,6 @@ local worker_config_spec = { duration = {}, measure_latency = {default=true}, measure_memory = {default=true}, - profile = {default=true}, no_report = {default=false}, report = {default={showapps=true,showlinks=true}}, Hz = {default=1000}, From 862af2cf7f043b2c6883424105ab9884e9c2eaff Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 19 Jan 2022 13:03:56 +0000 Subject: [PATCH 207/209] Fix regression in config-migrations --- .../migrate_configuration/migrate_configuration.lua | 11 +++++++---- .../lwaftr/tests/config-migrations/selftest.sh | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua index 8afdba5cf6..8723bd877d 100644 --- a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua +++ b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua @@ -442,7 +442,7 @@ local function multiprocess_migration(src, conf_file) -- We should build up a hybrid schema from parts of v1 and v2. 
local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") - local hybridscm = yang.load_schema_by_name("snabb-softwire-v3") + local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") local v1_external = v1_schema.body["softwire-config"].body["external-interface"] local v1_internal = v1_schema.body["softwire-config"].body["internal-interface"] local external = hybridscm.body["softwire-config"].body["external-interface"] @@ -521,7 +521,7 @@ local function multiprocess_migration(src, conf_file) conf.softwire_config.external_interface.next_hop = nil conf.softwire_config.external_interface.vlan_tag = nil - return config_to_string('snabb-softwire-v3', conf) + return config_to_string('snabb-softwire-v2', conf) end local function v2_migration(src, conf_file) @@ -529,7 +529,7 @@ local function v2_migration(src, conf_file) -- switch over to v2 of snabb-softwire config. local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") local v1_binding_table = v1_schema.body["softwire-config"].body["binding-table"] - local hybridscm = yang.load_schema_by_name("snabb-softwire-v3") + local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") local binding_table = hybridscm.body["softwire-config"].body["binding-table"] -- Add the schema from v1 that we need to convert them. 
@@ -547,6 +547,9 @@ local function v2_migration(src, conf_file) -- Remove the mandatory requirement on softwire.br-address for the migration binding_table.body["softwire"].body["br-address"].mandatory = false + -- Remove the mandatory requirement on softwire.port-set.psid-length for the migration + binding_table.body["softwire"].body["port-set"].body["psid-length"].mandatory = false + local conf = yang.load_config_for_schema( hybridscm, mem.open_input_string(src, conf_file)) @@ -596,7 +599,7 @@ local migrations = { {version='3.0.1', migrator=migrate_3_0_1}, {version='3.0.1.1', migrator=migrate_3_0_1bis}, {version='3.2.0', migrator=migrate_3_2_0}, - {version='2017.07.01',migrator=migrate_2017_07_01} + {version='2017.07.01',migrator=migrate_2017_07_01}, } diff --git a/src/program/lwaftr/tests/config-migrations/selftest.sh b/src/program/lwaftr/tests/config-migrations/selftest.sh index d854728fca..4ac6f1f575 100755 --- a/src/program/lwaftr/tests/config-migrations/selftest.sh +++ b/src/program/lwaftr/tests/config-migrations/selftest.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/env bash # Attempt to migration from legacy to latest LEGACY_OUT=`./snabb lwaftr migrate-configuration -f legacy \ @@ -19,4 +19,4 @@ if [[ "$?" 
-ne "0" ]]; then echo "3.2.0 configuration migration failed (status code != 0)" echo "$V320_OUT" exit 1 -fi \ No newline at end of file +fi From 992a8451148fe68ad7f2ef88cb686da65cab8cb7 Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 19 Jan 2022 14:43:32 +0000 Subject: [PATCH 208/209] lwaftr migrate-configuration: support snabb-softwire-v2, fix bugs --- .../migrate_configuration.lua | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua index 8723bd877d..a960c936c8 100644 --- a/src/program/lwaftr/migrate_configuration/migrate_configuration.lua +++ b/src/program/lwaftr/migrate_configuration/migrate_configuration.lua @@ -12,6 +12,7 @@ local yang = require('lib.yang.yang') local binding_table = require("apps.lwaftr.binding_table") local Parser = require("program.lwaftr.migrate_configuration.conf_parser").Parser local data = require('lib.yang.data') +local schema = require('lib.yang.schema') local br_address_t = ffi.typeof('uint8_t[16]') local SOFTWIRE_TABLE_LOAD_FACTOR = 0.4 @@ -436,13 +437,38 @@ local function remove_psid_map(conf) return conf end +local function v3_migration(src, conf_file) + local v2_schema = yang.load_schema_by_name("snabb-softwire-v2") + local v3_schema = yang.load_schema_by_name("snabb-softwire-v3") + local conf = yang.load_config_for_schema( + v2_schema, mem.open_input_string(src, conf_file)) + + -- Move leaf external-interface/device up as external-device. + for device, instance in pairs(conf.softwire_config.instance) do + for id, queue in pairs(instance.queue) do + if queue.external_interface.device then + if instance.external_device then + io.stderr:write('Multiple external devices detected; '.. 
+ 'manual verification needed.\n') + io.stderr:flush() + end + instance.external_device = queue.external_interface.device + queue.external_interface.device = nil + end + end + end + + return config_to_string(v3_schema, conf) +end + local function multiprocess_migration(src, conf_file) local device = "IPv6 PCI Address" local ex_device = "IPv4 PCI address" -- We should build up a hybrid schema from parts of v1 and v2. local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") - local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") + -- Make sure we load a fresh schema, as not to mutate a memoized copy + local hybridscm = schema.load_schema(schema.load_schema_source_by_name("snabb-softwire-v2")) local v1_external = v1_schema.body["softwire-config"].body["external-interface"] local v1_internal = v1_schema.body["softwire-config"].body["internal-interface"] local external = hybridscm.body["softwire-config"].body["external-interface"] @@ -473,9 +499,9 @@ local function multiprocess_migration(src, conf_file) -- Build up the instance list local instance = { - [device] = {queue = cltable.new({ key_type = queue_key }),}, + [device] = {queue={}}, } - local key = ffi.new(queue_key, 0) + local key = 0 local value = { external_interface = { device = ex_device, @@ -508,7 +534,7 @@ local function multiprocess_migration(src, conf_file) else error("One or both of next-hop values must be provided.") end - cltable.set(instance[device].queue, key, value) + instance[device].queue[key] = value conf.softwire_config.instance = instance -- Remove the fields which no longer should exist @@ -521,7 +547,7 @@ local function multiprocess_migration(src, conf_file) conf.softwire_config.external_interface.next_hop = nil conf.softwire_config.external_interface.vlan_tag = nil - return config_to_string('snabb-softwire-v2', conf) + return config_to_string(hybridscm, conf) end local function v2_migration(src, conf_file) @@ -529,7 +555,9 @@ local function v2_migration(src, conf_file) -- 
switch over to v2 of snabb-softwire config. local v1_schema = yang.load_schema_by_name("snabb-softwire-v1") local v1_binding_table = v1_schema.body["softwire-config"].body["binding-table"] - local hybridscm = yang.load_schema_by_name("snabb-softwire-v2") + + -- Make sure we load a fresh schema, as not to mutate a memoized copy + local hybridscm = schema.load_schema(schema.load_schema_source_by_name("snabb-softwire-v2")) local binding_table = hybridscm.body["softwire-config"].body["binding-table"] -- Add the schema from v1 that we need to convert them. @@ -593,6 +621,10 @@ local function migrate_2017_07_01(conf_file, src) return multiprocess_migration(src, conf_file) end +local function migrate_2022_01_19(conf_file, src) + return v3_migration(src, conf_file) +end + local migrations = { {version='legacy', migrator=migrate_legacy}, @@ -600,6 +632,7 @@ local migrations = { {version='3.0.1.1', migrator=migrate_3_0_1bis}, {version='3.2.0', migrator=migrate_3_2_0}, {version='2017.07.01',migrator=migrate_2017_07_01}, + {version='2022.01.19',migrator=migrate_2022_01_19}, } @@ -620,6 +653,7 @@ function run(args) local conf = io.open(conf_file, "r"):read("*a") for _, migration in next,migrations,start do + io.stderr:write(("-> %s migration\n"):format(migration.version)) conf = migration.migrator(conf_file, conf) -- Prompt the garbage collection to do a full collect after each migration collectgarbage() From 6bd9e7aa62971661d0103089210c39830a40ca5c Mon Sep 17 00:00:00 2001 From: Max Rottenkolber Date: Wed, 19 Jan 2022 15:46:10 +0100 Subject: [PATCH 209/209] snabbnfv: do not load ingress_drop_monitor --- src/program/snabbnfv/traffic/traffic.lua | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/program/snabbnfv/traffic/traffic.lua b/src/program/snabbnfv/traffic/traffic.lua index 12ea0f7bf3..883c6f360c 100644 --- a/src/program/snabbnfv/traffic/traffic.lua +++ b/src/program/snabbnfv/traffic/traffic.lua @@ -9,7 +9,6 @@ local ffi = require("ffi") local C = ffi.C local timer = 
require("core.timer") local pci = require("lib.hardware.pci") -local ingress_drop_monitor = require("lib.timers.ingress_drop_monitor") local counter = require("core.counter") local long_opts = { @@ -91,7 +90,6 @@ function traffic (pciaddr, confpath, sockpath) timer.activate(timer.new("reconf", check_for_reconfigure, 1e9, 'repeating')) -- Flush logs every second. timer.activate(timer.new("flush", io.flush, 1e9, 'repeating')) - timer.activate(ingress_drop_monitor.new({action='warn'}):timer()) while true do needs_reconfigure = false print("Loading " .. confpath)