Skip to content

Commit

Permalink
[CHASSIS][Voq][QoS]Increasing LACP timer for lag ports for broadcom-d…
Browse files Browse the repository at this point in the history
…nx neighbor EOS host (#14469)

Description of PR
Intermittently, the testQosSaiLossyQueue test fails due to a port-channel flap on the broadcom-dnx T2 VoQ chassis.
The port-channel goes down because this test requires disabling TX on the egress port (which is a member of a port-channel).
With the huge buffer size, it takes longer to send the packets. As a result, TX LACP packets stop egressing, so after 3 LACP packets are missed (~90s) the server side tears down the LAG.

Issue # #11682
Summary:
Fixes # (issue)

What is the motivation for this PR?
Intermittently, the testQosSaiLossyQueue test fails due to a port-channel flap.

How did you do it?
The LACP timer multiplier on the EOS host is configurable.
By default, the timeout is 30 seconds with a failure tolerance of 3.
We increased the multiplier so that the LAG stays up until all packets are sent,
and we revert the change after the test case finishes.

How did you verify/test it?
Executed the QoS test cases and verified the results.
  • Loading branch information
ansrajpu-git authored Nov 26, 2024
1 parent aa1d42a commit 399a1b6
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 6 deletions.
18 changes: 18 additions & 0 deletions tests/common/devices/eos.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,3 +556,21 @@ def no_isis_metric(self, interface):
lines=['no isis metric'],
parents=['interface {}'.format(interface)])
return not self._has_cli_cmd_failed(out)

def set_interface_lacp_time_multiplier(self, interface_name, multiplier):
    """Configure ``lacp timer multiplier`` on one interface of this host.

    Args:
        interface_name: Name of the interface to configure.
        multiplier: Integer value written after ``lacp timer multiplier``.

    Returns:
        The result object returned by ``self.eos_config``, unmodified.
    """
    out = self.eos_config(
        lines=['lacp timer multiplier %d' % multiplier],
        # Parent commands are passed as a list of lines.
        parents=['interface {}'.format(interface_name)])

    # Read the result keys with .get() so that a result lacking 'failed' or
    # 'changed' is treated as "not failed" / "not unchanged" instead of
    # raising KeyError.
    if out.get('failed') is True or out.get('changed') is False:
        logging.warning("Unable to set interface [%s] lacp timer multiplier to [%d]", interface_name, multiplier)
    else:
        logging.info("Set interface [%s] lacp timer to [%d]", interface_name, multiplier)
    return out

def no_lacp_time_multiplier(self, interface_name):
    """Remove the ``lacp timer multiplier`` setting from one interface.

    Args:
        interface_name: Name of the interface to reset.

    Returns:
        The result object returned by ``self.eos_config``, unmodified.
    """
    out = self.eos_config(
        lines=['no lacp timer multiplier'],
        parents=['interface {}'.format(interface_name)])

    # Only claim the reset happened when the command did not report failure.
    # An unchanged result is not an error here: the interface may already
    # be at its default.
    if out.get('failed') is True:
        logging.warning('Unable to reset lacp timer to default for interface [%s]', interface_name)
    else:
        logging.info('Reset lacp timer to default for interface [%s]', interface_name)
    return out
47 changes: 47 additions & 0 deletions tests/qos/qos_sai_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from tests.common.system_utils import docker # noqa F401
from tests.common.errors import RunAnsibleModuleFail
from tests.common import config_reload
from tests.common.devices.eos import EosHost

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -2577,3 +2578,49 @@ def isLonglink(self, dut_host):
if cable_length >= 120000:
return True
return False

@pytest.fixture(scope="function", autouse=False)
def change_lag_lacp_timer(self, duthosts, get_src_dst_asic_and_duts, tbinfo, nbrhosts, dutConfig, dutTestParams,
                          request):
    """Temporarily raise the LACP timer multiplier on the neighbor's LAG members.

    The fixture only acts when all of the following hold; otherwise it
    yields immediately and does nothing:
      * the ``--neighbor_type`` option is not "sonic",
      * ``dutTestParams["basicParams"]["platform_asic"]`` is "broadcom-dnx",
      * both the source and destination DUT are multi-asic,
      * the destination test port is a member of a port-channel,
      * the neighbor host object is an ``EosHost``.

    When active, it sets the multiplier on every neighbor interface that
    belongs to the destination port-channel before the test, and removes
    the setting again after the test.
    """
    lacp_multiplier = 600

    if request.config.getoption("--neighbor_type") == "sonic":
        yield
        return

    basic_params = dutTestParams["basicParams"]
    if 'platform_asic' not in basic_params or basic_params["platform_asic"] != "broadcom-dnx":
        yield
        return

    src_dut = get_src_dst_asic_and_duts['src_dut']
    dst_dut = get_src_dst_asic_and_duts['dst_dut']
    if not (src_dut.sonichost.is_multi_asic and dst_dut.sonichost.is_multi_asic):
        yield
        return

    # Find the port-channel that contains the destination test port.
    dst_mgfacts = dst_dut.get_extended_minigraph_facts(tbinfo)
    dst_port_id = dutConfig['testPorts']['dst_port_id']
    dst_interface = dutConfig['dutInterfaces'][dst_port_id]
    lag_name = next(
        (port_ch for port_ch, port_intf in dst_mgfacts['minigraph_portchannels'].items()
         if dst_interface in port_intf['members']),
        None)
    if lag_name is None:
        yield
        return

    # Map each DUT-side LAG member to the neighbor's interface name.
    lag_facts = dst_dut.lag_facts(host=dst_dut.hostname)['ansible_facts']['lag_facts']
    po_interfaces = lag_facts['lags'][lag_name]['po_config']['ports']
    vm_neighbors = dst_mgfacts['minigraph_neighbors']
    neighbor_lag_intfs = [vm_neighbors[po_intf]['port'] for po_intf in po_interfaces]
    # The peer device is looked up through the first LAG member only.
    neigh_intf = next(iter(po_interfaces))
    peer_device = vm_neighbors[neigh_intf]['name']
    vm_host = nbrhosts[peer_device]['host']

    # The LACP helpers are only called on EosHost objects. Skip setup AND
    # teardown for any other neighbor type, so teardown never calls a reset
    # for a change that was never made.
    if not isinstance(vm_host, EosHost):
        yield
        return

    try:
        for neighbor_lag_member in neighbor_lag_intfs:
            logger.info(
                "Changing lacp timer multiplier to %d for %s in %s",
                lacp_multiplier, neighbor_lag_member, peer_device)
            vm_host.set_interface_lacp_time_multiplier(neighbor_lag_member, lacp_multiplier)

        yield
    finally:
        # Runs after the test and also when setup fails part-way, so no
        # member is left with the raised multiplier.
        for neighbor_lag_member in neighbor_lag_intfs:
            logger.info(
                "Changing lacp timer multiplier to default for %s in %s", neighbor_lag_member, peer_device)
            vm_host.no_lacp_time_multiplier(neighbor_lag_member)
12 changes: 6 additions & 6 deletions tests/qos/test_qos_sai.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def ignore_expected_loganalyzer_exception(get_src_dst_asic_and_duts, loganalyzer
# The following error log is related to the bug of https://github.com/sonic-net/sonic-buildimage/issues/13265
".*ERR lldp[0-9]*#lldpmgrd.*Command failed.*lldpcli.*configure.*ports.*unable to connect to socket.*",
".*ERR lldp[0-9]*#lldpmgrd.*Command failed.*lldpcli.*configure.*ports.*lldp.*unknown command from argument"
".*configure.*command was failed.*times, disabling retry.*"
".*configure.*command was failed.*times, disabling retry.*",
# Error related to intermittent syncd socket timeouts
".*ERR syncd[0-9]*#dsserve: _ds2tty broken pipe.*"
]
Expand Down Expand Up @@ -325,7 +325,7 @@ def testParameter(
def testQosSaiPfcXoffLimit(
self, xoffProfile, duthosts, get_src_dst_asic_and_duts,
ptfhost, dutTestParams, dutConfig, dutQosConfig,
ingressLosslessProfile, egressLosslessProfile
ingressLosslessProfile, egressLosslessProfile, change_lag_lacp_timer
):
# NOTE: this test will be skipped for t2 cisco 8800 if it's not xoff_1 or xoff_2
"""
Expand Down Expand Up @@ -1147,7 +1147,7 @@ def testQosSaiBufferPoolWatermark(

def testQosSaiLossyQueue(
self, ptfhost, get_src_dst_asic_and_duts, dutTestParams, dutConfig, dutQosConfig,
ingressLossyProfile, skip_src_dst_different_asic
ingressLossyProfile, skip_src_dst_different_asic, change_lag_lacp_timer
):
"""
Test QoS SAI Lossy queue, shared buffer dynamic allocation
Expand Down Expand Up @@ -1591,7 +1591,7 @@ def testQosSaiDwrr(
@pytest.mark.parametrize("pgProfile", ["wm_pg_shared_lossless", "wm_pg_shared_lossy"])
def testQosSaiPgSharedWatermark(
self, pgProfile, ptfhost, get_src_dst_asic_and_duts, dutTestParams, dutConfig, dutQosConfig,
resetWatermark, _skip_watermark_multi_DUT, skip_src_dst_different_asic
resetWatermark, _skip_watermark_multi_DUT, skip_src_dst_different_asic, change_lag_lacp_timer
):
"""
Test QoS SAI PG shared watermark test for lossless/lossy traffic
Expand Down Expand Up @@ -1683,7 +1683,7 @@ def testQosSaiPgSharedWatermark(

def testQosSaiPgHeadroomWatermark(
self, ptfhost, get_src_dst_asic_and_duts, dutTestParams, dutConfig, dutQosConfig, resetWatermark,
):
change_lag_lacp_timer):
"""
Test QoS SAI PG headroom watermark test
Expand Down Expand Up @@ -1793,7 +1793,7 @@ def testQosSaiPGDrop(
@pytest.mark.parametrize("queueProfile", ["wm_q_shared_lossless", "wm_q_shared_lossy"])
def testQosSaiQSharedWatermark(
self, get_src_dst_asic_and_duts, queueProfile, ptfhost, dutTestParams, dutConfig, dutQosConfig,
resetWatermark, _skip_watermark_multi_DUT, skip_pacific_dst_asic
resetWatermark, _skip_watermark_multi_DUT, skip_pacific_dst_asic, change_lag_lacp_timer
):
"""
Test QoS SAI Queue shared watermark test for lossless/lossy traffic
Expand Down

0 comments on commit 399a1b6

Please sign in to comment.