From 0965f213a20b77bd42e8e44d371d5fe17af2478e Mon Sep 17 00:00:00 2001 From: Yaqiang Zhu Date: Fri, 27 Dec 2024 14:01:01 +0800 Subject: [PATCH] [sanity_check][bgp] Add default route check in sanity for single asic (#16235) What is the motivation for this PR? BGP routes would be setup during add-topo https://github.com/sonic-net/sonic-mgmt/blob/master/ansible/roles/vm_set/tasks/add_topo.yml#L276. But there are some scenarios that route in DUT has been messed up, but bgp sessions are all up, sanity would treat it as healthy and wouldn't take action to recover it. Loopbackv4 address has been replaced, it would cause all kernel routes from bgp miss In some test cases announce or withdraw routes from ptf but fail to recover (i.e. test_stress_routes) Healthy status: admin@sonic:~$ ip route show default default nhid 282 proto bgp src 10.1.0.32 metric 20 nexthop via 10.0.0.57 dev PortChannel101 weight 1 nexthop via 10.0.0.59 dev PortChannel103 weight 1 nexthop via 10.0.0.61 dev PortChannel105 weight 1 nexthop via 10.0.0.63 dev PortChannel106 weight 1 admin@sonic:~$ show ip bgp sum IPv4 Unicast Summary: BGP router identifier 10.1.0.32, local AS number 65100 vrf-id 0 BGP table version 2890 RIB entries 2893, using 648032 bytes of memory Peers 6, using 4451856 KiB of memory Peer groups 4, using 256 bytes of memory Neighbhor V AS MsgRcvd MsgSent TblVer InQ OutQ Up/Down State/PfxRcd NeighborName ----------- --- ----- --------- --------- -------- ----- ------ --------- -------------- -------------- 10.0.0.57 4 65200 763 764 0 0 0 11:46:17 1439 ARISTA01M1 10.0.0.59 4 65200 763 765 0 0 0 11:46:17 1439 ARISTA02M1 10.0.0.61 4 65200 763 765 0 0 0 11:46:17 1439 ARISTA03M1 10.0.0.63 4 65200 763 765 0 0 0 11:46:17 1439 ARISTA04M1 10.0.0.65 4 64001 712 761 0 0 0 11:46:15 2 ARISTA01MX 10.0.0.67 4 64002 712 761 0 0 0 11:46:15 2 ARISTA02MX Total number of neighbors 6 Issue status, no default route, but show ip bgp sum looks good admin@sonic:~$ ip route show default admin@sonic:~$ show ip bgp sum IPv4 Unicast Summary: BGP router identifier 10.1.0.32, local AS number 65100 vrf-id 0 BGP table version 2892 RIB entries 2893, using 648032 bytes of memory Peers 6, using 4451856 KiB of memory Peer groups 4, using 256 bytes of memory Neighbhor V AS MsgRcvd MsgSent TblVer InQ OutQ Up/Down State/PfxRcd NeighborName ----------- --- ----- --------- --------- -------- ----- ------ --------- -------------- -------------- 10.0.0.57 4 65200 764 767 0 0 0 11:47:14 1439 ARISTA01M1 10.0.0.59 4 65200 764 768 0 0 0 11:47:14 1439 ARISTA02M1 10.0.0.61 4 65200 764 768 0 0 0 11:47:14 1439 ARISTA03M1 10.0.0.63 4 65200 764 768 0 0 0 11:47:14 1439 ARISTA04M1 10.0.0.65 4 64001 713 764 0 0 0 11:47:12 2 ARISTA01MX 10.0.0.67 4 64002 713 764 0 0 0 11:47:12 2 ARISTA02MX Total number of neighbors 6 How did you do it? Add default routes check in sanity check, and re-announce routes if issue happen How did you verify/test it? Run sanity check --- tests/common/plugins/sanity_check/checks.py | 43 +++++++++++++++----- tests/common/plugins/sanity_check/recover.py | 17 +++++++- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/tests/common/plugins/sanity_check/checks.py b/tests/common/plugins/sanity_check/checks.py index 8a33df090e4..d18c64bd020 100644 --- a/tests/common/plugins/sanity_check/checks.py +++ b/tests/common/plugins/sanity_check/checks.py @@ -166,6 +166,12 @@ def _check_bgp_on_dut(*args, **kwargs): dut = kwargs['node'] results = kwargs['results'] + def _check_default_route(version, dut): + # Return True if successfully get default route + res = dut.shell("ip {} route show default".format("" if version == 4 else "-6"), + module_ignore_errors=True) + return not res["rc"] and len(res["stdout"].strip()) != 0 + def _check_bgp_status_helper(): asic_check_results = [] bgp_facts = dut.bgp_facts(asic_index='all') @@ -187,23 +193,34 @@ def _check_bgp_status_helper(): if a_asic_neighbors is not None and len(a_asic_neighbors) > 0: down_neighbors = [k for k, v in list(a_asic_neighbors.items()) if v['state'] != 'established'] + asic_key = 'bgp' if dut.facts['num_asic'] == 1 else 'bgp' + str(asic_index) if down_neighbors: - if dut.facts['num_asic'] == 1: - check_result['bgp'] = {'down_neighbors': down_neighbors} - else: - check_result['bgp' + str(asic_index)] = {'down_neighbors': down_neighbors} + check_result[asic_key] = {'down_neighbors': down_neighbors} a_asic_result = True else: a_asic_result = False - if dut.facts['num_asic'] == 1: - if 'bgp' in check_result: - check_result['bgp'].pop('down_neighbors', None) - else: - if 'bgp' + str(asic_index) in check_result: - check_result['bgp' + str(asic_index)].pop('down_neighbors', None) + if asic_key in check_result: + check_result[asic_key].pop('down_neighbors', None) else: a_asic_result = True + # Add default route check for default route missing issue in below scenario: + # 1) Loopbackv4 ip address replace, it would cause all bgp routes missing + # 2) Announce or withdraw routes in some test cases and doesn't recover it + # Chassis and multi_asic is not supported for now + if not dut.is_multi_asic and not (dut.get_facts().get("modular_chassis") is True or + dut.get_facts().get("modular_chassis") == "True"): + if not _check_default_route(4, dut): + if asic_key not in check_result: + check_result[asic_key] = {} + check_result[asic_key]["no_v4_default_route"] = True + a_asic_result = True + if not _check_default_route(6, dut): + if asic_key not in check_result: + check_result[asic_key] = {} + check_result[asic_key]["no_v6_default_route"] = True + a_asic_result = True + asic_check_results.append(a_asic_result) if any(asic_check_results): @@ -243,8 +260,12 @@ def _check_bgp_status_helper(): if 'down_neighbors' in check_result[a_result]: logger.info('BGP neighbors down: %s on bgp instance %s on dut %s' % ( check_result[a_result]['down_neighbors'], a_result, dut.hostname)) + if "no_v4_default_route" in check_result[a_result]: + logger.info('Deafult v4 route for {} {} is missing'.format(dut.hostname, a_result)) + if "no_v6_default_route" in check_result[a_result]: + logger.info('Deafult v6 route for {} {} is missing'.format(dut.hostname, a_result)) else: - logger.info('No BGP neighbors are down on %s' % dut.hostname) + logger.info('No BGP neighbors are down or default route missing on %s' % dut.hostname) mgFacts = dut.get_extended_minigraph_facts(tbinfo) if dut.num_asics() == 1 and tbinfo['topo']['type'] != 't2' and \ diff --git a/tests/common/plugins/sanity_check/recover.py b/tests/common/plugins/sanity_check/recover.py index 33b19d48b40..14fd2ca81fb 100644 --- a/tests/common/plugins/sanity_check/recover.py +++ b/tests/common/plugins/sanity_check/recover.py @@ -147,6 +147,12 @@ def _recover_with_command(dut, cmd, wait_time): wait(wait_time, msg="Wait {} seconds for system to be stable.".format(wait_time)) +def re_announce_routes(localhost, topo_name, ptf_ip): + localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="withdraw", path="../ansible/") + localhost.announce_routes(topo_name=topo_name, ptf_ip=ptf_ip, action="announce", path="../ansible/") + return None + + def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_results, wait_time): outstanding_action = None for result in check_results: @@ -155,7 +161,16 @@ def adaptive_recover(dut, localhost, fanouthosts, nbrhosts, tbinfo, check_result action = _recover_interfaces(dut, fanouthosts, result, wait_time) elif result['check_item'] == 'services': action = _recover_services(dut, result) - elif result['check_item'] == 'bgp' or result['check_item'] == "neighbor_macsec_empty": + elif result['check_item'] == 'bgp': + # If there is only default route missing issue, only need to re-announce routes to recover + if ("no_v4_default_route" in result['bgp'] and len(result['bgp']) == 1 or + "no_v6_default_route" in result['bgp'] and len(result['bgp']) == 1 or + ("no_v4_default_route" in result['bgp'] and "no_v6_default_route" in result['bgp'] and + len(result['bgp']) == 2)): + action = re_announce_routes(localhost, tbinfo["topo"]["name"], tbinfo["ptf_ip"]) + else: + action = neighbor_vm_restore(dut, nbrhosts, tbinfo, result) + elif result['check_item'] == "neighbor_macsec_empty": action = neighbor_vm_restore(dut, nbrhosts, tbinfo, result) elif result['check_item'] in ['processes', 'mux_simulator']: action = 'config_reload'