diff --git a/tests_e2e/tests/lib/agent_log.py b/tests_e2e/tests/lib/agent_log.py index b5c4885209..452c5552a0 100644 --- a/tests_e2e/tests/lib/agent_log.py +++ b/tests_e2e/tests/lib/agent_log.py @@ -64,6 +64,17 @@ def from_dictionary(dictionary: Dict[str, str]): @property def timestamp(self) -> datetime: + # Extension logs may follow different timestamp formats + # 2023/07/10 20:50:13.459260 + ext_timestamp_regex_1 = r"\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}[.\d]+" + # 2023/07/10 20:50:13 + ext_timestamp_regex_2 = r"\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}" + + if re.match(ext_timestamp_regex_1, self.when): + return datetime.strptime(self.when, u'%Y/%m/%d %H:%M:%S.%f') + elif re.match(ext_timestamp_regex_2, self.when): + return datetime.strptime(self.when, u'%Y/%m/%d %H:%M:%S') + # Logs from agent follow this format: 2023-07-10T20:50:13.038599Z return datetime.strptime(self.when, u'%Y-%m-%dT%H:%M:%S.%fZ') diff --git a/tests_e2e/tests/scripts/agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py b/tests_e2e/tests/scripts/agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py index 7e75c87ef7..0b492d8153 100755 --- a/tests_e2e/tests/scripts/agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py +++ b/tests_e2e/tests/scripts/agent_ext_workflow-validate_no_lag_between_agent_start_and_gs_processing.py @@ -21,6 +21,7 @@ from datetime import timedelta import re import sys +import time from pathlib import Path from tests_e2e.tests.lib.agent_log import AgentLog @@ -28,66 +29,86 @@ def main(): success = True - - agent_started_time = [] - agent_msg = [] - time_diff_max_secs = 15 - - # Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent - agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)" - gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P[a-z_\d]{13,14})\s(?P\d+)\sms\]" - - verified_atleast_one_log_line = False - verified_atleast_one_agent_started_log_line = False - verified_atleast_one_gs_complete_log_line = False - - agent_log = AgentLog(Path('/var/log/waagent.log')) - - try: - for agent_record in agent_log.read(): - verified_atleast_one_log_line = True - - agent_started = re.match(agent_started_regex, agent_record.message) - verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started - if agent_started: - agent_started_time.append(agent_record.timestamp) - agent_msg.append(agent_record.text) - - gs_complete = re.match(gs_completed_regex, agent_record.message) - verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete - if agent_started_time and gs_complete: - duration = gs_complete.group('duration') - diff = agent_record.timestamp - agent_started_time.pop() - # Reduce the duration it took to complete the Goalstate, essentially we should only care about how long - # the agent took after start/restart to start processing GS - diff -= timedelta(milliseconds=int(duration)) - agent_msg_line = agent_msg.pop() - if diff.seconds > time_diff_max_secs: - success = False - print("Found delay between agent start and GoalState Processing > {0}secs: " - "Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text)) - - except IOError as e: - print("Unable to validate no lag time: {0}".format(str(e))) - - if not verified_atleast_one_log_line: - success = False - print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex") - - if not verified_atleast_one_agent_started_log_line: - success = False - print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}" - .format(agent_started_regex)) - - if not verified_atleast_one_gs_complete_log_line: - success = False - print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}" - .format(gs_completed_regex)) - - if agent_started_time or agent_msg: - success = False - print("Mismatch between number of agent start messages and number of GoalState Processing messages\n " - "Agent Start Messages: \n {0}".format('\n'.join(agent_msg))) + needs_retry = True + retry = 3 + + while retry >= 0 and needs_retry: + success = True + needs_retry = False + + agent_started_time = [] + agent_msg = [] + time_diff_max_secs = 15 + last_agent_log_timestamp = None + + # Example: Agent WALinuxAgent-2.2.47.2 is running as the goal state agent + agent_started_regex = r"Azure Linux Agent \(Goal State Agent version [0-9.]+\)" + gs_completed_regex = r"ProcessExtensionsGoalState completed\s\[(?P[a-z_\d]{13,14})\s(?P\d+)\sms\]" + + verified_atleast_one_log_line = False + verified_atleast_one_agent_started_log_line = False + verified_atleast_one_gs_complete_log_line = False + + agent_log = AgentLog(Path('/var/log/waagent.log')) + + try: + for agent_record in agent_log.read(): + last_agent_log_timestamp = agent_record.timestamp + verified_atleast_one_log_line = True + + agent_started = re.match(agent_started_regex, agent_record.message) + verified_atleast_one_agent_started_log_line = verified_atleast_one_agent_started_log_line or agent_started + if agent_started: + agent_started_time.append(agent_record.timestamp) + agent_msg.append(agent_record.text) + + gs_complete = re.match(gs_completed_regex, agent_record.message) + verified_atleast_one_gs_complete_log_line = verified_atleast_one_gs_complete_log_line or gs_complete + if agent_started_time and gs_complete: + duration = gs_complete.group('duration') + diff = agent_record.timestamp - agent_started_time.pop() + # Reduce the duration it took to complete the Goalstate, essentially we should only care about how long + # the agent took after start/restart to start processing GS + diff -= timedelta(milliseconds=int(duration)) + agent_msg_line = agent_msg.pop() + if diff.seconds > time_diff_max_secs: + success = False + print("Found delay between agent start and GoalState Processing > {0}secs: " + "Messages: \n {1} {2}".format(time_diff_max_secs, agent_msg_line, agent_record.text)) + + except IOError as e: + print("Unable to validate no lag time: {0}".format(str(e))) + + if not verified_atleast_one_log_line: + success = False + print("Didn't parse a single log line, ensure the log_parser is working fine and verify log regex") + + if not verified_atleast_one_agent_started_log_line: + success = False + print("Didn't parse a single agent started log line, ensure the Regex is working fine: {0}" + .format(agent_started_regex)) + + if not verified_atleast_one_gs_complete_log_line: + success = False + print("Didn't parse a single GS completed log line, ensure the Regex is working fine: {0}" + .format(gs_completed_regex)) + + if agent_started_time or agent_msg: + # If agent_started_time or agent_msg is not empty, there is a mismatch in the number of agent start messages + # and GoalState Processing messages + # If another check hasn't already failed, and the last parsed log is less than 15 seconds after the + # mismatched agent start log, we should retry after sleeping for 5s to give the agent time to finish + # GoalState processing + if success and last_agent_log_timestamp < (agent_started_time[-1] + timedelta(seconds=15)): + needs_retry = True + print("Sleeping for 5 seconds to allow goal state processing to complete...") + time.sleep(5) + else: + success = False + print("Mismatch between number of agent start messages and number of GoalState Processing messages\n " + "Agent Start Messages: \n {0}".format('\n'.join(agent_msg))) + + retry -= 1 sys.exit(0 if success else 1)