diff --git a/ydb/tests/stability/library/__main__.py b/ydb/tests/stability/library/__main__.py index 4dff560d17b5..3b42e2c4e733 100644 --- a/ydb/tests/stability/library/__main__.py +++ b/ydb/tests/stability/library/__main__.py @@ -7,6 +7,7 @@ import stat import sys import argparse +import re from library.python import resource @@ -59,6 +60,91 @@ def _unpack_resource(self, name): os.chmod(path_to_unpack, st.st_mode | stat.S_IEXEC) return path_to_unpack + def clean_trace(self, traces): + cleaned_lines = [] + for line in traces.split('\n'): + line = re.sub(r' @ 0x[a-fA-F0-9]+', '', line) + # Убираем все до текста ошибки или указателя на строку кода + match_verify = re.search(r'VERIFY|FAIL|signal 11|signal 6|signal 15|uncaught exception', line) + match_code_file_line = re.search(r'\s+(\S+\.cpp:\d+).*', line) + + if match_verify: + cleaned_lines.append(match_verify.group()) + elif match_code_file_line: + cleaned_lines.append(match_code_file_line.group()) + + return "\n".join(cleaned_lines) + + def is_sublist(self, shorter, longer): + """ Check if 'shorter' is a sublist in 'longer' from the start """ + return longer[:len(shorter)] == shorter + + def find_unique_traces_with_counts(self, all_traces): + clean_traces_dict = {} + unique_traces = {} + + for trace in all_traces: + clean_trace = self.clean_trace(trace) + if clean_traces_dict.get(clean_trace): + clean_traces_dict[clean_trace].append(trace) + else: + clean_traces_dict[clean_trace] = [trace] + + clean_traces_dict = dict(sorted(clean_traces_dict.items(), key=lambda item: len(item[0]))) + for trace in clean_traces_dict: + for unique in unique_traces: + if self.is_sublist(trace, unique): + unique_traces[unique] = unique_traces[unique] + clean_traces_dict[trace] + break + elif self.is_sublist(unique, trace): + unique_traces[trace] = unique_traces[unique] + clean_traces_dict[trace] + del unique_traces[unique] + break + if not unique_traces.get(trace): + unique_traces[trace] = clean_traces_dict[trace] + + return dict(sorted(unique_traces.items(), key=lambda item: len(item[1]), reverse=True)) + + def process_lines(self, text): + traces = [] + trace = "" + for host in text: + host = host.split('\n') + for line in host: + if line in ("--", "\n", ""): + traces.append(trace) + trace = "" + else: + trace = trace + line + '\n' + return traces + + def get_all_errors(self): + logging.getLogger().setLevel(logging.WARNING) + all_results = [] + for node in self.kikimr_cluster.nodes.values(): + result = node.ssh_command(""" + ls -ltr /Berkanavt/kikimr*/logs/kikimr* | + awk '{print $NF}' | + while read file; do + case "$file" in + *.txt) cat "$file" ;; + *.gz) zcat "$file" ;; + *) cat "$file" ;; + esac + done | + grep -E 'VERIFY|FAIL|signal 11|signal 6|signal 15|uncaught exception' -A 20 + """, raise_on_error=False) + if result: + all_results.append(result.decode('utf-8')) + all_results = self.process_lines(all_results) + return all_results + + def get_errors(self): + errors = self.get_all_errors() + unique_traces = self.find_unique_traces_with_counts(errors) + for trace in unique_traces: + print(f"Trace (Occurrences: {len(unique_traces[trace])}):\n{trace}\n{'-'*60}") + def perform_checks(self): safety_violations = safety_warden_factory(self.kikimr_cluster, self.ssh_username).list_of_safety_violations() @@ -193,6 +279,7 @@ def parse_args(): type=str, nargs="+", choices=[ + "get_errors", "cleanup", "cleanup_logs", "cleanup_dumps", @@ -222,6 +309,8 @@ def main(): ) for action in args.actions: + if action == "get_errors": + stability_cluster.get_errors() if action == "deploy_ydb": stability_cluster.deploy_ydb() if action == "cleanup":