def process_wait_and_check(process, check_timeout=60, **kwargs):
    """Wait for *process* to exit, logging a gdb backtrace after each timeout.

    ``process.wait(timeout=check_timeout, **kwargs)`` is retried until it
    returns. Each time an attempt raises ``subprocess.TimeoutExpired``, the
    stacks of all threads of the still-running process are dumped to the log
    via gdb and the wait is retried. The number of attempts is unbounded.

    Args:
        process: object exposing ``wait(timeout=..., **kwargs)``. Its pid is
            taken from ``process.pid`` or, when that attribute is absent,
            from ``process.process.pid``.
        check_timeout: seconds to wait in each attempt.
        **kwargs: forwarded unchanged to ``process.wait``.

    Returns:
        Whatever the successful ``process.wait`` call returned.
    """
    while True:
        try:
            # Propagate the result so the wrapper is a drop-in replacement
            # for a direct ``process.wait()`` call.
            return process.wait(timeout=check_timeout, **kwargs)
        except subprocess.TimeoutExpired:
            # NOTE(review): only subprocess.TimeoutExpired is handled here.
            # If a caller passes a wrapper whose wait() signals timeouts with
            # a different exception type, that exception propagates -- confirm
            # against the process types used by callers.

            # Elsewhere in this module the daemon's pid is read as
            # ``process.process.pid``; accept both shapes so the diagnostic
            # path does not fail on a wrapper object without ``pid``.
            pid = getattr(process, "pid", None)
            if pid is None:
                pid = process.process.pid

            logger.info(
                f"wait for pid {pid} timed out after {check_timeout} seconds"
            )
            # getoutput() captures stdout and stderr together and never
            # raises on a non-zero exit status, so a missing or failing gdb
            # shows up in the log instead of aborting the wait loop.
            bt = subprocess.getoutput(
                f'sudo gdb --batch -p {pid} -ex "thread apply all bt"'
            )
            logger.info(f"PID {pid}: backtrace:\n{bt}")
parser.add_argument('--terminate-check-timeout', help='the timeout in seconds between wait attempts for terminated process', type=int, default=60) args = parser.parse_args() @@ -92,7 +95,8 @@ def main(): else: logging.info(f'terminating process {cmdline}') process.terminate() - process.wait() + process_wait_and_check(process, + check_timeout=args.terminate_check_timeout) def start_process(): logging.info(f'starting process {cmdline}') diff --git a/cloud/storage/core/tools/testing/unstable-process/ya.make b/cloud/storage/core/tools/testing/unstable-process/ya.make index 1132149fafd..ef2fa23ffe0 100644 --- a/cloud/storage/core/tools/testing/unstable-process/ya.make +++ b/cloud/storage/core/tools/testing/unstable-process/ya.make @@ -2,6 +2,10 @@ PY3_PROGRAM(storage-unstable-process) PEERDIR( contrib/python/requests/py3 + + cloud/storage/core/tools/common/python + + library/python/testing/yatest_common ) PY_SRCS(