Skip to content

Commit

Permalink
issue-2500: unstable-process will dump backtrace when terminating pro…
Browse files Browse the repository at this point in the history
…cess is stuck (#2501)
  • Loading branch information
budevg authored Nov 18, 2024
1 parent afefb4a commit fb77d86
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 2 deletions.
19 changes: 18 additions & 1 deletion cloud/storage/core/tools/common/python/daemon.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import requests
import subprocess
import tempfile
import threading
import time
Expand Down Expand Up @@ -87,7 +88,7 @@ def __terminate_process(self):
self.__verify_process()
logger.info("terminating process")
self.__process.terminate()
self.__process.wait(check_exit_code=False)
process_wait_and_check(self.__process, check_timeout=60, check_exit_code=False)
self.__process = None

# Should be guarded by self.__lock.
Expand Down Expand Up @@ -238,3 +239,19 @@ def is_alive(self):
self.__process.process.pid))
return False
return True


def process_wait_and_check(process, check_timeout=60, **kwargs):
    """Wait for *process* to finish, logging a gdb backtrace on every timeout.

    ``process.wait`` is called with ``timeout=check_timeout`` plus any extra
    keyword arguments. Each time that call raises
    ``subprocess.TimeoutExpired`` the timeout is logged, gdb is attached to
    ``process.pid`` (through ``sudo``) to dump the stacks of all threads into
    the log, and the wait is retried. The function keeps retrying until
    ``process.wait`` returns; it never gives up on its own.

    Args:
        process: object exposing ``wait(timeout=..., **kwargs)`` and ``pid``.
        check_timeout: seconds to wait before each backtrace dump.
        **kwargs: forwarded unchanged to ``process.wait``.

    Returns:
        None.
    """
    while True:
        try:
            process.wait(timeout=check_timeout, **kwargs)
        except subprocess.TimeoutExpired:
            pid = process.pid
            logger.info(
                "wait for pid %s timed out after %s seconds", pid, check_timeout
            )
            # Run gdb with an argument list (no shell involved). The dump is
            # best-effort: failing to launch gdb must not abort the wait loop.
            try:
                completed = subprocess.run(
                    [
                        "sudo", "gdb", "--batch",
                        "-p", str(pid),
                        "-ex", "thread apply all bt",
                    ],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.STDOUT,  # keep gdb diagnostics in the log
                    universal_newlines=True,
                    errors="replace",  # undecodable bytes must not raise here
                )
            except OSError as err:
                bt = f"failed to run gdb: {err}"
            else:
                bt = completed.stdout.rstrip("\n")
            logger.info("PID %s: backtrace:\n%s", pid, bt)
        else:
            # The wait returned without timing out: the process has finished.
            return
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import sys
import time

from cloud.storage.core.tools.common.python.daemon import process_wait_and_check

process = None


Expand Down Expand Up @@ -57,6 +59,7 @@ def main():
parser.add_argument('--ping-success-codes', help='', nargs='*')
parser.add_argument('--allow-restart-flag', help='file to look for before restart', type=str, default=None)
parser.add_argument('-v', '--verbose', help='verbose mode', default=0, action='count')
parser.add_argument('--terminate-check-timeout', help='the timeout in seconds between wait attempts for terminated process', type=int, default=60)

args = parser.parse_args()

Expand Down Expand Up @@ -92,7 +95,8 @@ def main():
else:
logging.info(f'terminating process {cmdline}')
process.terminate()
process.wait()
process_wait_and_check(process,
check_timeout=args.terminate_check_timeout)

def start_process():
logging.info(f'starting process {cmdline}')
Expand Down
4 changes: 4 additions & 0 deletions cloud/storage/core/tools/testing/unstable-process/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ PY3_PROGRAM(storage-unstable-process)

PEERDIR(
contrib/python/requests/py3

cloud/storage/core/tools/common/python

library/python/testing/yatest_common
)

PY_SRCS(
Expand Down

0 comments on commit fb77d86

Please sign in to comment.