17
Sometimes CUDA devices can get stuck, 'deadlock'. In this case it is often
18
better just to kill the process automatically. Use this guard to set a
19
maximum timespan for a python call, such as RunNet(). If it does not complete
20
in time, the process is killed.
23
with timeout_guard.CompleteInTimeOrDie(10.0):
28
class WatcherThread(threading.Thread):
30
def __init__(self, timeout_secs):
31
threading.Thread.__init__(self)
32
self.timeout_secs = timeout_secs
33
self.completed = False
34
self.condition = threading.Condition()
36
self.caller_thread = threading.current_thread()
40
self.condition.acquire()
41
while time.time() - started < self.timeout_secs and not self.completed:
42
self.condition.wait(self.timeout_secs - (time.time() - started))
43
self.condition.release()
44
if not self.completed:
45
log = logging.getLogger("timeout_guard")
46
log.error("Call did not finish in time. Timeout:{}s PID: {}".format(
54
log.info("Prepared output, dumping threads. ")
55
print("Caller thread was: {}".format(self.caller_thread))
56
print("-----After force------")
57
log.info("-----After force------")
61
for threadId, stack in sys._current_frames().items():
62
if threadId == self.caller_thread.ident:
63
code.append("\n# ThreadID: %s" % threadId)
64
for filename, lineno, name, line in traceback.extract_stack(stack):
65
code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
67
code.append(" %s" % (line.strip()))
70
print("\n".join(code))
71
log.info("\n".join(code))
72
log.error("Process did not terminate cleanly in 10 s, forcing")
75
forcet = threading.Thread(target=forcequit, args=())
78
print("Caller thread was: {}".format(self.caller_thread))
79
print("-----Before forcing------")
83
for threadId, stack in sys._current_frames().items():
84
code.append("\n# ThreadID: %s" % threadId)
85
for filename, lineno, name, line in traceback.extract_stack(stack):
86
code.append('File: "%s", line %d, in %s' % (filename, lineno, name))
88
code.append(" %s" % (line.strip()))
91
print("\n".join(code))
92
log.info("\n".join(code))
93
os.kill(os.getpid(), signal.SIGINT)
96
@contextlib.contextmanager
97
def CompleteInTimeOrDie(timeout_secs):
98
watcher = WatcherThread(timeout_secs)
101
watcher.completed = True
102
watcher.condition.acquire()
103
watcher.condition.notify()
104
watcher.condition.release()
107
def EuthanizeIfNecessary(timeout_secs=120):
109
Call this if you have a problem with the process getting stuck at shutdown.
110
It will kill the process if it does not terminate within timeout_secs seconds.
112
watcher = WatcherThread(timeout_secs)