pytorch

Форк
0
/
timeout_guard.py 
113 строк · 3.9 Кб
1
## @package timeout_guard
2
# Module caffe2.python.timeout_guard
3

4

5

6

7

8
import contextlib
9
import threading
10
import os
11
import time
12
import signal
13
import logging
14

15

16
'''
17
Sometimes CUDA devices can get stuck, 'deadlock'. In this case it is often
18
better just to kill the process automatically. Use this guard to set a
19
maximum timespan for a python call, such as RunNet(). If it does not complete
20
in time, process is killed.
21

22
Example usage:
23
    with timeout_guard.CompleteInTimeOrDie(10.0):
24
        core.RunNet(...)
25
'''
26

27

28
class WatcherThread(threading.Thread):
    '''
    Watchdog thread that kills the current process when a deadline is missed.

    After start(), the thread waits on `condition` until either `completed`
    becomes True or `timeout_secs` seconds have elapsed. If the deadline
    passes first it logs an error, dumps the stack of every thread, sends
    SIGINT to its own process, and arms a second thread that calls
    os.abort() 10 seconds later.

    To disarm the watcher, set `completed = True` and notify `condition`.
    '''

    def __init__(self, timeout_secs):
        '''
        Args:
            timeout_secs: seconds to wait for `completed` before killing
                the process.
        '''
        threading.Thread.__init__(self)
        self.timeout_secs = timeout_secs
        # Flipped to True by the guarded code to disarm the watcher.
        self.completed = False
        # Lets the guarded code wake the watcher before the deadline.
        self.condition = threading.Condition()
        # Daemon thread: an armed watcher does not block interpreter exit.
        self.daemon = True
        # Thread that created the watcher; its stack is dumped before abort.
        self.caller_thread = threading.current_thread()

    def _dump_stacks(self, caller_only):
        '''
        Format the current Python stacks of running threads as one string.

        Args:
            caller_only: when true, include only the thread that created
                this watcher; otherwise include every thread.
        '''
        # Imported locally: these modules are only needed on the kill path.
        import sys
        import traceback
        lines = []
        for thread_id, frame in sys._current_frames().items():
            if caller_only and thread_id != self.caller_thread.ident:
                continue
            lines.append("\n# ThreadID: %s" % thread_id)
            for filename, lineno, name, line in traceback.extract_stack(frame):
                lines.append('File: "%s", line %d, in %s' % (filename, lineno, name))
                if line:
                    lines.append("  %s" % (line.strip()))
        return "\n".join(lines)

    def run(self):
        '''Wait for completion or the deadline; kill the process on timeout.'''
        started = time.time()
        # `with` guarantees the lock is released even if the wait raises.
        with self.condition:
            while time.time() - started < self.timeout_secs and not self.completed:
                self.condition.wait(self.timeout_secs - (time.time() - started))
        if self.completed:
            return

        log = logging.getLogger("timeout_guard")
        log.error("Call did not finish in time. Timeout:{}s PID: {}".format(
            self.timeout_secs,
            os.getpid(),
        ))

        # First try dying cleanly, but in 10 secs, exit properly
        def forcequit():
            time.sleep(10.0)
            log.info("Prepared output, dumping threads. ")
            print("Caller thread was: {}".format(self.caller_thread))
            print("-----After force------")
            log.info("-----After force------")
            dump = self._dump_stacks(caller_only=True)

            # Log also with logger, as it is common practice to suppress print().
            print(dump)
            log.info(dump)
            log.error("Process did not terminate cleanly in 10 s, forcing")
            os.abort()

        forcet = threading.Thread(target=forcequit, args=())
        forcet.daemon = True
        forcet.start()
        print("Caller thread was: {}".format(self.caller_thread))
        print("-----Before forcing------")
        dump = self._dump_stacks(caller_only=False)

        # Log also with logger, as it is common practice to suppress print().
        print(dump)
        log.info(dump)
        # Ask the process to stop; forcequit() aborts it if that is not enough.
        os.kill(os.getpid(), signal.SIGINT)
94

95

96
@contextlib.contextmanager
def CompleteInTimeOrDie(timeout_secs):
    '''
    Context manager that kills the process if its body runs too long.

    A WatcherThread is started on entry and disarmed on exit. If the body
    has not finished within `timeout_secs` seconds, the watcher terminates
    the process.

    Args:
        timeout_secs: maximum number of seconds the body may take.
    '''
    watcher = WatcherThread(timeout_secs)
    watcher.start()
    try:
        yield
    finally:
        # Disarm even when the body raises: otherwise the watcher stays
        # armed and kills the process once the deadline passes, although
        # the guarded call has already returned control.
        with watcher.condition:
            watcher.completed = True
            watcher.condition.notify()
105

106

107
def EuthanizeIfNecessary(timeout_secs=120):
    '''
    Arm a watchdog for processes that tend to hang while shutting down.

    Starts a WatcherThread that is never marked as completed, so the
    process is killed unless it exits on its own within timeout_secs.
    '''
    WatcherThread(timeout_secs).start()
114

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.