pytorch

Форк
0
/
monitor.py 
192 строки · 6.1 Кб
1
#!/usr/bin/env python3
2
import datetime
3
import json
4
import signal
5
import sys
6
import time
7
from typing import Any, Dict, List
8

9
import psutil  # type: ignore[import]
10
import pynvml  # type: ignore[import]
11

12
# ROCm does not currently have the rocm_smi module installed to a pythonic location.
13
# Must import from ROCm installation path.
14
# Cannot use the high-level rocm_smi cmdline module due to its use of exit().
15
# Must use the lower-level ctypes wrappers exposed through rsmiBindings.
16
sys.path.append("/opt/rocm/libexec/rocm_smi")
17
try:
18
    from ctypes import byref, c_uint32, c_uint64
19

20
    from rsmiBindings import (  # type: ignore[import]
21
        rocmsmi,
22
        rsmi_process_info_t,
23
        rsmi_status_t,
24
    )
25
except ImportError as e:
26
    pass
27

28

29
def get_processes_running_python_tests() -> List[Any]:
    """Collect every visible process that looks like a Python invocation.

    A process qualifies when its name contains ``"python"`` and it reports a
    non-empty command line. Processes that vanish or refuse inspection while
    being examined are left out.

    Returns:
        The matching ``psutil`` process objects, in iteration order.
    """

    def _looks_like_python(candidate: Any) -> bool:
        try:
            return bool("python" in candidate.name() and candidate.cmdline())
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # access denied or the process died
            return False

    return [proc for proc in psutil.process_iter() if _looks_like_python(proc)]
39

40

41
def get_per_process_cpu_info() -> List[Dict[str, Any]]:
    """Sample CPU and memory usage of every running Python process.

    Returns:
        One dict per process with the keys ``pid``, ``cmd``, ``cpu_percent``
        and ``rss_memory``. ``uss_memory`` and ``pss_memory`` are added when
        the platform and the current privileges allow reading them.
    """
    per_process_info = []
    for p in get_processes_running_python_tests():
        try:
            info = {
                "pid": p.pid,
                "cmd": " ".join(p.cmdline()),
                "cpu_percent": p.cpu_percent(),
                "rss_memory": p.memory_info().rss,
            }
        except psutil.NoSuchProcess:
            # The process exited between being listed and being sampled;
            # drop it rather than failing the whole sample.
            continue

        # https://psutil.readthedocs.io/en/latest/index.html?highlight=memory_full_info
        # requires higher user privileges and could throw AccessDenied error, i.e. mac
        try:
            memory_full_info = p.memory_full_info()
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # It's ok to skip this; the basic figures above are still reported.
            pass
        else:
            info["uss_memory"] = memory_full_info.uss
            # memory_full_info is a named tuple, so `"pss" in memory_full_info`
            # would compare against its *values*. Look the field up by name.
            pss = getattr(memory_full_info, "pss", None)
            if pss is not None:
                # only available on linux
                info["pss_memory"] = pss

        per_process_info.append(info)
    return per_process_info
68

69

70
def get_per_process_gpu_info(handle: Any) -> List[Dict[str, Any]]:
    """Report GPU memory used by each compute process on an NVML device.

    Args:
        handle: Device handle passed straight to
            ``pynvml.nvmlDeviceGetComputeRunningProcesses``.

    Returns:
        One ``{"pid": ..., "gpu_memory": ...}`` dict per reported process.
    """
    running = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    return [
        {"pid": entry.pid, "gpu_memory": entry.usedGpuMemory} for entry in running
    ]
77

78

79
def rocm_ret_ok(ret: int) -> Any:
    """Tell whether a ROCm SMI status code equals ``RSMI_STATUS_SUCCESS``."""
    success_code = rsmi_status_t.RSMI_STATUS_SUCCESS
    return ret == success_code
81

82

83
def rocm_list_devices() -> List[int]:
    """Enumerate the device indices known to ROCm SMI.

    Returns:
        ``[0, 1, ..., count - 1]`` for the device count reported by
        ``rsmi_num_monitor_devices``, or an empty list when that call fails.
    """
    device_count = c_uint32(0)
    status = rocmsmi.rsmi_num_monitor_devices(byref(device_count))
    if not rocm_ret_ok(status):
        return []
    return [*range(device_count.value)]
89

90

91
def rocm_get_mem_use(device: int) -> float:
    """Return the fraction of memory in use on a ROCm device.

    Args:
        device: Device index as produced by ``rocm_list_devices``.

    Returns:
        ``used / total`` as a float, or ``0.0`` when either query fails or
        the reported total is zero.
    """
    memory_use = c_uint64()
    memory_tot = c_uint64()

    # The second argument selects the memory type; 0 is presumably VRAM
    # (NOTE(review): confirm against rsmi_memory_type_t).
    ret = rocmsmi.rsmi_dev_memory_usage_get(device, 0, byref(memory_use))
    if not rocm_ret_ok(ret):
        return 0.0

    ret = rocmsmi.rsmi_dev_memory_total_get(device, 0, byref(memory_tot))
    if not rocm_ret_ok(ret):
        return 0.0

    if memory_tot.value == 0:
        # A zero total would otherwise raise ZeroDivisionError.
        return 0.0

    return float(memory_use.value) / float(memory_tot.value)
101

102

103
def rocm_get_gpu_use(device: int) -> float:
    """Return the busy percentage reported for a ROCm device.

    Args:
        device: Device index as produced by ``rocm_list_devices``.

    Returns:
        The value written by ``rsmi_dev_busy_percent_get`` as a float, or
        ``0.0`` when the call does not succeed.
    """
    busy = c_uint32()
    status = rocmsmi.rsmi_dev_busy_percent_get(device, byref(busy))
    return float(busy.value) if rocm_ret_ok(status) else 0.0
109

110

111
def rocm_get_pid_list() -> List[Any]:
    """List the process ids of compute processes known to ROCm SMI.

    Returns:
        The ``process_id`` of every entry the library filled in, or an empty
        list when either library call fails.
    """
    num_items = c_uint32()

    # First call with a NULL buffer only asks how many processes there are.
    ret = rocmsmi.rsmi_compute_process_info_get(None, byref(num_items))
    if not rocm_ret_ok(ret):
        return []

    # Leave some slack in case processes start between the two calls.
    buff_sz = num_items.value + 10
    procs = (rsmi_process_info_t * buff_sz)()

    # On input num_items is the buffer capacity, on output the number of
    # entries written (per the ROCm SMI API description), so advertise the
    # full buffer including the slack.
    num_items.value = buff_sz
    ret = rocmsmi.rsmi_compute_process_info_get(byref(procs), byref(num_items))
    if not rocm_ret_ok(ret):
        # The buffer contents are not trustworthy after a failed call.
        return []

    # Never index past the buffer, whatever count the library reports.
    filled = min(num_items.value, buff_sz)
    return [procs[i].process_id for i in range(filled)]
123

124

125
def rocm_get_per_process_gpu_info() -> List[Dict[str, Any]]:
    """Report GPU memory used by each ROCm compute process.

    Returns:
        One ``{"pid": ..., "gpu_memory": ...}`` dict for every pid from
        ``rocm_get_pid_list`` whose per-pid query succeeds. Pids whose query
        fails are omitted.
    """
    collected = []
    for pid in rocm_get_pid_list():
        details = rsmi_process_info_t()
        status = rocmsmi.rsmi_compute_process_info_by_pid_get(
            int(pid), byref(details)
        )
        if not rocm_ret_ok(status):
            continue
        collected.append({"pid": pid, "gpu_memory": details.vram_usage})
    return collected
134

135

136
if __name__ == "__main__":
    # Script entry point: print one JSON line of CPU/GPU usage statistics per
    # second on stdout until SIGTERM is received.

    def _utc_timestamp() -> str:
        """Return the current UTC time in ISO-8601 form with a trailing "Z"."""
        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # tzinfo so the rendered text keeps the same shape as before.
        now = datetime.datetime.now(datetime.timezone.utc)
        return now.replace(tzinfo=None).isoformat("T") + "Z"

    # NVML handle of device 0, or None when NVML cannot be initialised.
    handle = None
    try:
        pynvml.nvmlInit()
        handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    except pynvml.NVMLError:
        # no pynvml available, probably because not cuda
        pass

    # ROCm device indices; stays empty when ROCm SMI is unusable.
    rsmi_handles = []
    try:
        # Only enumerate devices when initialisation actually succeeded.
        if rocm_ret_ok(rocmsmi.rsmi_init(0)):
            rsmi_handles = rocm_list_devices()
    except Exception:
        # no rocmsmi available, probably because not rocm (this also covers
        # the names being undefined after the guarded import failed)
        pass

    kill_now = False

    def exit_gracefully(*args: Any) -> None:
        """SIGTERM handler: ask the sampling loop to stop after this pass."""
        global kill_now
        kill_now = True

    signal.signal(signal.SIGTERM, exit_gracefully)

    while not kill_now:
        try:
            stats: Dict[str, Any] = {
                "time": _utc_timestamp(),
                "total_cpu_percent": psutil.cpu_percent(),
                "per_process_cpu_info": get_per_process_cpu_info(),
            }
            if handle is not None:
                stats["per_process_gpu_info"] = get_per_process_gpu_info(handle)
                # https://docs.nvidia.com/deploy/nvml-api/structnvmlUtilization__t.html
                gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
                stats["total_gpu_utilization"] = gpu_utilization.gpu
                stats["total_gpu_mem_utilization"] = gpu_utilization.memory
            if rsmi_handles:
                stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info()
                # There are 1 to 4 GPUs in use; these values may sum > 1.0.
                rocm_gpu_utilization = 0.0
                rocm_gpu_memory = 0.0
                for dev in rsmi_handles:
                    rocm_gpu_utilization += rocm_get_gpu_use(dev)
                    rocm_gpu_memory += rocm_get_mem_use(dev)
                stats["total_gpu_utilization"] = rocm_gpu_utilization
                stats["total_gpu_mem_utilization"] = rocm_gpu_memory
        except Exception as e:
            # A failed sample is reported as an error record, not a crash.
            stats = {
                "time": _utc_timestamp(),
                "error": str(e),
            }

        # Emit outside of `finally` so that a non-Exception interruption
        # (e.g. KeyboardInterrupt) propagates immediately instead of touching
        # a possibly unbound `stats` and sleeping first.
        print(json.dumps(stats))
        time.sleep(1)
193

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.