7
from typing import Any, Dict, List
16
sys.path.append("/opt/rocm/libexec/rocm_smi")
18
from ctypes import byref, c_uint32, c_uint64
20
from rsmiBindings import (
25
except ImportError as e:
29
def get_processes_running_python_tests() -> List[Any]:
    """Return the psutil processes that look like running Python programs.

    A process is kept when its name contains ``"python"`` and its command
    line is non-empty.

    Returns:
        A list of ``psutil.Process`` objects; empty when nothing matches.
    """
    python_processes = []
    for process in psutil.process_iter():
        try:
            if "python" in process.name() and process.cmdline():
                python_processes.append(process)
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            # The process exited, or this user may not inspect it: skip it.
            continue
    return python_processes
41
def get_per_process_cpu_info() -> List[Dict[str, Any]]:
    """Collect CPU and memory usage for every running Python process.

    Returns:
        One dict per process with the keys ``pid``, ``cmd``, ``cpu_percent``
        and ``rss_memory``.  ``uss_memory`` and ``pss_memory`` are added when
        the extended memory statistics can be read and provide those fields.
    """
    per_process_info = []
    for p in get_processes_running_python_tests():
        try:
            info = {
                "pid": p.pid,
                "cmd": " ".join(p.cmdline()),
                "cpu_percent": p.cpu_percent(),
                "rss_memory": p.memory_info().rss,
            }
        except psutil.NoSuchProcess:
            # The process exited between enumeration and sampling; dropping it
            # keeps one short-lived process from invalidating the whole sample.
            continue

        try:
            memory_full_info = p.memory_full_info()
        except (psutil.AccessDenied, psutil.NoSuchProcess):
            # Extended statistics are optional: keep the basic record.
            pass
        else:
            info["uss_memory"] = memory_full_info.uss
            # memory_full_info() returns a named tuple, so `"pss" in ...`
            # would compare against the field *values*.  Look the field up by
            # name instead; it does not exist on every platform.
            pss = getattr(memory_full_info, "pss", None)
            if pss is not None:
                info["pss_memory"] = pss

        per_process_info.append(info)
    return per_process_info
70
def get_per_process_gpu_info(handle: Any) -> List[Dict[str, Any]]:
    """Report GPU memory usage of the compute processes on an NVML device.

    Args:
        handle: NVML device handle passed straight to
            ``pynvml.nvmlDeviceGetComputeRunningProcesses``.

    Returns:
        One ``{"pid": ..., "gpu_memory": ...}`` dict per process returned by
        NVML, in the order NVML returned them.
    """
    processes = pynvml.nvmlDeviceGetComputeRunningProcesses(handle)
    return [{"pid": p.pid, "gpu_memory": p.usedGpuMemory} for p in processes]
79
def rocm_ret_ok(ret: int) -> Any:
    """Return whether *ret* equals the ROCm SMI success status code."""
    return ret == rsmi_status_t.RSMI_STATUS_SUCCESS
83
def rocm_list_devices() -> List[int]:
    """Return the indices of the devices reported by ROCm SMI.

    Returns:
        ``[0, 1, ..., n - 1]`` where ``n`` is the count written by
        ``rsmi_num_monitor_devices``, or an empty list when that call does
        not report success.
    """
    num = c_uint32(0)
    ret = rocmsmi.rsmi_num_monitor_devices(byref(num))
    if not rocm_ret_ok(ret):
        return []
    return list(range(num.value))
91
def rocm_get_mem_use(device: int) -> float:
    """Return the fraction of a ROCm device's memory that is in use.

    Args:
        device: Device index as produced by ``rocm_list_devices``.

    Returns:
        ``used / total`` as a float, or ``0.0`` when either query fails or
        the reported total is zero.
    """
    memory_use = c_uint64()
    memory_total = c_uint64()

    # The second argument selects the memory type; 0 is presumably VRAM --
    # TODO confirm against rsmi_memory_type_t.
    ret = rocmsmi.rsmi_dev_memory_usage_get(device, 0, byref(memory_use))
    if not rocm_ret_ok(ret):
        return 0.0

    ret = rocmsmi.rsmi_dev_memory_total_get(device, 0, byref(memory_total))
    # A zero total would raise ZeroDivisionError below.
    if not rocm_ret_ok(ret) or memory_total.value == 0:
        return 0.0

    return float(memory_use.value) / float(memory_total.value)
103
def rocm_get_gpu_use(device: int) -> float:
    """Return the busy percentage reported by ROCm SMI for a device.

    Args:
        device: Device index as produced by ``rocm_list_devices``.

    Returns:
        The value written by ``rsmi_dev_busy_percent_get`` as a float, or
        ``0.0`` when the call does not report success.
    """
    percent = c_uint32()
    ret = rocmsmi.rsmi_dev_busy_percent_get(device, byref(percent))
    if not rocm_ret_ok(ret):
        return 0.0
    return float(percent.value)
111
def rocm_get_pid_list() -> List[Any]:
    """Return the process ids ROCm SMI reports as compute processes.

    Returns:
        The ``process_id`` of every entry written by
        ``rsmi_compute_process_info_get``, or an empty list when either call
        does not report success.
    """
    num_items = c_uint32()
    # With no buffer the call only writes the number of available entries.
    ret = rocmsmi.rsmi_compute_process_info_get(None, byref(num_items))
    if not rocm_ret_ok(ret):
        return []

    # Over-allocate by 10 entries -- presumably head-room for processes that
    # start between the two calls.
    buff_sz = num_items.value + 10
    procs = (rsmi_process_info_t * buff_sz)()
    # On input num_items is the buffer capacity; without this the head-room
    # above is never available to the library.
    num_items.value = buff_sz
    ret = rocmsmi.rsmi_compute_process_info_get(byref(procs), byref(num_items))
    if not rocm_ret_ok(ret):
        # Do not report entries from a buffer the library did not fill.
        return []

    # Never index past the buffer, whatever count comes back.
    count = min(num_items.value, buff_sz)
    return [procs[i].process_id for i in range(count)]
125
def rocm_get_per_process_gpu_info() -> List[Dict[str, Any]]:
    """Report GPU memory usage for every ROCm compute process.

    Returns:
        One ``{"pid": ..., "gpu_memory": ...}`` dict per pid from
        ``rocm_get_pid_list`` whose lookup succeeds; ``gpu_memory`` is the
        ``vram_usage`` field of the process record.
    """
    per_process_info = []
    for pid in rocm_get_pid_list():
        proc = rsmi_process_info_t()
        ret = rocmsmi.rsmi_compute_process_info_by_pid_get(int(pid), byref(proc))
        if not rocm_ret_ok(ret):
            # A failed lookup leaves `proc` zero-initialised; skip it rather
            # than report a bogus 0-byte usage.
            continue
        per_process_info.append({"pid": pid, "gpu_memory": proc.vram_usage})
    return per_process_info
136
# Script entry point. The visible statements obtain GPU handles (NVML and
# ROCm SMI), register a SIGTERM handler, build a `stats` dict of CPU/GPU usage
# and print it as JSON.
# NOTE(review): this section appears without indentation and with statements
# missing between the visible ones (e.g. the `try:` lines matching the
# `except` clauses below) -- confirm against the complete file before editing.
if __name__ == "__main__":
140
# NVML handle for device index 0; NVMLError is handled by the clause below.
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
141
except pynvml.NVMLError:
147
# Initialise ROCm SMI with argument 0; `ret` is not inspected in the visible code.
ret = rocmsmi.rsmi_init(0)
148
# Device indices returned by rocm_list_devices().
rsmi_handles = rocm_list_devices()
155
# Registered below as the SIGTERM handler; its body is not visible here.
def exit_gracefully(*args: Any) -> None:
159
signal.signal(signal.SIGTERM, exit_gracefully)
164
# Naive UTC timestamp in ISO-8601 form with a literal "Z" appended.
# NOTE(review): datetime.utcnow() is deprecated since Python 3.12.
"time": datetime.datetime.utcnow().isoformat("T") + "Z",
165
"total_cpu_percent": psutil.cpu_percent(),
166
"per_process_cpu_info": get_per_process_cpu_info(),
168
# NVML path: per-process GPU memory plus the device utilisation rates.
if handle is not None:
169
stats["per_process_gpu_info"] = get_per_process_gpu_info(handle)
171
gpu_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
172
stats["total_gpu_utilization"] = gpu_utilization.gpu
173
stats["total_gpu_mem_utilization"] = gpu_utilization.memory
175
# ROCm path: this assignment overwrites the "per_process_gpu_info" key.
# NOTE(review): the condition guarding this path is not visible here.
stats["per_process_gpu_info"] = rocm_get_per_process_gpu_info()
177
# Totals are summed over every device in rsmi_handles, so they grow with the
# number of devices.
gpu_utilization = 0.0
179
for dev in rsmi_handles:
180
gpu_utilization += rocm_get_gpu_use(dev)
181
# NOTE(review): the initialisation of gpu_memory is not visible here --
# confirm it starts at 0.0.
gpu_memory += rocm_get_mem_use(dev)
182
stats["total_gpu_utilization"] = gpu_utilization
183
stats["total_gpu_mem_utilization"] = gpu_memory
185
# Presumably builds a replacement record when sampling fails; only its "time"
# entry is visible here -- verify against the full file.
except Exception as e:
187
"time": datetime.datetime.utcnow().isoformat("T") + "Z",
191
# Emit the record as one JSON line on stdout.
print(json.dumps(stats))