"""This script runs cuda-memcheck on the specified unit test. Each test case
is run in its isolated process with a timeout so that:
1) different test cases won't influence each other, and
2) in case of hang, the script would still finish in a finite amount of time.

The output will be written to a log file result.log

Example usage:
    python run_cuda_memcheck.py ../test_torch.py 600

Note that running cuda-memcheck could be very slow.
"""

import argparse
import asyncio
import multiprocessing
import os
import subprocess
import sys

import torch
import tqdm

import cuda_memcheck_common as cmc
# Number of visible CUDA devices; used to round-robin worker processes onto GPUs.
GPUS = torch.cuda.device_count()

# Parse command-line arguments.
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument(
    "filename", help="the python file for a test, such as test_torch.py"
)
parser.add_argument(
    "timeout",
    type=int,
    help="kill the test if it does not terminate in a certain amount of seconds",
)
parser.add_argument(
    "--strict",
    action="store_true",
    help="Whether to show cublas/cudnn errors. These errors are ignored by default because"
    "cublas/cudnn does not run error-free under cuda-memcheck, and ignoring these errors",
)
parser.add_argument(
    "--nproc",
    type=int,
    default=multiprocessing.cpu_count(),
    help="Number of processes running tests, default to number of cores in the system",
)
parser.add_argument(
    "--gpus",
    default="all",
    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"',
)
parser.add_argument(
    "--ci",
    action="store_true",
    help="Whether this script is executed in CI. When executed inside a CI, this script fails when "
    "an error is detected. Also, it will not show tqdm progress bar, but directly print the error"
    "to stdout instead.",
)
parser.add_argument("--nohang", action="store_true", help="Treat timeout as success")
parser.add_argument("--split", type=int, default=1, help="Split the job into pieces")
parser.add_argument(
    "--rank", type=int, default=0, help="Which piece this process should pick"
)
args = parser.parse_args()
73
# Filters that ignores cublas/cudnn errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    """Return True iff every error in the cuda-memcheck `output` originates
    from an ignorable library (cublas/cudnn/cufft).

    If the output cannot be parsed, nothing is ignored (returns False).
    """
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # in case the simple parser fails parsing the output of cuda memcheck
        # then this error is never ignored.
        return False
    count_ignored_errors = 0
    for e in report.errors:
        # An error is ignorable when any frame of its stack comes from
        # cublas, cudnn, or cufft.
        if (
            "libcublas" in "".join(e.stack)
            or "libcudnn" in "".join(e.stack)
            or "libcufft" in "".join(e.stack)
        ):
            count_ignored_errors += 1
    # Only fully-ignorable reports count as "ignored only".
    return count_ignored_errors == report.num_errors
93
# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ["PYTORCH_CUDA_MEMCHECK"] = "1"

# Discover the test case names.
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output
proc = subprocess.Popen(
    ["pytest", "--setup-only", args.filename],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
stdout, stderr = proc.communicate()
lines = stdout.decode().strip().splitlines()
ALL_TESTS = []
for line in lines:
    # Lines describing a collected test look like:
    #   test_torch.py::TestTorch::test_foo (fixtures used: ...)
    if "(fixtures used:" in line:
        line = line.strip().split()[0]
        # Strip the file path prefix, keep "Class::test" and convert to
        # the "Class.test" form accepted as a unittest argument.
        line = line[line.find("::") + 2 :]
        line = line.replace("::", ".")
        ALL_TESTS.append(line)
115
# Do a simple filtering:
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
def is_cpu_only(name):
    """Return True iff `name` looks like a CPU-only test (case-insensitive):
    it mentions "cpu" and does not mention "cuda"."""
    name = name.lower()
    return ("cpu" in name) and "cuda" not in name
122
# Keep only tests that may exercise CUDA.
ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]

# Split all tests into chunks, and only run on the selected chunk
# (ceil division so the last chunk absorbs the remainder).
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
start = chunk_size * args.rank
end = chunk_size * (args.rank + 1)
ALL_TESTS = ALL_TESTS[start:end]
132
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
# This is done by using the coroutine feature in new Python versions. A number of coroutines are created;
# they create subprocesses and awaiting them to finish. The number of running subprocesses could be
# specified by the user and by default is the same as the number of CPUs in the machine.
# These subprocesses are balanced across different GPUs on the system by assigning one devices per process,
# or as specified by the user

# Index of the next test to dispatch; shared by all worker coroutines.
progress = 0
if not args.ci:
    logfile = open("result.log", "w")
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
else:
    # In CI, print directly to stdout instead of a log file.
    logfile = sys.stdout

    # create a fake progress bar that does not display anything
    class ProgressbarStub:
        def update(self, *args):
            pass

    progressbar = ProgressbarStub()
153
async def run1(coroutine_id):
    """Worker coroutine: repeatedly pick the next pending test from ALL_TESTS
    and run it under cuda-memcheck in an isolated subprocess, logging the
    outcome (Success/Fail/Ignored/Timeout) to `logfile`.

    `coroutine_id` selects the GPU for this worker: round-robin over all GPUs
    when --gpus=all, otherwise the coroutine's entry in the ':'-separated list.
    """
    global progress

    if args.gpus == "all":
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(":")
        assert args.nproc == len(
            gpu_assignments
        ), "Please specify GPU assignment for each process, separated by :"
        gpuid = gpu_assignments[coroutine_id]

    while progress < len(ALL_TESTS):
        test = ALL_TESTS[progress]
        progress += 1
        cmd = f"CUDA_VISIBLE_DEVICES={gpuid} cuda-memcheck --error-exitcode 1 python {args.filename} {test}"
        proc = await asyncio.create_subprocess_shell(
            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
        )
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print("Timeout:", test, file=logfile)
            proc.kill()
            # Unless --nohang, a hang in CI is a hard failure.
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print("Success:", test, file=logfile)
            else:
                stdout = stdout.decode()
                stderr = stderr.decode()
                # With --strict show every failure; otherwise hide failures
                # consisting solely of ignorable cublas/cudnn/cufft errors.
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print("Fail:", test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print("Ignored:", test, file=logfile)
        del proc
        progressbar.update(1)
199
async def main():
    """Spawn one worker coroutine per requested process and wait for all."""
    tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)]
    for t in tasks:
        await t


if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())