#!/usr/bin/env python3

# type: ignore

"""
compare.py - versatile benchmark output compare tool
"""

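# Illustrative invocations (the <...> placeholders below are not real paths,
# they only sketch the three modes defined by create_parser()):
#   compare.py benchmarks <baseline> <contender> [-- <benchmark flags>...]
#   compare.py filters <benchmark> <filter_baseline> <filter_contender> [-- <benchmark flags>...]
#   compare.py benchmarksfiltered <baseline> <filter_baseline> <contender> <filter_contender> [-- <benchmark flags>...]
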
import argparse
import json
import os
import sys
import unittest
from argparse import ArgumentParser

import gbench
from gbench import report, util


def check_inputs(in1, in2, flags):
    """
    Perform checks on the user-provided inputs and diagnose any abnormalities.
    """
    in1_kind, in1_err = util.classify_input_file(in1)
    in2_kind, in2_err = util.classify_input_file(in2)
    output_file = util.find_benchmark_flag("--benchmark_out=", flags)
    output_type = util.find_benchmark_flag("--benchmark_out_format=", flags)
    if (
        in1_kind == util.IT_Executable
        and in2_kind == util.IT_Executable
        and output_file
    ):
        print(
            (
                "WARNING: '--benchmark_out=%s' will be passed to both "
                "benchmarks, causing it to be overwritten"
            )
            % output_file
        )
    if in1_kind == util.IT_JSON and in2_kind == util.IT_JSON:
        # When both sides are JSON the only supported flag is
        # --benchmark_filter=
        for flag in util.remove_benchmark_flags("--benchmark_filter=", flags):
            print(
                "WARNING: passing %s has no effect since both "
                "inputs are JSON" % flag
            )
    if output_type is not None and output_type != "json":
        print(
            (
                "ERROR: passing '--benchmark_out_format=%s' to 'compare.py'"
                " is not supported."
            )
            % output_type
        )
        sys.exit(1)


def create_parser():
    parser = ArgumentParser(
        description="versatile benchmark output compare tool"
    )

    parser.add_argument(
        "-a",
        "--display_aggregates_only",
        dest="display_aggregates_only",
        action="store_true",
        help="If there are repetitions, by default, we display everything - the"
        " actual runs, and the aggregates computed. Sometimes, it is "
        "desirable to only view the aggregates. E.g. when there are a lot "
        "of repetitions. Do note that only the display is affected. "
        "Internally, all the actual runs are still used, e.g. for U test.",
    )

    parser.add_argument(
        "--no-color",
        dest="color",
        default=True,
        action="store_false",
        help="Do not use colors in the terminal output",
    )

    parser.add_argument(
        "-d",
        "--dump_to_json",
        dest="dump_to_json",
        help="Additionally, dump benchmark comparison output to this file in JSON format.",
    )

    utest = parser.add_argument_group()
    utest.add_argument(
        "--no-utest",
        dest="utest",
        default=True,
        action="store_false",
        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires a **LARGE** number of repetitions (no less than {}) to be meaningful!\nThe test is done by default if at least {} repetitions were performed.\nThis option can disable the U Test.".format(
            report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS
        ),
    )
    alpha_default = 0.05
    utest.add_argument(
        "--alpha",
        dest="utest_alpha",
        default=alpha_default,
        type=float,
        help=(
            "Significance level alpha. If the calculated p-value is below this value, the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)"
        )
        % alpha_default,
    )

    subparsers = parser.add_subparsers(
        help="This tool has multiple modes of operation:", dest="mode"
    )

    parser_a = subparsers.add_parser(
        "benchmarks",
        help="The simplest use case: compare all the output of these two benchmarks",
    )
    baseline = parser_a.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    contender = parser_a.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    parser_a.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    parser_b = subparsers.add_parser(
        "filters",
        help="Compare one filter of a benchmark against another filter of the same benchmark",
    )
    baseline = parser_b.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test",
        metavar="test",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, which will be used as the baseline",
    )
    contender = parser_b.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, which will be compared against the baseline",
    )
    parser_b.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    parser_c = subparsers.add_parser(
        "benchmarksfiltered",
        help="Compare a filter of the first benchmark with a filter of the second benchmark",
    )
    baseline = parser_c.add_argument_group("baseline", "The benchmark baseline")
    baseline.add_argument(
        "test_baseline",
        metavar="test_baseline",
        type=argparse.FileType("r"),
        nargs=1,
        help="A benchmark executable or JSON output file",
    )
    baseline.add_argument(
        "filter_baseline",
        metavar="filter_baseline",
        type=str,
        nargs=1,
        help="The first filter, which will be used as the baseline",
    )
    contender = parser_c.add_argument_group(
        "contender", "The benchmark that will be compared against the baseline"
    )
    contender.add_argument(
        "test_contender",
        metavar="test_contender",
        type=argparse.FileType("r"),
        nargs=1,
        help="The second benchmark executable or JSON output file, which will be compared against the baseline",
    )
    contender.add_argument(
        "filter_contender",
        metavar="filter_contender",
        type=str,
        nargs=1,
        help="The second filter, which will be compared against the baseline",
    )
    parser_c.add_argument(
        "benchmark_options",
        metavar="benchmark_options",
        nargs=argparse.REMAINDER,
        help="Arguments to pass when running benchmark executables",
    )

    return parser


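# main() drives the whole comparison: it parses the command line, resolves the
# selected mode into (test_baseline, test_contender, filter_*), sanity-checks
# the inputs, runs or loads both benchmarks, optionally filters them, and
# prints the difference report, optionally dumping it to JSON as well.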
def main():
    # Parse the command line flags
    parser = create_parser()
    args, unknown_args = parser.parse_known_args()
    if args.mode is None:
        parser.print_help()
        exit(1)
    assert not unknown_args
    benchmark_options = args.benchmark_options

    if args.mode == "benchmarks":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = ""
        filter_contender = ""

        # NOTE: if test_baseline == test_contender, you are analyzing the stdev

        description = "Comparing %s to %s" % (test_baseline, test_contender)
    elif args.mode == "filters":
        test_baseline = args.test[0].name
        test_contender = args.test[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if filter_baseline == filter_contender, you are analyzing the
        # stdev

        description = "Comparing %s to %s (from %s)" % (
            filter_baseline,
            filter_contender,
            args.test[0].name,
        )
    elif args.mode == "benchmarksfiltered":
        test_baseline = args.test_baseline[0].name
        test_contender = args.test_contender[0].name
        filter_baseline = args.filter_baseline[0]
        filter_contender = args.filter_contender[0]

        # NOTE: if test_baseline == test_contender and
        # filter_baseline == filter_contender, you are analyzing the stdev

        description = "Comparing %s (from %s) to %s (from %s)" % (
            filter_baseline,
            test_baseline,
            filter_contender,
            test_contender,
        )
    else:
        # should never happen
        print("Unrecognized mode of operation: '%s'" % args.mode)
        parser.print_help()
        exit(1)

    check_inputs(test_baseline, test_contender, benchmark_options)

    if args.display_aggregates_only:
        benchmark_options += ["--benchmark_display_aggregates_only=true"]

    options_baseline = []
    options_contender = []

    if filter_baseline and filter_contender:
        options_baseline = ["--benchmark_filter=%s" % filter_baseline]
        options_contender = ["--benchmark_filter=%s" % filter_contender]

    # Run the benchmarks and report the results
    json1 = json1_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_baseline, benchmark_options + options_baseline
        )
    )
    json2 = json2_orig = gbench.util.sort_benchmark_results(
        gbench.util.run_or_load_benchmark(
            test_contender, benchmark_options + options_contender
        )
    )

    # Now, filter the benchmarks so that the difference report can work
    if filter_baseline and filter_contender:
        replacement = "[%s vs. %s]" % (filter_baseline, filter_contender)
        json1 = gbench.report.filter_benchmark(
            json1_orig, filter_baseline, replacement
        )
        json2 = gbench.report.filter_benchmark(
            json2_orig, filter_contender, replacement
        )

    diff_report = gbench.report.get_difference_report(json1, json2, args.utest)
    output_lines = gbench.report.print_difference_report(
        diff_report,
        args.display_aggregates_only,
        args.utest,
        args.utest_alpha,
        args.color,
    )
    print(description)
    for ln in output_lines:
        print(ln)

    # Optionally, dump the difference report to a JSON file
    if args.dump_to_json is not None:
        with open(args.dump_to_json, "w") as f_json:
            json.dump(diff_report, f_json, indent=1)


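# Self-tests for the argument parser. They are not run by default; to execute
# them, call unittest.main() instead of main() in the __main__ block below
# (see the commented-out call there).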
class TestParser(unittest.TestCase):
    def setUp(self):
        self.parser = create_parser()
        testInputs = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "gbench", "Inputs"
        )
        self.testInput0 = os.path.join(testInputs, "test1_run1.json")
        self.testInput1 = os.path.join(testInputs, "test1_run2.json")

    def test_benchmarks_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest(self):
        parsed = self.parser.parse_args(
            ["--no-utest", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.05)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_display_aggregates_only(self):
        parsed = self.parser.parse_args(
            ["-a", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertTrue(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            ["--alpha=0.314", "benchmarks", self.testInput0, self.testInput1]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
        parsed = self.parser.parse_args(
            [
                "--no-utest",
                "--alpha=0.314",
                "benchmarks",
                self.testInput0,
                self.testInput1,
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertFalse(parsed.utest)
        self.assertEqual(parsed.utest_alpha, 0.314)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarks_with_remainder(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "d"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["d"])

    def test_benchmarks_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["benchmarks", self.testInput0, self.testInput1, "--", "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarks")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_basic(self):
        parsed = self.parser.parse_args(["filters", self.testInput0, "c", "d"])
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertFalse(parsed.benchmark_options)

    def test_filters_with_remainder(self):
        parsed = self.parser.parse_args(
            ["filters", self.testInput0, "c", "d", "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["e"])

    def test_filters_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            ["filters", self.testInput0, "c", "d", "--", "f"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "filters")
        self.assertEqual(parsed.test[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.filter_contender[0], "d")
        self.assertEqual(parsed.benchmark_options, ["f"])

    def test_benchmarksfiltered_basic(self):
        parsed = self.parser.parse_args(
            ["benchmarksfiltered", self.testInput0, "c", self.testInput1, "e"]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertFalse(parsed.benchmark_options)

    def test_benchmarksfiltered_with_remainder(self):
        parsed = self.parser.parse_args(
            [
                "benchmarksfiltered",
                self.testInput0,
                "c",
                self.testInput1,
                "e",
                "f",
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "f")

    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
        parsed = self.parser.parse_args(
            [
                "benchmarksfiltered",
                self.testInput0,
                "c",
                self.testInput1,
                "e",
                "--",
                "g",
            ]
        )
        self.assertFalse(parsed.display_aggregates_only)
        self.assertTrue(parsed.utest)
        self.assertEqual(parsed.mode, "benchmarksfiltered")
        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
        self.assertEqual(parsed.filter_baseline[0], "c")
        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
        self.assertEqual(parsed.filter_contender[0], "e")
        self.assertEqual(parsed.benchmark_options[0], "g")


if __name__ == "__main__":
    # unittest.main()
    main()

# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
# kate: indent-mode python; remove-trailing-spaces modified;