#!/usr/bin/env python3
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Script for running A/B-Tests

The script takes two git revisions and a pytest integration test. It utilizes
our integration test framework's --binary-dir parameter to execute the given
test using binaries compiled from each revision, and captures the EMF logs
output. It then searches for list-valued properties/metrics in the EMF, and runs
a regression test comparing these lists for the two runs.

It performs the A/B-test as follows:
For each EMF log message output, look at the dimensions. The script assumes that
dimensions are unique across all log messages output from a single test run. In
each log message, then look for all properties that have lists assigned to them,
and collect them. For both runs of the test, the set of distinct dimensions
collected this way must be the same. Then, we match corresponding dimensions
between the two runs, performing a statistical regression test across all the
list-valued properties collected.
"""
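# Illustrative sketch of an EMF log message this script can consume (all names and
# values below are made up; real messages are whatever the test run emits):
#
#   {
#       "_aws": {"CloudWatchMetrics": [{
#           "Dimensions": [["instance", "performance_test"]],
#           "Metrics": [{"Name": "latency", "Unit": "Milliseconds"}],
#       }]},
#       "instance": "m6a.metal",
#       "performance_test": "test_example",
#       "latency": [1.2, 1.4, 1.3],
#   }
#
# Here ("instance", "performance_test") are the dimensions and "latency" is a
# list-valued metric that the A/B-test compares across the two runs.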
import argparse
import json
import statistics
import sys
from collections import defaultdict
from pathlib import Path

# Hack to be able to use our test framework code
sys.path.append(str(Path(__file__).parent.parent / "tests"))

# pylint:disable=wrong-import-position
from framework import utils
from framework.ab_test import check_regression, git_ab_test_with_binaries
from framework.properties import global_props
from host_tools.metrics import (
    emit_raw_emf,
    format_with_reduced_unit,
    get_metrics_logger,
)

# Performance tests that are known to be unstable and exhibit variances of up to 60% of the mean
IGNORED = [
    # Network throughput on m6a.metal
    {"instance": "m6a.metal", "performance_test": "test_network_tcp_throughput"},
    # Block throughput for 1 vcpu on m6g.metal/5.10
    {
        "performance_test": "test_block_performance",
        "instance": "m6g.metal",
        "host_kernel": "linux-5.10",
        "vcpus": "1",
    },
]

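# For illustration (dimension values are made up): a run whose dimensions include
# {"instance": "m6a.metal", "performance_test": "test_network_tcp_throughput", "vcpus": "2"}
# is skipped, because the first IGNORED entry above is a subset of those dimensions.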
def is_ignored(dimensions) -> bool:
    """Checks whether the given dimensions match an entry in the IGNORED list above"""
    for high_variance in IGNORED:
        matching = {key: dimensions[key] for key in high_variance if key in dimensions}

        if matching == high_variance:
            return True

    return False

def extract_dimensions(emf):
    """Extracts the cloudwatch dimensions from an EMF log message"""
    if not emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"]:
        # Skipped tests emit a duration metric, but have no dimensions set
        return {}

    dimension_list = emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"][0]
    return {key: emf[key] for key in emf if key in dimension_list}

def process_log_entry(emf: dict):
    """Parses the given EMF log entry

    Returns the entry's dimensions and its list-valued properties/metrics, together with their units
    """
    result = {
        key: (value, find_unit(emf, key))
        for key, value in emf.items()
        if (
            "fc_metrics" not in key
            and "cpu_utilization" not in key
            and isinstance(value, list)
        )
    }
    # Since we skip metrics that have fc_metrics or cpu_utilization in their key,
    # result could be empty; in that case, return empty dimensions as well
    if not result:
        return {}, {}

    return extract_dimensions(emf), result

def find_unit(emf: dict, metric: str):
    """Determines the unit of the given metric"""
    metrics = {
        y["Name"]: y["Unit"] for y in emf["_aws"]["CloudWatchMetrics"][0]["Metrics"]
    }
    return metrics.get(metric, "None")

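# Illustrative shape of the dictionary returned by load_data_series below (all
# dimension/metric names and values are made up):
#
#   {
#       frozenset({("performance_test", "test_example"), ("vcpus", "2")}):
#           {"latency": ([1.2, 1.4, 1.3], "Milliseconds")},
#   }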
def load_data_series(report_path: Path, revision: str = None, *, reemit: bool = False):
    """Loads the data series relevant for A/B-testing from test_results/test-report.json
    into a dictionary mapping each message's cloudwatch dimensions to a dictionary of
    its list-valued properties/metrics.

    If `reemit` is True, it also reemits all EMF logs to a local EMF agent,
    overwriting the attached "git_commit_id" field with the given revision."""
    # Dictionary mapping EMF dimensions to A/B-testable metrics/properties
    processed_emf = {}

    report = json.loads(report_path.read_text("UTF-8"))
    for test in report["tests"]:
        for line in test["teardown"]["stdout"].splitlines():
            # Only look at EMF log messages. If we ever have other stdout that starts with braces,
            # we will need to rethink this heuristic.
            if line.startswith("{"):
                emf = json.loads(line)

                if reemit:
                    assert revision is not None

                    emf["git_commit_id"] = revision
                    emit_raw_emf(emf)

                dimensions, result = process_log_entry(emf)

                if not dimensions:
                    continue

                dimension_set = frozenset(dimensions.items())

                if dimension_set not in processed_emf:
                    processed_emf[dimension_set] = result
                else:
                    # If there are many data points for a metric, they will be split across
                    # multiple EMF log messages. We need to reassemble :(
                    assert (
                        processed_emf[dimension_set].keys() == result.keys()
                    ), f"Found incompatible metrics associated with dimension set {dimension_set}: {processed_emf[dimension_set].keys()} in one EMF message, but {result.keys()} in another."

                    for metric, (values, unit) in processed_emf[dimension_set].items():
                        assert result[metric][1] == unit

                        values.extend(result[metric][0])

    return processed_emf

def collect_data(firecracker_binary: Path, jailer_binary: Path, test: str):
    """Executes the specified test using the provided firecracker binaries"""
    # Ensure the binaries are in the same directory. Will always be the case if used with git_ab_test_with_binaries
    assert jailer_binary.parent == firecracker_binary.parent

    binary_dir = firecracker_binary.parent
    revision = binary_dir.name

    print("Collecting samples")
    _, stdout, _ = utils.run_cmd(
        f"AWS_EMF_ENVIRONMENT=local AWS_EMF_NAMESPACE=local ./tools/test.sh --binary-dir=/firecracker/build/{revision} {test} -m ''"
    )
    print(stdout.strip())

    return load_data_series(
        Path("test_results/test-report.json"), revision, reemit=True
    )

def analyze_data(processed_emf_a, processed_emf_b, *, n_resamples: int = 9999):
    """
    Analyzes the A/B-test data produced by `collect_data`, by performing regression tests
    as described in this script's doc-comment.

    Returns a mapping of dimensions and properties/metrics to the result of their regression test.
    """
    assert set(processed_emf_a.keys()) == set(
        processed_emf_b.keys()
    ), "A and B run produced incomparable data. This is a bug in the test!"

    results = {}

    metrics_logger = get_metrics_logger()

    for prop_name, prop_val in global_props.__dict__.items():
        metrics_logger.set_property(prop_name, prop_val)

    for dimension_set in processed_emf_a:
        metrics_a = processed_emf_a[dimension_set]
        metrics_b = processed_emf_b[dimension_set]

        assert set(metrics_a.keys()) == set(
            metrics_b.keys()
        ), "A and B run produced incomparable data. This is a bug in the test!"

        for metric, (values_a, unit) in metrics_a.items():
            print(
                f"Doing A/B-test for dimensions {dimension_set} and property {metric}"
            )
            result = check_regression(
                values_a, metrics_b[metric][0], n_resamples=n_resamples
            )

            metrics_logger.set_dimensions({"metric": metric, **dict(dimension_set)})
            metrics_logger.put_metric("p_value", float(result.pvalue), "None")
            metrics_logger.put_metric("mean_difference", float(result.statistic), unit)
            metrics_logger.set_property("data_a", values_a)
            metrics_logger.set_property("data_b", metrics_b[metric][0])
            metrics_logger.flush()

            results[dimension_set, metric] = (result, unit)

    return results

def ab_performance_test(
    a_revision, b_revision, test, p_thresh, strength_abs_thresh, noise_threshold
):
    """Does an A/B-test of the specified test across the given revisions"""
    _, commit_list, _ = utils.run_cmd(
        f"git --no-pager log --oneline {a_revision}..{b_revision}"
    )
    print(
        f"Performance A/B-test across {a_revision}..{b_revision}. This includes the following commits:"
    )
    print(commit_list.strip())

    processed_emf_a, processed_emf_b, results = git_ab_test_with_binaries(
        lambda firecracker_binary, jailer_binary: collect_data(
            firecracker_binary, jailer_binary, test
        ),
        lambda ah, be: analyze_data(ah, be, n_resamples=int(100 / p_thresh)),
        a_revision=a_revision,
        b_revision=b_revision,
    )

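    # Presumably, n_resamples = 100 / p_thresh is chosen so that the resolution of the
    # resampling-based p-value (roughly 1 / n_resamples) stays two orders of magnitude
    # finer than the significance threshold it is later compared against.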
    # We group our A/B-Testing results by metric here. The resulting lists of values
    # will be approximately normally distributed, and we will use this property as a means of error correction.
    # The idea behind this is that testing the same metric (say, restore_latency) across different scenarios (e.g.
    # different vcpu counts) will be related in some unknown way (meaning most scenarios will show a change in the same
    # direction). In particular, if one scenario yields a slight improvement and the next yields a
    # slight degradation, we take this as evidence towards both being mere noise that cancels out.
    #
    # Empirical evidence for this assumption is that
    # 1. Historically, a true performance change has never shown up in just a single test, it always showed up
    #    across most (if not all) tests for a specific metric.
    # 2. Analyzing data collected from historical runs shows that across different parameterizations of the same
    #    metric, the collected samples approximately follow mean / variance = const, with the constant independent
    #    of the parameterization.
    #
    # Mathematically, this has the following justification: By the central
    # limit theorem, the means of samples are (approximately) normally distributed. Denote by A
    # and B the distributions of the mean of samples from the 'A' and 'B'
    # tests respectively. Under our null hypothesis, the distributions of the
    # 'A' and 'B' samples are identical (although we don't know what the exact
    # distributions are), meaning so are A and B, say A ~ B ~ N(mu, sigma^2).
    # The difference of two normal distributions is also normally distributed,
    # with the means being subtracted and the variances being added.
    # Therefore, A - B ~ N(0, 2sigma^2). If we now normalize this distribution by mu (which
    # corresponds to considering the distribution of relative regressions instead), we get (A-B)/mu ~ N(0, c), with c
    # being determined by the constant from point 2. above. This means that we can combine the relative means across
    # different parameterizations, and get a distribution whose expected
    # value is 0, provided our null hypothesis was true. It is exactly this distribution
    # for which we collect samples in the dictionary below. Therefore, the average of the
    # relative performance changes for a single metric is a good candidate for a sanity
    # check against false-positives.
    #
    # Note that with this approach, for performance changes to "cancel out", we would need essentially a perfect split
    # between scenarios that improve performance and scenarios that degrade performance, something we have not
    # ever observed to actually happen.
    relative_changes_by_metric = defaultdict(list)
    relative_changes_significant = defaultdict(list)

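    # Made-up illustration of the sanity check below: if a metric regresses by 3% in one
    # scenario but improves by 2% in another, the average relative change is 0.5%, which is
    # below the default 5% noise threshold, so both data points are treated as noise.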
    failures = []
    for (dimension_set, metric), (result, unit) in results.items():
        if is_ignored(dict(dimension_set)):
            continue

        values_a = processed_emf_a[dimension_set][metric][0]
        baseline_mean = statistics.mean(values_a)

        relative_changes_by_metric[metric].append(result.statistic / baseline_mean)

        if result.pvalue < p_thresh and abs(result.statistic) > strength_abs_thresh:
            failures.append((dimension_set, metric, result, unit))

            relative_changes_significant[metric].append(
                result.statistic / baseline_mean
            )

    messages = []
    for dimension_set, metric, result, unit in failures:
        # Sanity check as described above
        if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
            continue

        # No data points for this metric were deemed significant
        if metric not in relative_changes_significant:
            continue

        # The significant data points themselves are above the noise threshold
        if abs(statistics.mean(relative_changes_significant[metric])) > noise_threshold:
            old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
            new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])

            msg = (
                f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
                f"{format_with_reduced_unit(result.statistic, unit)}, or {result.statistic / old_mean:.2%}, "
                f"(from {format_with_reduced_unit(old_mean, unit)} to {format_with_reduced_unit(new_mean, unit)}) "
                f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
                f"This means that observing a change of this magnitude or worse, assuming that performance "
                f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
                f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2, sort_keys=True)}"
            )
            messages.append(msg)

    assert not messages, "\n" + "\n".join(messages)
    print("No regressions detected!")

def canonicalize_revision(revision):
    """Canonicalizes the given revision to a 40 digit hex SHA"""
    return utils.run_cmd(f"git rev-parse {revision}").stdout.strip()

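# Example invocations (script path, revisions, test and report paths are illustrative):
#   ./tools/ab_test.py run main my-feature-branch --test integration_tests/performance/test_boottime.py
#   ./tools/ab_test.py analyze a/test-report.json b/test-report.json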
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Executes Firecracker's A/B test suite across the specified commits"
    )
    subparsers = parser.add_subparsers(help="commands", dest="command", required=True)
    run_parser = subparsers.add_parser(
        "run",
        help="Run a specific test of our test suite as an A/B-test across two specified commits",
    )
    run_parser.add_argument(
        "a_revision",
        help="The baseline revision compared to which we want to avoid regressing",
    )
    run_parser.add_argument(
        "b_revision",
        help="The revision whose performance we want to compare against the results from a_revision",
    )
    run_parser.add_argument("--test", help="The test to run", required=True)
    analyze_parser = subparsers.add_parser(
        "analyze",
        help="Analyze the results of two manually run tests based on their test-report.json files",
    )
    analyze_parser.add_argument(
        "report_a",
        help="The path to the test-report.json file of the baseline run",
        type=Path,
    )
    analyze_parser.add_argument(
        "report_b",
        help="The path to the test-report.json file of the run whose performance we want to compare against report_a",
        type=Path,
    )
    parser.add_argument(
        "--significance",
        help="The p-value threshold that needs to be crossed for a test result to be considered significant",
        type=float,
        default=0.01,
    )
    parser.add_argument(
        "--absolute-strength",
        help="The minimum absolute delta required before a regression will be considered valid",
        type=float,
        default=0.0,
    )
    parser.add_argument(
        "--noise-threshold",
        help="The minimal delta by which a metric has to regress on average across all tests that emit it before the regressions will be considered valid",
        type=float,
        default=0.05,
    )
    args = parser.parse_args()

    if args.command == "run":
        ab_performance_test(
            # These will show up in Cloudwatch, so canonicalize to long commit SHAs
            canonicalize_revision(args.a_revision),
            canonicalize_revision(args.b_revision),
            args.test,
            args.significance,
            args.absolute_strength,
            args.noise_threshold,
        )
    else:
        data_a = load_data_series(args.report_a)
        data_b = load_data_series(args.report_b)

        analyze_data(data_a, data_b)