#!/usr/bin/env python3
# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Script for running A/B-Tests

The script takes two git revisions and a pytest integration test. It utilizes
our integration test framework's --binary-dir parameter to execute the given
test using binaries compiled from each revision, and captures the EMF logs
output. It then searches for list-valued properties/metrics in the EMF, and runs
a regression test comparing these lists for the two runs.

It performs the A/B-test as follows:
For each EMF log message output, look at the dimensions. The script assumes that
dimensions are unique across all log messages output from a single test run. In
each log message, then look for all properties that have lists assigned to them,
and collect them. For both runs of the test, the set of distinct dimensions
collected this way must be the same. Then, we match corresponding dimensions
between the two runs, performing a statistical regression test across all the
list-valued properties collected.
"""
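# Illustrative sketch of an EMF log message this script can consume (all names and
# values below are made up; real messages are whatever the test run emits):
#
#   {
#       "_aws": {"CloudWatchMetrics": [{
#           "Dimensions": [["instance", "performance_test"]],
#           "Metrics": [{"Name": "latency", "Unit": "Milliseconds"}],
#       }]},
#       "instance": "m6a.metal",
#       "performance_test": "test_example",
#       "latency": [1.2, 1.4, 1.3],
#   }
#
# Here ("instance", "performance_test") are the dimensions and "latency" is a
# list-valued metric that the A/B-test compares across the two runs.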
import argparse
import json
import statistics
import sys
from collections import defaultdict
from pathlib import Path

# Hack to be able to use our test framework code
sys.path.append(str(Path(__file__).parent.parent / "tests"))

# pylint:disable=wrong-import-position
from framework import utils
from framework.ab_test import check_regression, git_ab_test_with_binaries
from framework.properties import global_props
from host_tools.metrics import (
    emit_raw_emf,
    format_with_reduced_unit,
    get_metrics_logger,
)

# Performance tests that are known to be unstable and exhibit variances of up to 60% of the mean
IGNORED = [
    # Network throughput on m6a.metal
    {"instance": "m6a.metal", "performance_test": "test_network_tcp_throughput"},
    # Block throughput for 1 vcpu on m6g.metal/5.10
    {
        "performance_test": "test_block_performance",
        "instance": "m6g.metal",
        "host_kernel": "linux-5.10",
        "vcpus": "1",
    },
]

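# For illustration (dimension values are made up): a run whose dimensions include
# {"instance": "m6a.metal", "performance_test": "test_network_tcp_throughput", "vcpus": "2"}
# is skipped, because the first IGNORED entry above is a subset of those dimensions.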
def is_ignored(dimensions) -> bool:
    """Checks whether the given dimensions match an entry in the IGNORED list above"""
    for high_variance in IGNORED:
        matching = {key: dimensions[key] for key in high_variance if key in dimensions}

        if matching == high_variance:
            return True

    return False

def extract_dimensions(emf):
    """Extracts the cloudwatch dimensions from an EMF log message"""
    if not emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"]:
        # Skipped tests emit a duration metric, but have no dimensions set
        return {}

    dimension_list = emf["_aws"]["CloudWatchMetrics"][0]["Dimensions"][0]
    return {key: emf[key] for key in emf if key in dimension_list}

def process_log_entry(emf: dict):
    """Parses the given EMF log entry

    Returns the entry's dimensions and its list-valued properties/metrics, together with their units
    """
    result = {
        key: (value, find_unit(emf, key))
        for key, value in emf.items()
        if (
            "fc_metrics" not in key
            and "cpu_utilization" not in key
            and isinstance(value, list)
        )
    }
    # Since we skip metrics that have fc_metrics or cpu_utilization in their key,
    # result could be empty; in that case, return empty dimensions as well
    if not result:
        return {}, {}

    return extract_dimensions(emf), result

def find_unit(emf: dict, metric: str):
    """Determines the unit of the given metric"""
    metrics = {
        y["Name"]: y["Unit"] for y in emf["_aws"]["CloudWatchMetrics"][0]["Metrics"]
    }
    return metrics.get(metric, "None")

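# Illustrative shape of the dictionary returned by load_data_series below (all
# dimension/metric names and values are made up):
#
#   {
#       frozenset({("performance_test", "test_example"), ("vcpus", "2")}):
#           {"latency": ([1.2, 1.4, 1.3], "Milliseconds")},
#   }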
def load_data_series(report_path: Path, revision: str = None, *, reemit: bool = False):
    """Loads the data series relevant for A/B-testing from test_results/test-report.json
    into a dictionary mapping each message's cloudwatch dimensions to a dictionary of
    its list-valued properties/metrics.

    If `reemit` is True, it also reemits all EMF logs to a local EMF agent,
    overwriting the attached "git_commit_id" field with the given revision."""
    # Dictionary mapping EMF dimensions to A/B-testable metrics/properties
    processed_emf = {}

    report = json.loads(report_path.read_text("UTF-8"))
    for test in report["tests"]:
        for line in test["teardown"]["stdout"].splitlines():
            # Only look at EMF log messages. If we ever have other stdout that starts with braces,
            # we will need to rethink this heuristic.
            if line.startswith("{"):
                emf = json.loads(line)

                if reemit:
                    assert revision is not None

                    emf["git_commit_id"] = revision
                    emit_raw_emf(emf)

                dimensions, result = process_log_entry(emf)

                if not dimensions:
                    continue

                dimension_set = frozenset(dimensions.items())

                if dimension_set not in processed_emf:
                    processed_emf[dimension_set] = result
                else:
                    # If there are many data points for a metric, they will be split across
                    # multiple EMF log messages. We need to reassemble :(
                    assert (
                        processed_emf[dimension_set].keys() == result.keys()
                    ), f"Found incompatible metrics associated with dimension set {dimension_set}: {processed_emf[dimension_set].keys()} in one EMF message, but {result.keys()} in another."

                    for metric, (values, unit) in processed_emf[dimension_set].items():
                        assert result[metric][1] == unit

                        values.extend(result[metric][0])

    return processed_emf

def collect_data(firecracker_binary: Path, jailer_binary: Path, test: str):
    """Executes the specified test using the provided firecracker binaries"""
    # Ensure the binaries are in the same directory. Will always be the case if used with git_ab_test_with_binaries
    assert jailer_binary.parent == firecracker_binary.parent

    binary_dir = firecracker_binary.parent
    revision = binary_dir.name

    print("Collecting samples")
    _, stdout, _ = utils.run_cmd(
        f"AWS_EMF_ENVIRONMENT=local AWS_EMF_NAMESPACE=local ./tools/test.sh --binary-dir=/firecracker/build/{revision} {test} -m ''"
    )
    print(stdout.strip())

    return load_data_series(
        Path("test_results/test-report.json"), revision, reemit=True
    )

def analyze_data(processed_emf_a, processed_emf_b, *, n_resamples: int = 9999):
    """
    Analyzes the A/B-test data produced by `collect_data`, by performing regression tests
    as described in this script's doc-comment.

    Returns a mapping of dimensions and properties/metrics to the result of their regression test.
    """
    assert set(processed_emf_a.keys()) == set(
        processed_emf_b.keys()
    ), "A and B run produced incomparable data. This is a bug in the test!"

    results = {}

    metrics_logger = get_metrics_logger()

    for prop_name, prop_val in global_props.__dict__.items():
        metrics_logger.set_property(prop_name, prop_val)

    for dimension_set in processed_emf_a:
        metrics_a = processed_emf_a[dimension_set]
        metrics_b = processed_emf_b[dimension_set]

        assert set(metrics_a.keys()) == set(
            metrics_b.keys()
        ), "A and B run produced incomparable data. This is a bug in the test!"

        for metric, (values_a, unit) in metrics_a.items():
            print(
                f"Doing A/B-test for dimensions {dimension_set} and property {metric}"
            )
            result = check_regression(
                values_a, metrics_b[metric][0], n_resamples=n_resamples
            )

            metrics_logger.set_dimensions({"metric": metric, **dict(dimension_set)})
            metrics_logger.put_metric("p_value", float(result.pvalue), "None")
            metrics_logger.put_metric("mean_difference", float(result.statistic), unit)
            metrics_logger.set_property("data_a", values_a)
            metrics_logger.set_property("data_b", metrics_b[metric][0])
            metrics_logger.flush()

            results[dimension_set, metric] = (result, unit)

    return results

def ab_performance_test(
    a_revision, b_revision, test, p_thresh, strength_abs_thresh, noise_threshold
):
    """Does an A/B-test of the specified test across the given revisions"""
    _, commit_list, _ = utils.run_cmd(
        f"git --no-pager log --oneline {a_revision}..{b_revision}"
    )
    print(
        f"Performance A/B-test across {a_revision}..{b_revision}. This includes the following commits:"
    )
    print(commit_list.strip())

    processed_emf_a, processed_emf_b, results = git_ab_test_with_binaries(
        lambda firecracker_binary, jailer_binary: collect_data(
            firecracker_binary, jailer_binary, test
        ),
        lambda ah, be: analyze_data(ah, be, n_resamples=int(100 / p_thresh)),
        a_revision=a_revision,
        b_revision=b_revision,
    )

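    # Presumably, n_resamples = 100 / p_thresh is chosen so that the resolution of the
    # resampling-based p-value (roughly 1 / n_resamples) stays two orders of magnitude
    # finer than the significance threshold it is later compared against.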
    # We group our A/B-Testing results by metric here. The resulting lists of values
    # will be approximately normally distributed, and we will use this property as a means of error correction.
    # The idea behind this is that testing the same metric (say, restore_latency) across different scenarios (e.g.
    # different vcpu counts) will be related in some unknown way (meaning most scenarios will show a change in the same
    # direction). In particular, if one scenario yields a slight improvement and the next yields a
    # slight degradation, we take this as evidence towards both being mere noise that cancels out.
    #
    # Empirical evidence for this assumption is that
    # 1. Historically, a true performance change has never shown up in just a single test, it always showed up
    #    across most (if not all) tests for a specific metric.
    # 2. Analyzing data collected from historical runs shows that across different parameterizations of the same
    #    metric, the collected samples approximately follow mean / variance = const, with the constant independent
    #    of the parameterization.
    #
    # Mathematically, this has the following justification: By the central
    # limit theorem, the means of samples are (approximately) normally distributed. Denote by A
    # and B the distributions of the mean of samples from the 'A' and 'B'
    # tests respectively. Under our null hypothesis, the distributions of the
    # 'A' and 'B' samples are identical (although we don't know what the exact
    # distributions are), meaning so are A and B, say A ~ B ~ N(mu, sigma^2).
    # The difference of two normal distributions is also normally distributed,
    # with the means being subtracted and the variances being added.
    # Therefore, A - B ~ N(0, 2sigma^2). If we now normalize this distribution by mu (which
    # corresponds to considering the distribution of relative regressions instead), we get (A-B)/mu ~ N(0, c), with c
    # being determined by the constant from point 2. above. This means that we can combine the relative means across
    # different parameterizations, and get a distribution whose expected
    # value is 0, provided our null hypothesis was true. It is exactly this distribution
    # for which we collect samples in the dictionary below. Therefore, the average of the
    # relative performance changes for a single metric is a good candidate for a sanity
    # check against false-positives.
    #
    # Note that with this approach, for performance changes to "cancel out", we would need essentially a perfect split
    # between scenarios that improve performance and scenarios that degrade performance, something we have not
    # ever observed to actually happen.
    relative_changes_by_metric = defaultdict(list)
    relative_changes_significant = defaultdict(list)

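    # Made-up illustration of the sanity check below: if a metric regresses by 3% in one
    # scenario but improves by 2% in another, the average relative change is 0.5%, which is
    # below the default 5% noise threshold, so both data points are treated as noise.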
    failures = []
    for (dimension_set, metric), (result, unit) in results.items():
        if is_ignored(dict(dimension_set)):
            continue

        values_a = processed_emf_a[dimension_set][metric][0]
        baseline_mean = statistics.mean(values_a)

        relative_changes_by_metric[metric].append(result.statistic / baseline_mean)

        if result.pvalue < p_thresh and abs(result.statistic) > strength_abs_thresh:
            failures.append((dimension_set, metric, result, unit))

            relative_changes_significant[metric].append(
                result.statistic / baseline_mean
            )

    messages = []
    for dimension_set, metric, result, unit in failures:
        # Sanity check as described above
        if abs(statistics.mean(relative_changes_by_metric[metric])) <= noise_threshold:
            continue

        # No data points for this metric were deemed significant
        if metric not in relative_changes_significant:
            continue

        # The significant data points themselves are above the noise threshold
        if abs(statistics.mean(relative_changes_significant[metric])) > noise_threshold:
            old_mean = statistics.mean(processed_emf_a[dimension_set][metric][0])
            new_mean = statistics.mean(processed_emf_b[dimension_set][metric][0])

            msg = (
                f"\033[0;32m[Firecracker A/B-Test Runner]\033[0m A/B-testing shows a change of "
                f"{format_with_reduced_unit(result.statistic, unit)}, or {result.statistic / old_mean:.2%}, "
                f"(from {format_with_reduced_unit(old_mean, unit)} to {format_with_reduced_unit(new_mean, unit)}) "
                f"for metric \033[1m{metric}\033[0m with \033[0;31m\033[1mp={result.pvalue}\033[0m. "
                f"This means that observing a change of this magnitude or worse, assuming that performance "
                f"characteristics did not change across the tested commits, has a probability of {result.pvalue:.2%}. "
                f"Tested Dimensions:\n{json.dumps(dict(dimension_set), indent=2, sort_keys=True)}"
            )
            messages.append(msg)

    assert not messages, "\n" + "\n".join(messages)
    print("No regressions detected!")

def canonicalize_revision(revision):
    """Canonicalizes the given revision to a 40 digit hex SHA"""
    return utils.run_cmd(f"git rev-parse {revision}").stdout.strip()

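# Example invocations (script path, revisions, test and report paths are illustrative):
#   ./tools/ab_test.py run main my-feature-branch --test integration_tests/performance/test_boottime.py
#   ./tools/ab_test.py analyze a/test-report.json b/test-report.json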
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Executes Firecracker's A/B test suite across the specified commits"
    )
    subparsers = parser.add_subparsers(help="commands", dest="command", required=True)
    run_parser = subparsers.add_parser(
        "run",
        help="Run a specific test of our test suite as an A/B-test across two specified commits",
    )
    run_parser.add_argument(
        "a_revision",
        help="The baseline revision compared to which we want to avoid regressing",
    )
    run_parser.add_argument(
        "b_revision",
        help="The revision whose performance we want to compare against the results from a_revision",
    )
    run_parser.add_argument("--test", help="The test to run", required=True)
    analyze_parser = subparsers.add_parser(
        "analyze",
        help="Analyze the results of two manually run tests based on their test-report.json files",
    )
    analyze_parser.add_argument(
        "report_a",
        help="The path to the test-report.json file of the baseline run",
        type=Path,
    )
    analyze_parser.add_argument(
        "report_b",
        help="The path to the test-report.json file of the run whose performance we want to compare against report_a",
        type=Path,
    )
    parser.add_argument(
        "--significance",
        help="The p-value threshold that needs to be crossed for a test result to be considered significant",
        type=float,
        default=0.01,
    )
    parser.add_argument(
        "--absolute-strength",
        help="The minimum absolute delta required before a regression will be considered valid",
        type=float,
        default=0.0,
    )
    parser.add_argument(
        "--noise-threshold",
        help="The minimal delta by which a metric has to regress on average across all tests that emit it before the regressions will be considered valid",
        type=float,
        default=0.05,
    )
    args = parser.parse_args()

    if args.command == "run":
        ab_performance_test(
            # These will show up in Cloudwatch, so canonicalize to long commit SHAs
            canonicalize_revision(args.a_revision),
            canonicalize_revision(args.b_revision),
            args.test,
            args.significance,
            args.absolute_strength,
            args.noise_threshold,
        )
    else:
        data_a = load_data_series(args.report_a)
        data_b = load_data_series(args.report_b)

        analyze_data(data_a, data_b)