Skip to content

Commit

Permalink
chore(metrics): Test for breaking change in Firecracker metrics
Browse files Browse the repository at this point in the history
Add a test that verifies every field of FirecrackerMetrics is present in
the flushed metrics and that every value is a number.

Signed-off-by: Sudan Landge <sudanl@amazon.com>
  • Loading branch information
Sudan Landge authored and wearyzen committed Oct 23, 2023
1 parent 62b48a5 commit 9b0e03a
Showing 1 changed file with 258 additions and 67 deletions.
325 changes: 258 additions & 67 deletions tests/integration_tests/functional/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,43 +6,271 @@
import math
import platform

import jsonschema

# Expected layout of the metrics flushed by Firecracker: every top-level key
# is a component and maps to the list of field names that component must
# report. _validate_metrics() builds a JSON schema from this table in which
# each listed field is required and must be a number.
FirecrackerMetrics = {
    "api_server": [
        "process_startup_time_us",
        "process_startup_time_cpu_us",
        "sync_response_fails",
        "sync_vmm_send_timeout_count",
    ],
    "balloon": [
        "activate_fails",
        "inflate_count",
        "stats_updates_count",
        "stats_update_fails",
        "deflate_count",
        "event_fails",
    ],
    "block": [
        "activate_fails",
        "cfg_fails",
        "no_avail_buffer",
        "event_fails",
        "execute_fails",
        "invalid_reqs_count",
        "flush_count",
        "queue_event_count",
        "rate_limiter_event_count",
        "update_count",
        "update_fails",
        "read_bytes",
        "write_bytes",
        "read_count",
        "write_count",
        "rate_limiter_throttled_events",
        "io_engine_throttled_events",
    ],
    "deprecated_api": [
        "deprecated_http_api_calls",
        "deprecated_cmd_line_api_calls",
    ],
    "get_api_requests": [
        "instance_info_count",
        "machine_cfg_count",
        "mmds_count",
        "vmm_version_count",
    ],
    "i8042": [
        "error_count",
        "missed_read_count",
        "missed_write_count",
        "read_count",
        "reset_count",
        "write_count",
    ],
    "latencies_us": [
        "full_create_snapshot",
        "diff_create_snapshot",
        "load_snapshot",
        "pause_vm",
        "resume_vm",
        "vmm_full_create_snapshot",
        "vmm_diff_create_snapshot",
        "vmm_load_snapshot",
        "vmm_pause_vm",
        "vmm_resume_vm",
    ],
    "logger": [
        "missed_metrics_count",
        "metrics_fails",
        "missed_log_count",
        "log_fails",
    ],
    "mmds": [
        "rx_accepted",
        "rx_accepted_err",
        "rx_accepted_unusual",
        "rx_bad_eth",
        "rx_count",
        "tx_bytes",
        "tx_count",
        "tx_errors",
        "tx_frames",
        "connections_created",
        "connections_destroyed",
    ],
    "net": [
        "activate_fails",
        "cfg_fails",
        "mac_address_updates",
        "no_rx_avail_buffer",
        "no_tx_avail_buffer",
        "event_fails",
        "rx_queue_event_count",
        "rx_event_rate_limiter_count",
        "rx_partial_writes",
        "rx_rate_limiter_throttled",
        "rx_tap_event_count",
        "rx_bytes_count",
        "rx_packets_count",
        "rx_fails",
        "rx_count",
        "tap_read_fails",
        "tap_write_fails",
        "tx_bytes_count",
        "tx_malformed_frames",
        "tx_fails",
        "tx_count",
        "tx_packets_count",
        "tx_partial_reads",
        "tx_queue_event_count",
        "tx_rate_limiter_event_count",
        "tx_rate_limiter_throttled",
        "tx_spoofed_mac_count",
    ],
    "patch_api_requests": [
        "drive_count",
        "drive_fails",
        "network_count",
        "network_fails",
        "machine_cfg_count",
        "machine_cfg_fails",
        "mmds_count",
        "mmds_fails",
    ],
    "put_api_requests": [
        "actions_count",
        "actions_fails",
        "boot_source_count",
        "boot_source_fails",
        "drive_count",
        "drive_fails",
        "logger_count",
        "logger_fails",
        "machine_cfg_count",
        "machine_cfg_fails",
        "cpu_cfg_count",
        "cpu_cfg_fails",
        "metrics_count",
        "metrics_fails",
        "network_count",
        "network_fails",
        "mmds_count",
        "mmds_fails",
        "vsock_count",
        "vsock_fails",
    ],
    "seccomp": [
        "num_faults",
    ],
    "vcpu": [
        "exit_io_in",
        "exit_io_out",
        "exit_mmio_read",
        "exit_mmio_write",
        "failures",
    ],
    "vmm": [
        "device_events",
        "panic_count",
    ],
    "uart": [
        "error_count",
        "flush_count",
        "missed_read_count",
        "missed_write_count",
        "read_count",
        "write_count",
    ],
    "signals": [
        "sigbus",
        "sigsegv",
        "sigxfsz",
        "sigxcpu",
        "sigpipe",
        "sighup",
        "sigill",
    ],
    "vsock": [
        "activate_fails",
        "cfg_fails",
        "rx_queue_event_fails",
        "tx_queue_event_fails",
        "ev_queue_event_fails",
        "muxer_event_fails",
        "conn_event_fails",
        "rx_queue_event_count",
        "tx_queue_event_count",
        "rx_bytes_count",
        "tx_bytes_count",
        "rx_packets_count",
        "tx_packets_count",
        "conns_added",
        "conns_killed",
        "conns_removed",
        "killq_resync",
        "tx_flush_fails",
        "tx_write_fails",
        "rx_read_fails",
    ],
    "entropy": [
        "activate_fails",
        "entropy_event_fails",
        "entropy_event_count",
        "entropy_bytes",
        "host_rng_fails",
        "entropy_rate_limiter_throttled",
        "rate_limiter_event_count",
    ],
}


def _assert_field_is_required(metrics, schema, component, field):
    """
    Make sure `schema` rejects `metrics` once `field` is removed
    from `metrics[component]`.

    The field is removed only for the duration of the check and is
    always put back afterwards, even when the check fails.
    Raises AssertionError if validation succeeds without the field and
    re-raises any ValidationError that is not about the missing field.
    """
    removed_value = metrics[component].pop(field)
    try:
        jsonschema.validate(instance=metrics, schema=schema)
    except jsonschema.exceptions.ValidationError as error:
        # Only the "missing required property" failure is expected here;
        # anything else is a genuine validation problem.
        if error.message.strip() != f"'{field}' is a required property":
            raise
    else:
        # Without this branch a schema that ignores the missing field
        # would let the check pass silently.
        raise AssertionError(
            f"schema validation passed although '{component}.{field}' is missing"
        )
    finally:
        metrics[component][field] = removed_value


def _validate_metrics(metrics):
    """
    This function makes sure that all components
    of FirecrackerMetrics struct are present in `metrics`
    and that every expected field of every component is
    present and is a number.
    It also removes one field at a time and confirms that
    the validation notices it, so that fields and not just
    top level metrics are known to be checked.
    Checks beyond presence and type of a component's fields
    should be implemented in that component's own test.
    """
    if platform.machine() == "aarch64":
        # "rtc" is only expected on aarch64
        FirecrackerMetrics["rtc"] = [
            "error_count",
            "missed_read_count",
            "missed_write_count",
        ]

    # "utc_timestamp_ms" is a plain top level value, every other top level
    # key is a component described by FirecrackerMetrics.
    exp_keys = {"utc_timestamp_ms", *FirecrackerMetrics}
    missing_keys = exp_keys.difference(metrics.keys())
    assert not missing_keys, f"missing top level metrics: {sorted(missing_keys)}"

    firecracker_metrics_schema = {
        "type": "object",
        "properties": {
            metrics_name: {
                "type": "object",
                "required": metrics_fields,
                "properties": {
                    metrics_field: {"type": "number"}
                    for metrics_field in metrics_fields
                },
            }
            for metrics_name, metrics_fields in FirecrackerMetrics.items()
        },
        "required": list(FirecrackerMetrics),
    }

    jsonschema.validate(instance=metrics, schema=firecracker_metrics_schema)

    # remove some metrics and confirm that fields and not just top level metrics
    # are validated.
    _assert_field_is_required(
        metrics, firecracker_metrics_schema, "api_server", "process_startup_time_us"
    )

    if platform.machine() == "aarch64":
        _assert_field_is_required(
            metrics, firecracker_metrics_schema, "rtc", "error_count"
        )

    # current UTC time in milliseconds
    utc_time = datetime.datetime.now(datetime.timezone.utc)
    utc_timestamp_ms = math.floor(utc_time.timestamp() * 1000)
Expand All @@ -60,9 +288,8 @@ class FcDeviceMetrics:
aggregation of metrics
"""

def __init__(self, name, validate_fn, num_dev):
def __init__(self, name, num_dev):
self.dev_name = name
self.validate_dev_metrics = validate_fn
self.num_dev = num_dev

def validate(self, microvm):
Expand All @@ -74,9 +301,6 @@ def validate(self, microvm):
# make sure all items of FirecrackerMetrics are as expected
_validate_metrics(fc_metrics)

# check for breaking change in device specific metrics
self.validate_dev_metrics(fc_metrics[self.dev_name])

# make sure "{self.dev_name}" is the aggregate of "{self.dev_name}_*"
# and that there are only {num_dev} entries of "{self.dev_name}_*"
self.validate_aggregation(fc_metrics)
Expand Down Expand Up @@ -115,39 +339,6 @@ def test_flush_metrics(test_microvm_with_api):
_validate_metrics(metrics)


def _validate_net_metrics(net_metrics):
    """
    Make sure `net_metrics` has exactly the expected NetDeviceMetrics
    fields, no more and no less.

    Raises AssertionError naming the missing and the unexpected fields
    when the keys differ.
    """
    exp_keys = {
        "activate_fails",
        "cfg_fails",
        "mac_address_updates",
        "no_rx_avail_buffer",
        "no_tx_avail_buffer",
        "event_fails",
        "rx_queue_event_count",
        "rx_event_rate_limiter_count",
        "rx_partial_writes",
        "rx_rate_limiter_throttled",
        "rx_tap_event_count",
        "rx_bytes_count",
        "rx_packets_count",
        "rx_fails",
        "rx_count",
        "tap_read_fails",
        "tap_write_fails",
        "tx_bytes_count",
        "tx_malformed_frames",
        "tx_fails",
        "tx_count",
        "tx_packets_count",
        "tx_partial_reads",
        "tx_queue_event_count",
        "tx_rate_limiter_event_count",
        "tx_rate_limiter_throttled",
        "tx_spoofed_mac_count",
    }
    actual_keys = set(net_metrics.keys())
    missing = sorted(exp_keys - actual_keys)
    unexpected = sorted(actual_keys - exp_keys)
    # Spell out both directions of the mismatch so a breaking change is
    # identifiable from the failure message alone.
    assert not missing and not unexpected, (
        f"net metrics mismatch: missing={missing}, unexpected={unexpected}"
    )


def test_net_metrics(test_microvm_with_api):
"""
Validate that NetDeviceMetrics doesn't have a breaking change
Expand All @@ -162,7 +353,7 @@ def test_net_metrics(test_microvm_with_api):
# randomly selected 10 as the number of net devices to test
num_net_devices = 10

net_metrics = FcDeviceMetrics("net", _validate_net_metrics, num_net_devices)
net_metrics = FcDeviceMetrics("net", num_net_devices)

# create more than 1 net device to test aggregation
for _ in range(num_net_devices):
Expand Down

0 comments on commit 9b0e03a

Please sign in to comment.