diff --git a/tests/integration_tests/functional/test_metrics.py b/tests/integration_tests/functional/test_metrics.py index 27f2bae7c72..2713b7e8606 100644 --- a/tests/integration_tests/functional/test_metrics.py +++ b/tests/integration_tests/functional/test_metrics.py @@ -6,43 +6,271 @@ import math import platform +import jsonschema + +FirecrackerMetrics = { + "api_server": [ + "process_startup_time_us", + "process_startup_time_cpu_us", + "sync_response_fails", + "sync_vmm_send_timeout_count", + ], + "balloon": [ + "activate_fails", + "inflate_count", + "stats_updates_count", + "stats_update_fails", + "deflate_count", + "event_fails", + ], + "block": [ + "activate_fails", + "cfg_fails", + "no_avail_buffer", + "event_fails", + "execute_fails", + "invalid_reqs_count", + "flush_count", + "queue_event_count", + "rate_limiter_event_count", + "update_count", + "update_fails", + "read_bytes", + "write_bytes", + "read_count", + "write_count", + "rate_limiter_throttled_events", + "io_engine_throttled_events", + ], + "deprecated_api": [ + "deprecated_http_api_calls", + "deprecated_cmd_line_api_calls", + ], + "get_api_requests": [ + "instance_info_count", + "machine_cfg_count", + "mmds_count", + "vmm_version_count", + ], + "i8042": [ + "error_count", + "missed_read_count", + "missed_write_count", + "read_count", + "reset_count", + "write_count", + ], + "latencies_us": [ + "full_create_snapshot", + "diff_create_snapshot", + "load_snapshot", + "pause_vm", + "resume_vm", + "vmm_full_create_snapshot", + "vmm_diff_create_snapshot", + "vmm_load_snapshot", + "vmm_pause_vm", + "vmm_resume_vm", + ], + "logger": [ + "missed_metrics_count", + "metrics_fails", + "missed_log_count", + "log_fails", + ], + "mmds": [ + "rx_accepted", + "rx_accepted_err", + "rx_accepted_unusual", + "rx_bad_eth", + "rx_count", + "tx_bytes", + "tx_count", + "tx_errors", + "tx_frames", + "connections_created", + "connections_destroyed", + ], + "net": [ + "activate_fails", + "cfg_fails", + "mac_address_updates", + "no_rx_avail_buffer", + "no_tx_avail_buffer", + "event_fails", + "rx_queue_event_count", + "rx_event_rate_limiter_count", + "rx_partial_writes", + "rx_rate_limiter_throttled", + "rx_tap_event_count", + "rx_bytes_count", + "rx_packets_count", + "rx_fails", + "rx_count", + "tap_read_fails", + "tap_write_fails", + "tx_bytes_count", + "tx_malformed_frames", + "tx_fails", + "tx_count", + "tx_packets_count", + "tx_partial_reads", + "tx_queue_event_count", + "tx_rate_limiter_event_count", + "tx_rate_limiter_throttled", + "tx_spoofed_mac_count", + ], + "patch_api_requests": [ + "drive_count", + "drive_fails", + "network_count", + "network_fails", + "machine_cfg_count", + "machine_cfg_fails", + "mmds_count", + "mmds_fails", + ], + "put_api_requests": [ + "actions_count", + "actions_fails", + "boot_source_count", + "boot_source_fails", + "drive_count", + "drive_fails", + "logger_count", + "logger_fails", + "machine_cfg_count", + "machine_cfg_fails", + "cpu_cfg_count", + "cpu_cfg_fails", + "metrics_count", + "metrics_fails", + "network_count", + "network_fails", + "mmds_count", + "mmds_fails", + "vsock_count", + "vsock_fails", + ], + "seccomp": [ + "num_faults", + ], + "vcpu": [ + "exit_io_in", + "exit_io_out", + "exit_mmio_read", + "exit_mmio_write", + "failures", + ], + "vmm": [ + "device_events", + "panic_count", + ], + "uart": [ + "error_count", + "flush_count", + "missed_read_count", + "missed_write_count", + "read_count", + "write_count", + ], + "signals": [ + "sigbus", + "sigsegv", + "sigxfsz", + "sigxcpu", + "sigpipe", + "sighup", + "sigill", + ], + "vsock": [ + "activate_fails", + "cfg_fails", + "rx_queue_event_fails", + "tx_queue_event_fails", + "ev_queue_event_fails", + "muxer_event_fails", + "conn_event_fails", + "rx_queue_event_count", + "tx_queue_event_count", + "rx_bytes_count", + "tx_bytes_count", + "rx_packets_count", + "tx_packets_count", + "conns_added", + "conns_killed", + "conns_removed", + "killq_resync", + "tx_flush_fails", + "tx_write_fails", + "rx_read_fails", + ], + "entropy": [ + "activate_fails", + "entropy_event_fails", + "entropy_event_count", + "entropy_bytes", + "host_rng_fails", + "entropy_rate_limiter_throttled", + "rate_limiter_event_count", + ], +} + def _validate_metrics(metrics): """ This functions makes sure that all components of FirecrackerMetrics struct are present. - In depth validation of metrics for each component - should be implemented in its own test. - e.g. validation of NetDeviceMetrics should implement - _validate_net_metrics() to check for breaking change etc. """ - exp_keys = [ - "utc_timestamp_ms", - "api_server", - "balloon", - "block", - "deprecated_api", - "get_api_requests", - "i8042", - "latencies_us", - "logger", - "mmds", - "net", - "patch_api_requests", - "put_api_requests", - "seccomp", - "vcpu", - "vmm", - "uart", - "signals", - "vsock", - "entropy", - ] if platform.machine() == "aarch64": - exp_keys.append("rtc") + FirecrackerMetrics["rtc"] = [ + "error_count", + "missed_read_count", + "missed_write_count", + ] - assert set(exp_keys).issubset(metrics.keys()) + firecracker_metrics_schema = { + "type": "object", + "properties": {}, + "required": [], + } + + for metrics_name, metrics_fields in FirecrackerMetrics.items(): + metrics_schema = { + "type": "object", + "required": metrics_fields, + "properties": {}, + } + for metrics_field in metrics_fields: + metrics_schema["properties"][metrics_field] = {"type": "number"} + firecracker_metrics_schema["properties"][metrics_name] = metrics_schema + firecracker_metrics_schema["required"].append(metrics_name) + + jsonschema.validate(instance=metrics, schema=firecracker_metrics_schema) + + # remove some metrics and confirm that fields and not just top level metrics + # are validated. + temp_pop_metrics = metrics["api_server"].pop("process_startup_time_us") + try: + jsonschema.validate(instance=metrics, schema=firecracker_metrics_schema) + except jsonschema.exceptions.ValidationError as error: + if error.message.strip() == "'process_startup_time_us' is a required property": + pass + else: + raise error + metrics["api_server"]["process_startup_time_us"] = temp_pop_metrics + + if platform.machine() == "aarch64": + temp_pop_metrics = metrics["rtc"].pop("error_count") + try: + jsonschema.validate(instance=metrics, schema=firecracker_metrics_schema) + except jsonschema.exceptions.ValidationError as error: + if error.message.strip() == "'error_count' is a required property": + pass + else: + raise error + metrics["rtc"]["error_count"] = temp_pop_metrics utc_time = datetime.datetime.now(datetime.timezone.utc) utc_timestamp_ms = math.floor(utc_time.timestamp() * 1000) @@ -60,9 +288,8 @@ class FcDeviceMetrics: aggregation of metrics """ - def __init__(self, name, validate_fn, num_dev): + def __init__(self, name, num_dev): self.dev_name = name - self.validate_dev_metrics = validate_fn self.num_dev = num_dev def validate(self, microvm): @@ -74,9 +301,6 @@ def validate(self, microvm): # make sure all items of FirecrackerMetrics are as expected _validate_metrics(fc_metrics) - # check for breaking change in device specific metrics - self.validate_dev_metrics(fc_metrics[self.dev_name]) - # make sure "{self.name}" is aggregate of "{self.name}_*" # and that there are only {num_dev} entries of "{self.name}_*" self.validate_aggregation(fc_metrics) @@ -115,39 +339,6 @@ def test_flush_metrics(test_microvm_with_api): _validate_metrics(metrics) -def _validate_net_metrics(net_metrics): - exp_keys = [ - "activate_fails", - "cfg_fails", - "mac_address_updates", - "no_rx_avail_buffer", - "no_tx_avail_buffer", - "event_fails", - "rx_queue_event_count", - "rx_event_rate_limiter_count", - "rx_partial_writes", - "rx_rate_limiter_throttled", - "rx_tap_event_count", - "rx_bytes_count", - "rx_packets_count", - "rx_fails", - "rx_count", - "tap_read_fails", - "tap_write_fails", - "tx_bytes_count", - "tx_malformed_frames", - "tx_fails", - "tx_count", - "tx_packets_count", - "tx_partial_reads", - "tx_queue_event_count", - "tx_rate_limiter_event_count", - "tx_rate_limiter_throttled", - "tx_spoofed_mac_count", - ] - assert set(net_metrics.keys()) == set(exp_keys) - - def test_net_metrics(test_microvm_with_api): """ Validate that NetDeviceMetrics doesn't have a breaking change @@ -162,7 +353,7 @@ def test_net_metrics(test_microvm_with_api): # randomly selected 10 as the number of net devices to test num_net_devices = 10 - net_metrics = FcDeviceMetrics("net", _validate_net_metrics, num_net_devices) + net_metrics = FcDeviceMetrics("net", num_net_devices) # create more than 1 net devices to test aggregation for _ in range(num_net_devices):