diff --git a/tests_e2e/orchestrator/runbook.yml b/tests_e2e/orchestrator/runbook.yml index 080d7f2c2f..5b18c83f14 100644 --- a/tests_e2e/orchestrator/runbook.yml +++ b/tests_e2e/orchestrator/runbook.yml @@ -51,7 +51,7 @@ variable: # # The test suites to execute - name: test_suites - value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_update, agent_status, multi_config_ext" + value: "agent_bvt, no_outbound_connections, extensions_disabled, agent_not_provisioned, fips, agent_ext_workflow, agent_update, agent_status, multi_config_ext, agent_cgroups, ext_cgroups" - name: cloud value: "AzureCloud" is_case_visible: true diff --git a/tests_e2e/test_suites/agent_cgroups.yml b/tests_e2e/test_suites/agent_cgroups.yml new file mode 100644 index 0000000000..239f37e32a --- /dev/null +++ b/tests_e2e/test_suites/agent_cgroups.yml @@ -0,0 +1,7 @@ +# +# The test suite verify the agent running in expected cgroups and also, checks agent tracking the cgroups for polling resource metrics. +# +name: "AgentCgroups" +tests: + - "agent_cgroups/agent_cgroups.py" +images: "cgroups-endorsed" \ No newline at end of file diff --git a/tests_e2e/test_suites/ext_cgroups.yml b/tests_e2e/test_suites/ext_cgroups.yml new file mode 100644 index 0000000000..5b3e017f52 --- /dev/null +++ b/tests_e2e/test_suites/ext_cgroups.yml @@ -0,0 +1,10 @@ +# +# The test suite installs the few extensions and +# verify those extensions are running in expected cgroups and also, checks agent tracking those cgroups for polling resource metrics. +# +name: "ExtCgroups" +tests: + - "ext_cgroups/ext_cgroups.py" +images: "cgroups-endorsed" +# The DCR test extension installs sample service, so this test suite uses it to test services cgroups but this is only published in southcentralus region in public cloud. 
+locations: "AzureCloud:southcentralus" \ No newline at end of file diff --git a/tests_e2e/test_suites/images.yml b/tests_e2e/test_suites/images.yml index 5440486c25..433d0733d9 100644 --- a/tests_e2e/test_suites/images.yml +++ b/tests_e2e/test_suites/images.yml @@ -37,6 +37,12 @@ image-sets: - "rhel_90_arm64" - "ubuntu_2204_arm64" + # As of today agent only support and enabled resource governance feature on following distros + cgroups-endorsed: + - "ubuntu_1604" + - "ubuntu_1804" + - "ubuntu_2004" + # # An image can be specified by a string giving its urn, as in # diff --git a/tests_e2e/tests/agent_cgroups/agent_cgroups.py b/tests_e2e/tests/agent_cgroups/agent_cgroups.py new file mode 100644 index 0000000000..d976c0338e --- /dev/null +++ b/tests_e2e/tests/agent_cgroups/agent_cgroups.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from tests_e2e.tests.lib.agent_test import AgentTest +from tests_e2e.tests.lib.agent_test_context import AgentTestContext +from tests_e2e.tests.lib.logging import log + + +class AgentCgroups(AgentTest): + """ + This test verifies that the agent is running in the expected cgroups. 
+ """ + + def __init__(self, context: AgentTestContext): + super().__init__(context) + self._ssh_client = self._context.create_ssh_client() + + def run(self): + log.info("=====Validating agent cgroups=====") + self._run_remote_test("agent_cgroups-check_cgroups_agent.py") + log.info("Successfully Verified that agent present in correct cgroups") + + +if __name__ == "__main__": + AgentCgroups.run_from_command_line() diff --git a/tests_e2e/tests/ext_cgroups/ext_cgroups.py b/tests_e2e/tests/ext_cgroups/ext_cgroups.py new file mode 100644 index 0000000000..33092ca41e --- /dev/null +++ b/tests_e2e/tests/ext_cgroups/ext_cgroups.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from tests_e2e.tests.ext_cgroups.install_extensions import InstallExtensions +from tests_e2e.tests.lib.agent_test import AgentTest +from tests_e2e.tests.lib.agent_test_context import AgentTestContext +from tests_e2e.tests.lib.logging import log + + +class ExtCgroups(AgentTest): + """ + This test verifies the installed extensions assigned correctly in their cgroups. 
+ """ + + def __init__(self, context: AgentTestContext): + super().__init__(context) + self._ssh_client = self._context.create_ssh_client() + + def run(self): + log.info("=====Installing extensions to validate ext cgroups scenario") + InstallExtensions(self._context).run() + log.info("=====Executing remote script check_cgroups_extensions.py to validate extension cgroups") + self._run_remote_test("ext_cgroups-check_cgroups_extensions.py", use_sudo=True) + log.info("Successfully verified that extensions present in correct cgroup") + + +if __name__ == "__main__": + ExtCgroups.run_from_command_line() diff --git a/tests_e2e/tests/ext_cgroups/install_extensions.py b/tests_e2e/tests/ext_cgroups/install_extensions.py new file mode 100644 index 0000000000..6617730ed0 --- /dev/null +++ b/tests_e2e/tests/ext_cgroups/install_extensions.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from datetime import datetime, timedelta +from pathlib import Path + +from tests_e2e.tests.lib.agent_test_context import AgentTestContext +from tests_e2e.tests.lib.identifiers import VmExtensionIds +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.virtual_machine_extension_client import VirtualMachineExtensionClient + + +class InstallExtensions: + """ + This test installs the multiple extensions in order to verify extensions cgroups in the next test. 
+ """ + + def __init__(self, context: AgentTestContext): + self._context = context + self._ssh_client = self._context.create_ssh_client() + + def run(self): + self._prepare_agent() + # Install the GATest extension to test service cgroups + self._install_gatest_extension() + # Install the Azure Monitor Agent to test long running process cgroup + self._install_ama() + # Install the VM Access extension to test sample extension + self._install_vmaccess() + # Install the CSE extension to test extension cgroup + self._install_cse() + + def _prepare_agent(self): + log.info("=====Executing update-waagent-conf remote script to update monitoring deadline flag for tracking azuremonitoragent service") + future_date = datetime.utcnow() + timedelta(days=2) + expiry_time = future_date.date().strftime("%Y-%m-%d") + # Agent needs extension info and its services info in the handlermanifest.xml to monitor and limit the resource usage. + # As part of pilot testing, agent hardcoded azuremonitoragent service name to monitor it for sometime in production without need of manifest update from extension side. + # So that they can get sense of resource usage for their extensions. This we did for few months and now we no longer monitoring it in production. + # But we are changing the config flag expiry time to future date in this test. So that test agent will start track the cgroups that is used by the service. 
+ result = self._ssh_client.run_command(f"update-waagent-conf Debug.CgroupMonitorExpiryTime={expiry_time}", use_sudo=True) + log.info(result) + log.info("Updated agent cgroups config(CgroupMonitorExpiryTime)") + + def _install_ama(self): + ama_extension = VirtualMachineExtensionClient( + self._context.vm, VmExtensionIds.AzureMonitorLinuxAgent, + resource_name="AMAAgent") + log.info("Installing %s", ama_extension) + ama_extension.enable() + ama_extension.assert_instance_view() + + def _install_vmaccess(self): + # fetch the public key + public_key_file: Path = Path(self._context.private_key_file).with_suffix(".pub") + with public_key_file.open() as f: + public_key = f.read() + # Invoke the extension + vm_access = VirtualMachineExtensionClient(self._context.vm, VmExtensionIds.VmAccess, resource_name="VmAccess") + log.info("Installing %s", vm_access) + vm_access.enable( + protected_settings={ + 'username': self._context.username, + 'ssh_key': public_key, + 'reset_ssh': 'false' + } + ) + vm_access.assert_instance_view() + + def _install_gatest_extension(self): + gatest_extension = VirtualMachineExtensionClient( + self._context.vm, VmExtensionIds.GATestExtension, + resource_name="GATestExt") + log.info("Installing %s", gatest_extension) + gatest_extension.enable() + gatest_extension.assert_instance_view() + + + def _install_cse(self): + # Use custom script to output the cgroups assigned to it at runtime and save to /var/lib/waagent/tmp/custom_script_check. 
+ script_contents = """ +mkdir /var/lib/waagent/tmp +cp /proc/$$/cgroup /var/lib/waagent/tmp/custom_script_check +""" + custom_script_2_0 = VirtualMachineExtensionClient( + self._context.vm, + VmExtensionIds.CustomScript, + resource_name="CustomScript") + + log.info("Installing %s", custom_script_2_0) + custom_script_2_0.enable( + protected_settings={ + 'commandToExecute': f"echo \'{script_contents}\' | bash" + } + ) + custom_script_2_0.assert_instance_view() + diff --git a/tests_e2e/tests/lib/cgroup_helpers.py b/tests_e2e/tests/lib/cgroup_helpers.py new file mode 100644 index 0000000000..7eb3a9b1f7 --- /dev/null +++ b/tests_e2e/tests/lib/cgroup_helpers.py @@ -0,0 +1,149 @@ +import os +import re + +from assertpy import assert_that, fail + +from azurelinuxagent.common.osutil import systemd +from azurelinuxagent.common.utils import shellutil +from azurelinuxagent.common.version import DISTRO_NAME, DISTRO_VERSION +from tests_e2e.tests.lib.agent_log import AgentLog +from tests_e2e.tests.lib.logging import log + +BASE_CGROUP = '/sys/fs/cgroup' +AGENT_CGROUP_NAME = 'WALinuxAgent' +AGENT_SERVICE_NAME = systemd.get_agent_unit_name() +AGENT_CONTROLLERS = ['cpu', 'memory'] +EXT_CONTROLLERS = ['cpu', 'memory'] + +CGROUP_TRACKED_PATTERN = re.compile(r'Started tracking cgroup ([^\s]+)\s+\[(?P<path>[^\s]+)\]') + +GATESTEXT_FULL_NAME = "Microsoft.Azure.Extensions.Edp.GATestExtGo" +GATESTEXT_SERVICE = "gatestext.service" +AZUREMONITOREXT_FULL_NAME = "Microsoft.Azure.Monitor.AzureMonitorLinuxAgent" +AZUREMONITORAGENT_SERVICE = "azuremonitoragent.service" +MDSD_SERVICE = "mdsd.service" + + +def verify_if_distro_supports_cgroup(): + """ + checks if agent is running in a distro that supports cgroups + """ + log.info("===== Checking if distro supports cgroups") + + base_cgroup_fs_exists = os.path.exists(BASE_CGROUP) + + assert_that(base_cgroup_fs_exists).is_true().described_as("Cgroup file system:{0} not found in Distro {1}-{2}".format(BASE_CGROUP, DISTRO_NAME, DISTRO_VERSION)) + + 
log.info('Distro %s-%s supports cgroups\n', DISTRO_NAME, DISTRO_VERSION) + + +def print_cgroups(): + """ + log the mounted cgroups information + """ + log.info("====== Currently mounted cgroups ======") + for m in shellutil.run_command(['mount']).splitlines(): + # output is similar to + # mount + # sysfs on /sys type sysfs (rw,nosuid,nodev,noexec,relatime,seclabel) + # proc on /proc type proc (rw,nosuid,nodev,noexec,relatime) + # devtmpfs on /dev type devtmpfs (rw,nosuid,seclabel,size=1842988k,nr_inodes=460747,mode=755) + # cgroup on /sys/fs/cgroup/systemd type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd) + # cgroup on /sys/fs/cgroup/pids type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,pids) + # cgroup on /sys/fs/cgroup/memory type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,memory) + # cgroup on /sys/fs/cgroup/blkio type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,blkio) + # cgroup on /sys/fs/cgroup/hugetlb type cgroup (rw,nosuid,nodev,noexec,relatime,seclabel,hugetlb) + if 'type cgroup' in m: + log.info('\t%s', m) + + +def print_service_status(): + log.info("====== Agent Service status ======") + output = shellutil.run_command(["systemctl", "status", systemd.get_agent_unit_name()]) + for line in output.splitlines(): + log.info("\t%s", line) + + +def get_agent_cgroup_mount_path(): + return os.path.join('/', 'azure.slice', AGENT_SERVICE_NAME) + + +def get_extension_cgroup_mount_path(extension_name): + return os.path.join('/', 'azure.slice/azure-vmextensions.slice', + "azure-vmextensions-" + extension_name + ".slice") + + +def get_unit_cgroup_mount_path(unit_name): + """ + Returns the cgroup mount path for the given unit + """ + output = shellutil.run_command(["systemctl", "show", unit_name, "--property", "ControlGroup"]) + # Output is similar to + # systemctl show walinuxagent.service --property ControlGroup + # ControlGroup=/azure.slice/walinuxagent.service + # matches 
above output and extract right side value + match = re.match("[^=]+=(?P<value>.+)", output) + if match is not None: + return match.group('value') + return None + + +def verify_agent_cgroup_assigned_correctly(): + """ + This method checks agent is running and assigned to the correct cgroup using service status output + """ + log.info("===== Verifying the daemon and the agent are assigned to the same correct cgroup using systemd") + service_status = shellutil.run_command(["systemctl", "status", systemd.get_agent_unit_name()]) + log.info("Agent service status output:\n%s", service_status) + is_active = False + is_cgroup_assigned = False + cgroup_mount_path = get_agent_cgroup_mount_path() + is_active_pattern = re.compile(r".*Active:\s+active.*") + + for line in service_status.splitlines(): + if re.match(is_active_pattern, line): + is_active = True + elif cgroup_mount_path in line: + is_cgroup_assigned = True + + if not is_active: + fail('walinuxagent service was not active/running. Service status:{0}'.format(service_status)) + if not is_cgroup_assigned: + fail('walinuxagent service was not assigned to the expected cgroup:{0}'.format(cgroup_mount_path)) + + log.info("Successfully verified the agent cgroup assigned correctly by systemd\n") + + +def get_agent_cpu_quota(): + """ + Returns the cpu quota for the agent service + """ + output = shellutil.run_command(["systemctl", "show", AGENT_SERVICE_NAME, "--property", "CPUQuotaPerSecUSec"]) + # Output is similar to + # systemctl show walinuxagent --property CPUQuotaPerSecUSec + # CPUQuotaPerSecUSec=infinity + match = re.match("[^=]+=(?P<value>.+)", output) + if match is not None: + return match.group('value') + return None + + +def check_agent_quota_disabled(): + """ + Returns True if the cpu quota is infinity + """ + cpu_quota = get_agent_cpu_quota() + return cpu_quota == 'infinity' + + +def check_cgroup_disabled_with_unknown_process(): + """ + Returns True if the cgroup is disabled with unknown process + """ + for record in 
AgentLog().read(): + match = re.search("Disabling resource usage monitoring. Reason: Check on cgroups failed:.+UNKNOWN", + record.message, flags=re.DOTALL) + if match is not None: + log.info("Found message:\n\t%s", record.text.replace("\n", "\n\t")) + return True + return False diff --git a/tests_e2e/tests/lib/identifiers.py b/tests_e2e/tests/lib/identifiers.py index 149d89ce3b..7bb067a835 100644 --- a/tests_e2e/tests/lib/identifiers.py +++ b/tests_e2e/tests/lib/identifiers.py @@ -63,3 +63,5 @@ class VmExtensionIds(object): RunCommandHandler: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.CPlat.Core', ext_type='RunCommandHandlerLinux', version="1.0") VmAccess: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.OSTCExtensions', ext_type='VMAccessForLinux', version="1.0") GuestAgentDcrTestExtension: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.Azure.TestExtensions.Edp', ext_type='GuestAgentDcrTest', version='1.0') + AzureMonitorLinuxAgent: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.Azure.Monitor', ext_type='AzureMonitorLinuxAgent', version="1.5") + GATestExtension: VmExtensionIdentifier = VmExtensionIdentifier(publisher='Microsoft.Azure.Extensions.Edp', ext_type='GATestExtGo', version="1.2") diff --git a/tests_e2e/tests/lib/remote_test.py b/tests_e2e/tests/lib/remote_test.py index c5bf979f01..ad71ae69b3 100644 --- a/tests_e2e/tests/lib/remote_test.py +++ b/tests_e2e/tests/lib/remote_test.py @@ -27,7 +27,7 @@ ERROR_EXIT_CODE = 200 -def run_remote_test(test_method: Callable[[], int]) -> None: +def run_remote_test(test_method: Callable[[], None]) -> None: """ Helper function to run a remote test; implements coding conventions for remote tests, e.g. error message goes to stderr, test log goes to stdout, etc. 
diff --git a/tests_e2e/tests/scripts/agent_cgroups-check_cgroups_agent.py b/tests_e2e/tests/scripts/agent_cgroups-check_cgroups_agent.py new file mode 100755 index 0000000000..2f3b877a0b --- /dev/null +++ b/tests_e2e/tests/scripts/agent_cgroups-check_cgroups_agent.py @@ -0,0 +1,115 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import os +import re + +from assertpy import fail + +from tests_e2e.tests.lib.agent_log import AgentLog +from tests_e2e.tests.lib.cgroup_helpers import BASE_CGROUP, AGENT_CONTROLLERS, get_agent_cgroup_mount_path, \ + AGENT_SERVICE_NAME, verify_if_distro_supports_cgroup, print_cgroups, \ + verify_agent_cgroup_assigned_correctly +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.remote_test import run_remote_test + + +def verify_if_cgroup_controllers_are_mounted(): + """ + Checks if controllers CPU, Memory that agent use are mounted in the system + """ + log.info("===== Verifying cgroup controllers that agent use are mounted in the system") + + all_controllers_present = os.path.exists(BASE_CGROUP) + missing_controllers = [] + mounted_controllers = [] + + for controller in AGENT_CONTROLLERS: + controller_path = os.path.join(BASE_CGROUP, controller) + if not os.path.exists(controller_path): + all_controllers_present = False + missing_controllers.append(controller_path) + else: + mounted_controllers.append(controller_path) + + 
if not all_controllers_present: + fail('Not all of the controllers {0} mounted in expected cgroups. Mounted controllers are: {1}.\n ' + 'Missing controllers are: {2} \n System mounted cgroups are:\n{3}'.format(AGENT_CONTROLLERS, mounted_controllers, missing_controllers, print_cgroups())) + + log.info('Verified all cgroup controllers are present.\n {0}'.format(mounted_controllers)) + + +def verify_agent_cgroup_created_on_file_system(): + """ + Checks agent service is running in azure.slice/{agent_service) cgroup and mounted in same system cgroup controllers mounted path + """ + log.info("===== Verifying the agent cgroup paths exist on file system") + agent_cgroup_mount_path = get_agent_cgroup_mount_path() + all_agent_cgroup_controllers_path_exist = True + missing_agent_cgroup_controllers_path = [] + verified_agent_cgroup_controllers_path = [] + + log.info("expected agent cgroup mount path: %s", agent_cgroup_mount_path) + + for controller in AGENT_CONTROLLERS: + agent_controller_path = os.path.join(BASE_CGROUP, controller, agent_cgroup_mount_path[1:]) + + if not os.path.exists(agent_controller_path): + all_agent_cgroup_controllers_path_exist = False + missing_agent_cgroup_controllers_path.append(agent_controller_path) + else: + verified_agent_cgroup_controllers_path.append(agent_controller_path) + + if not all_agent_cgroup_controllers_path_exist: + fail("Agent's cgroup paths couldn't be found on file system. Missing agent cgroups path :{0}.\n Verified agent cgroups path:{1}".format(missing_agent_cgroup_controllers_path, verified_agent_cgroup_controllers_path)) + + log.info('Verified all agent cgroup paths are present.\n {0}'.format(verified_agent_cgroup_controllers_path)) + + +def verify_agent_cgroups_tracked(): + """ + Checks if agent is tracking agent cgroups path for polling resource usage. 
This is verified by checking the agent log for the message "Started tracking cgroup" + """ + log.info("===== Verifying agent started tracking cgroups from the log") + + tracking_agent_cgroup_message_re = r'Started tracking cgroup [^\s]+\s+\[(?P<path>[^\s]+)\]' + tracked_cgroups = [] + + for record in AgentLog().read(): + match = re.search(tracking_agent_cgroup_message_re, record.message) + if match is not None: + tracked_cgroups.append(match.group('path')) + + for controller in AGENT_CONTROLLERS: + if not any(AGENT_SERVICE_NAME in cgroup_path and controller in cgroup_path for cgroup_path in tracked_cgroups): + fail('Agent {0} is not being tracked. Tracked cgroups:{1}'.format(controller, tracked_cgroups)) + + log.info("Agent is tracking cgroups correctly.\n%s", tracked_cgroups) + + +def main(): + verify_if_distro_supports_cgroup() + + verify_if_cgroup_controllers_are_mounted() + verify_agent_cgroup_created_on_file_system() + + verify_agent_cgroup_assigned_correctly() + verify_agent_cgroups_tracked() + + +run_remote_test(main) diff --git a/tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py b/tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py new file mode 100755 index 0000000000..48bd3f902e --- /dev/null +++ b/tests_e2e/tests/scripts/ext_cgroups-check_cgroups_extensions.py @@ -0,0 +1,224 @@ +#!/usr/bin/env pypy3 + +# Microsoft Azure Linux Agent +# +# Copyright 2018 Microsoft Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import os + +from assertpy import fail + +from tests_e2e.tests.lib.agent_log import AgentLog +from tests_e2e.tests.lib.cgroup_helpers import verify_if_distro_supports_cgroup, \ + verify_agent_cgroup_assigned_correctly, BASE_CGROUP, EXT_CONTROLLERS, get_unit_cgroup_mount_path, \ + GATESTEXT_SERVICE, AZUREMONITORAGENT_SERVICE, MDSD_SERVICE, check_agent_quota_disabled, \ + check_cgroup_disabled_with_unknown_process, CGROUP_TRACKED_PATTERN, AZUREMONITOREXT_FULL_NAME, GATESTEXT_FULL_NAME, \ + print_cgroups +from tests_e2e.tests.lib.logging import log +from tests_e2e.tests.lib.remote_test import run_remote_test + + +def verify_custom_script_cgroup_assigned_correctly(): + """ + This method verifies that the CSE script is created expected folder after install and also checks if CSE ran under the expected cgroups + """ + log.info("===== Verifying custom script was assigned to the correct cgroups") + + # CSE creates this folder to save the output of cgroup information where the CSE script was executed. Since CSE process exits after execution, + # and cgroup paths gets cleaned up by the system, so this information saved at run time when the extension executed. 
+ check_temporary_folder_exists() + + cpu_mounted = False + memory_mounted = False + + log.info("custom script cgroup mounts:") + + with open('/var/lib/waagent/tmp/custom_script_check') as fh: + controllers = fh.read() + log.info("%s", controllers) + + extension_path = "/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.Azure.Extensions.CustomScript" + + correct_cpu_mount_v1 = "cpu,cpuacct:{0}".format(extension_path) + correct_cpu_mount_v2 = "cpuacct,cpu:{0}".format(extension_path) + + correct_memory_mount = "memory:{0}".format(extension_path) + + for mounted_controller in controllers.split("\n"): + if correct_cpu_mount_v1 in mounted_controller or correct_cpu_mount_v2 in mounted_controller: + log.info('Custom script extension mounted under correct cgroup ' + 'for CPU: %s', mounted_controller) + cpu_mounted = True + elif correct_memory_mount in mounted_controller: + log.info('Custom script extension mounted under correct cgroup ' + 'for Memory: %s', mounted_controller) + memory_mounted = True + + if not cpu_mounted: + fail('Custom script not mounted correctly for CPU! Expected {0} or {1}'.format(correct_cpu_mount_v1, correct_cpu_mount_v2)) + + if not memory_mounted: + fail('Custom script not mounted correctly for Memory! 
Expected {0}'.format(correct_memory_mount)) + + +def check_temporary_folder_exists(): + tmp_folder = "/var/lib/waagent/tmp" + if not os.path.exists(tmp_folder): + fail("Temporary folder {0} was not created which means CSE script did not run!".format(tmp_folder)) + + +def verify_ext_cgroup_controllers_created_on_file_system(): + """ + This method ensure that extension cgroup controllers are created on file system after extension install + """ + log.info("===== Verifying ext cgroup controllers exist on file system") + + all_controllers_present = os.path.exists(BASE_CGROUP) + missing_controllers_path = [] + verified_controllers_path = [] + + for controller in EXT_CONTROLLERS: + controller_path = os.path.join(BASE_CGROUP, controller) + if not os.path.exists(controller_path): + all_controllers_present = False + missing_controllers_path.append(controller_path) + else: + verified_controllers_path.append(controller_path) + + if not all_controllers_present: + fail('Expected all of the extension controller: {0} paths present in the file system after extension install. 
But missing cgroups paths are :{1}\n' + 'and verified cgroup paths are: {2} \nSystem mounted cgroups are \n{3}'.format(EXT_CONTROLLERS, missing_controllers_path, verified_controllers_path, print_cgroups())) + + log.info('Verified all extension cgroup controller paths are present and they are: \n {0}'.format(verified_controllers_path)) + + +def verify_extension_service_cgroup_created_on_file_system(): + """ + This method ensure that extension service cgroup paths are created on file system after running extension + """ + log.info("===== Verifying the extension service cgroup paths exist on file system") + + # GA Test Extension Service + gatestext_cgroup_mount_path = get_unit_cgroup_mount_path(GATESTEXT_SERVICE) + verify_extension_service_cgroup_created(GATESTEXT_SERVICE, gatestext_cgroup_mount_path) + + # Azure Monitor Extension Service + azuremonitoragent_cgroup_mount_path = get_unit_cgroup_mount_path(AZUREMONITORAGENT_SERVICE) + azuremonitoragent_service_name = AZUREMONITORAGENT_SERVICE + # Old versions of AMA extension has different service name + if azuremonitoragent_cgroup_mount_path is None: + azuremonitoragent_cgroup_mount_path = get_unit_cgroup_mount_path(MDSD_SERVICE) + azuremonitoragent_service_name = MDSD_SERVICE + verify_extension_service_cgroup_created(azuremonitoragent_service_name, azuremonitoragent_cgroup_mount_path) + + log.info('Verified all extension service cgroup paths created in file system .\n') + + +def verify_extension_service_cgroup_created(service_name, cgroup_mount_path): + log.info("expected extension service cgroup mount path: %s", cgroup_mount_path) + + all_controllers_present = True + missing_cgroups_path = [] + verified_cgroups_path = [] + + for controller in EXT_CONTROLLERS: + # cgroup_mount_path is similar to /azure.slice/walinuxagent.service + # cgroup_mount_path[1:] = azure.slice/walinuxagent.service + # expected extension_service_controller_path similar to /sys/fs/cgroup/cpu/azure.slice/walinuxagent.service + 
extension_service_controller_path = os.path.join(BASE_CGROUP, controller, cgroup_mount_path[1:]) + + if not os.path.exists(extension_service_controller_path): + all_controllers_present = False + missing_cgroups_path.append(extension_service_controller_path) + else: + verified_cgroups_path.append(extension_service_controller_path) + + if not all_controllers_present: + fail("Extension service: [{0}] cgroup paths couldn't be found on file system. Missing cgroup paths are: {1} \n Verified cgroup paths are: {2} \n " + "System mounted cgroups are \n{3}".format(service_name, missing_cgroups_path, verified_cgroups_path, print_cgroups())) + + +def verify_ext_cgroups_tracked(): + """ + Checks if ext cgroups are tracked by the agent. This is verified by checking the agent log for the message "Started tracking cgroup {extension_name}" + """ + log.info("===== Verifying ext cgroups tracked") + + cgroups_added_for_telemetry = [] + gatestext_cgroups_tracked = False + azuremonitoragent_cgroups_tracked = False + gatestext_service_cgroups_tracked = False + azuremonitoragent_service_cgroups_tracked = False + + for record in AgentLog().read(): + + # Cgroup tracking logged as + # 2021-11-14T13:09:59.351961Z INFO ExtHandler ExtHandler Started tracking cgroup Microsoft.Azure.Extensions.Edp.GATestExtGo-1.0.0.2 + # [/sys/fs/cgroup/cpu,cpuacct/azure.slice/azure-vmextensions.slice/azure-vmextensions-Microsoft.Azure.Extensions.Edp.GATestExtGo_1.0.0.2.slice] + cgroup_tracked_match = CGROUP_TRACKED_PATTERN.findall(record.message) + if len(cgroup_tracked_match) != 0: + name, path = cgroup_tracked_match[0][0], cgroup_tracked_match[0][1] + if name.startswith(GATESTEXT_FULL_NAME): + gatestext_cgroups_tracked = True + elif name.startswith(AZUREMONITOREXT_FULL_NAME): + azuremonitoragent_cgroups_tracked = True + elif name.startswith(GATESTEXT_SERVICE): + gatestext_service_cgroups_tracked = True + elif name.startswith(AZUREMONITORAGENT_SERVICE) or name.startswith(MDSD_SERVICE): + 
azuremonitoragent_service_cgroups_tracked = True + cgroups_added_for_telemetry.append((name, path)) + + # agent, gatest extension, azuremonitor extension and extension service cgroups + if len(cgroups_added_for_telemetry) < 1: + fail('Expected cgroups were not tracked, according to the agent log. ' + 'Pattern searched for: {0} and found \n{1}'.format(CGROUP_TRACKED_PATTERN.pattern, cgroups_added_for_telemetry)) + + if not gatestext_cgroups_tracked: + fail('Expected gatestext cgroups were not tracked, according to the agent log. ' + 'Pattern searched for: {0} and found \n{1}'.format(CGROUP_TRACKED_PATTERN.pattern, cgroups_added_for_telemetry)) + + if not azuremonitoragent_cgroups_tracked: + fail('Expected azuremonitoragent cgroups were not tracked, according to the agent log. ' + 'Pattern searched for: {0} and found \n{1}'.format(CGROUP_TRACKED_PATTERN.pattern, cgroups_added_for_telemetry)) + + if not gatestext_service_cgroups_tracked: + fail('Expected gatestext service cgroups were not tracked, according to the agent log. ' + 'Pattern searched for: {0} and found \n{1}'.format(CGROUP_TRACKED_PATTERN.pattern, cgroups_added_for_telemetry)) + + if not azuremonitoragent_service_cgroups_tracked: + fail('Expected azuremonitoragent service cgroups were not tracked, according to the agent log. 
' + 'Pattern searched for: {0} and found \n{1}'.format(CGROUP_TRACKED_PATTERN.pattern, cgroups_added_for_telemetry)) + + log.info("Extension cgroups tracked as expected\n%s", cgroups_added_for_telemetry) + + +def main(): + verify_if_distro_supports_cgroup() + verify_ext_cgroup_controllers_created_on_file_system() + verify_custom_script_cgroup_assigned_correctly() + verify_agent_cgroup_assigned_correctly() + verify_extension_service_cgroup_created_on_file_system() + verify_ext_cgroups_tracked() + + +try: + run_remote_test(main) +except Exception as e: + # It is possible that agent cgroup can be disabled due to UNKNOWN process or throttled before we run this check, in that case, we should ignore the validation + if check_agent_quota_disabled() and check_cgroup_disabled_with_unknown_process(): + log.info("Cgroup is disabled due to UNKNOWN process, ignoring ext cgroups validations") + else: + raise