Merge pull request #459 from cloud-gov/feat-build-metrics

save build container metrics
cloud-gov · May 6, 2024 · 4bf4ddb · 4bf4ddb
2 parents e7d3cc5 + d121aeb
commit 4bf4ddb
Show file tree

Hide file tree

Showing 10 changed files with 361 additions and 317 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,5 @@
 requests==2.31.0
 boto3==1.14.20
-humanize==4.4.0
 stopit==1.1.2
 psycopg2==2.9.9
 cryptography==42.0.2

diff --git a/src/build.py b/src/build.py
@@ -5,6 +5,7 @@
 from datetime import datetime
 from stopit import TimeoutException, SignalTimeout as Timeout
 import boto3
+from functools import partial
 
 from common import CLONE_DIR_PATH
 
@@ -14,7 +15,8 @@
 )
 from log_utils.remote_logs import (
     post_build_complete, post_build_error,
-    post_build_timeout, post_build_processing
+    post_build_timeout, post_build_processing,
+    post_metrics,
 )
 
 from crypto.decrypt import decrypt
@@ -92,8 +94,14 @@ def build(
 
             logger = get_logger('main')
 
-            def run_step(returncode, msg):
-                if returncode != 0:
+            # partially apply the callback url to post_metrics
+            post_metrics_p = partial(post_metrics, status_callback)
+
+            def run_step(step, msg, *args, **kwargs):
+                try:
+                    step(*args, **kwargs)
+                except Exception as e:
+                    logger.error(e)
                     raise StepException(msg)
 
             logger.info(f'Running build for {owner}/{repository}/{branch}')
@@ -104,7 +112,11 @@ def run_step(returncode, msg):
             # start a separate scheduled thread for memory/cpu monitoring
             MONITORING_INTERVAL = 30
             monitoring_logger = get_logger('monitor')
-            thread = RepeatTimer(MONITORING_INTERVAL, log_monitoring_metrics, [monitoring_logger])
+            thread = RepeatTimer(
+                MONITORING_INTERVAL,
+                log_monitoring_metrics,
+                [monitoring_logger, post_metrics_p],
+            )
             thread.start()
 
             # S3 client used in multiple steps
@@ -119,8 +131,9 @@ def run_step(returncode, msg):
             # FETCH
             #
             run_step(
-                fetch_repo(owner, repository, branch, github_token),
-                'There was a problem fetching the repository, see the above logs for details.'
+                fetch_repo,
+                'There was a problem fetching the repository, see the above logs for details.',
+                owner, repository, branch, github_token,
             )
 
             commit_sha = fetch_commit_sha(CLONE_DIR_PATH)
@@ -141,57 +154,62 @@ def run_step(returncode, msg):
 
             if federalist_config.full_clone():
                 run_step(
-                    update_repo(CLONE_DIR_PATH),
-                    'There was a problem updating the repository, see the above logs for details.'
+                    update_repo,
+                    'There was a problem updating the repository, see the above logs for details.',
+                    CLONE_DIR_PATH,
                 )
 
             ##
             # BUILD
             #
             run_step(
-                setup_node(federalist_config.should_cache(), bucket, s3_client),
-                'There was a problem setting up Node, see the above logs for details.'
+                setup_node,
+                'There was a problem setting up Node, see the above logs for details.',
+                federalist_config.should_cache(),
+                bucket,
+                s3_client,
+                post_metrics_p,
             )
 
             # Run the npm `federalist` task (if it is defined)
             run_step(
-                run_build_script(
-                    branch, owner, repository, site_prefix, baseurl, decrypted_uevs
-                ),
-                'There was a problem running the federalist script, see the above logs for details.'
+                run_build_script,
+                'There was a problem running the federalist script, see the above logs for details.',  # noqa: E501
+                branch, owner, repository, site_prefix, baseurl, decrypted_uevs,
             )
 
             # Run the appropriate build engine based on generator
             if generator == 'jekyll':
                 run_step(
-                    setup_ruby(),
-                    'There was a problem setting up Ruby, see the above logs for details.'
+                    setup_ruby,
+                    'There was a problem setting up Ruby, see the above logs for details.',
+                    federalist_config.should_cache(), post_metrics_p,
                 )
 
                 run_step(
-                    setup_bundler(federalist_config.should_cache(), bucket, s3_client),
-                    'There was a problem setting up Bundler, see the above logs for details.'
+                    setup_bundler,
+                    'There was a problem setting up Bundler, see the above logs for details.',
+                    federalist_config.should_cache(), bucket, s3_client,
                 )
 
                 run_step(
-                    build_jekyll(
-                        branch, owner, repository, site_prefix, baseurl, config, decrypted_uevs
-                    ),
-                    'There was a problem running Jekyll, see the above logs for details.'
+                    build_jekyll,
+                    'There was a problem running Jekyll, see the above logs for details.',
+                    branch, owner, repository, site_prefix, baseurl, config, decrypted_uevs,
                 )
 
             elif generator == 'hugo':
                 # extra: --hugo-version (not yet used)
                 run_step(
-                    download_hugo(),
-                    'There was a problem downloading Hugo, see the above logs for details.'
+                    download_hugo,
+                    'There was a problem downloading Hugo, see the above logs for details.',
+                    post_metrics_p
                 )
 
                 run_step(
-                    build_hugo(
-                        branch, owner, repository, site_prefix, baseurl, decrypted_uevs
-                    ),
-                    'There was a problem running Hugo, see the above logs for details.'
+                    build_hugo,
+                    'There was a problem running Hugo, see the above logs for details.',
+                    branch, owner, repository, site_prefix, baseurl, decrypted_uevs,
                 )
 
             elif generator == 'static':

diff --git a/src/log_utils/monitoring.py b/src/log_utils/monitoring.py
@@ -1,6 +1,11 @@
 from threading import Timer
 import psutil
-from humanize import naturalsize
+
+# Running maximums of the sampled machine metrics, kept at module level.
+# log_monitoring_metrics updates this dict in place on every call:
+#   cpu  - highest psutil.cpu_percent() seen so far
+#   mem  - highest psutil.virtual_memory().percent seen so far
+#   disk - highest psutil.disk_usage("/").used seen so far
+max_metrics = dict(
+    cpu=0,
+    mem=0,
+    disk=0
+)
 
 
 # https://stackoverflow.com/a/48741004
@@ -10,8 +15,12 @@ def run(self):
             self.function(*self.args, **self.kwargs)
 
 
-def log_monitoring_metrics(logger):
+def log_monitoring_metrics(logger, post_metrics):
+    '''
+    Sample CPU, memory, and disk usage, fold the samples into the module-level
+    `max_metrics` running maximums, and hand the result to `post_metrics`.
+
+    `post_metrics` is called with one argument: a dict whose `machine` key holds `max_metrics`.
+    `logger` is accepted but is not used by this version of the function.
+    '''
     disk = psutil.disk_usage("/")
-    logger.info(f'CPU Usage Percentage: {psutil.cpu_percent()}')
-    logger.info(f'Memory Usage Percentage: {psutil.virtual_memory().percent}')
-    logger.info(f'Disk usage: {naturalsize(disk.used)} / {naturalsize(disk.total)}')
+
+    # compute new maximum metrics and post to the application
+    max_metrics["cpu"] = max(psutil.cpu_percent(), max_metrics["cpu"])
+    max_metrics["mem"] = max(psutil.virtual_memory().percent, max_metrics["mem"])
+    max_metrics["disk"] = max(disk.used, max_metrics["disk"])
+
+    # the shared max_metrics dict itself (not a copy) is passed to the callback
+    post_metrics(dict(machine=max_metrics))
diff --git a/src/log_utils/remote_logs.py b/src/log_utils/remote_logs.py
@@ -2,6 +2,7 @@
 
 import base64
 import requests
+from typing import Dict
 
 from .common import (STATUS_COMPLETE, STATUS_ERROR, STATUS_PROCESSING)
 
@@ -63,3 +64,15 @@ def post_build_timeout(status_callback_url, commit_sha=None):
 
     # Post to the Pages web application with status and output
     post_status(status_callback_url, status=STATUS_ERROR, output=output, commit_sha=commit_sha)
+
+
+def post_metrics(status_callback_url: str, metrics: Dict):
+    '''
+    POST build metrics to the metrics API
+
+    The target URL is derived from `status_callback_url` by replacing the text 'status' with
+    'metrics'. `metrics` is sent as the JSON request body with a timeout of 10.
+
+    NOTE(review): `str.replace` swaps every occurrence of 'status' in the URL, not just one
+    path segment - confirm the callback URL can only contain it once.
+    NOTE(review): the response is not inspected and exceptions raised by `requests.post` are
+    not handled here - confirm that callers expect them to propagate.
+    '''
+    url = status_callback_url.replace('status', 'metrics')
+    requests.post(
+        url,
+        json=metrics,
+        timeout=10
+    )
diff --git a/src/runner/__init__.py b/src/runner/__init__.py
@@ -3,6 +3,7 @@
 import pwd
 import shlex
 import subprocess  # nosec
+from io import StringIO
 
 NVM_PATH = '~/.nvm/nvm.sh'
 RVM_PATH = '/usr/local/rvm/scripts/rvm'
@@ -13,26 +14,24 @@ def setuser():
     os.setuid(pwd.getpwnam('customer').pw_uid)
 
 
-def run(logger, command, cwd=None, env=None, shell=False, check=False, node=False, ruby=False):
+def run(logger, command, cwd=None, env=None, shell=False, check=True, node=False, ruby=False, skip_log=False):  # noqa: E501
     '''
     Run an OS command with provided cwd or env, stream logs to logger, and return the exit code.
 
     Errors that occur BEFORE the command is actually executed are caught and handled here.
 
-    Errors encountered by the executed command are NOT caught. Instead a non-zero exit code
-    will be returned to be handled by the caller.
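+    When `check` is true (the default), a non-zero exit code from the executed command is
+    treated as an error (`subprocess.CalledProcessError`) and a successful run returns the
+    aggregated output. With `check=False`, the exit code is returned to be handled by the caller.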
 
     See https://docs.python.org/3/library/subprocess.html#popen-constructor for details.
     '''
 
-    # TODO - refactor to put the appropriate bundler binaries in PATH so this isn't necessary
     if ruby:
         command = f'source {RVM_PATH} && {command}'
         shell = True
 
-    # TODO - refactor to put the appropriate node/npm binaries in PATH so this isn't necessary
     if node:
-        command = f'source {NVM_PATH} && nvm use default && {command}'
+        command = f'source {NVM_PATH} && {command}'
         shell = True
 
     if isinstance(command, str) and not shell:
@@ -41,6 +40,9 @@ def run(logger, command, cwd=None, env=None, shell=False, check=False, node=Fals
     # When a shell is needed, use `bash` instead of `sh`
     executable = '/bin/bash' if shell else None
 
+    # aggregate stdout in case we need to return
+    output = StringIO()
+
     try:
         p = subprocess.Popen(  # nosec
             command,
@@ -56,12 +58,20 @@ def run(logger, command, cwd=None, env=None, shell=False, check=False, node=Fals
             preexec_fn=setuser
         )
         while p.poll() is None:
-            logger.info(p.stdout.readline().strip())
+            line = p.stdout.readline().strip()
+            if not skip_log:
+                logger.info(line)
+            output.write(line)
 
-        logger.info(p.stdout.readline().strip())
+        line = p.stdout.readline().strip()
+        if not skip_log:
+            logger.info(line)
+        output.write(line)
 
-        if check and p.returncode:
-            raise subprocess.CalledProcessError(p.returncode, command)
+        if check:
+            if p.returncode:
+                raise subprocess.CalledProcessError(p.returncode, command)
+            return output.getvalue()
 
         return p.returncode