Skip to content

Commit

Permalink
Prometheus Integration (#268)
Browse files Browse the repository at this point in the history
Changes to incorporate Prometheus as a data source for summarization
Co-authored-by: Conner Saeli <saelic01@mail.buffalostate.edu>
  • Loading branch information
connersaeli committed Aug 31, 2023
1 parent 325645e commit ddd97b1
Show file tree
Hide file tree
Showing 46 changed files with 2,418 additions and 606 deletions.
65 changes: 46 additions & 19 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -1,32 +1,64 @@
version: 2.1
jobs:
  # Build job: runs the project's CI build scripts inside a Rocky Linux 8
  # container and hands the resulting dist/ contents to later jobs.
  build:
    docker:
      - image: rockylinux:8
    steps:
      - checkout
      - run:
          name: Install System Dependencies
          # The "build" argument is passed through to setup.sh.
          command: ./tests/ci/setup.sh build
      - run:
          name: Build Software Package
          command: ./tests/ci/build.sh
      # Save everything under dist/ (relative to the checkout root) so that
      # jobs which attach the workspace can use the built artifacts.
      - persist_to_workspace:
          root: .
          paths:
            - dist/*

test:
parameters:
test-mode:
type: string
install-type:
type: string
docker:
# The first image entry here will be used as the image for the parent container.
- image: tools-ext-01.ccr.xdmod.org/xdmod-job_performance-10.0.0:rockylinux8-0.1
# Environment variables exported into the primary test container.
environment:
  TERM: xterm
  TERMINFO: /bin/bash
  COMPOSER_ALLOW_SUPERUSER: 1
  XDMOD_REALMS: 'jobs,storage,cloud'
  # NOTE(review): a bare `yes` is loaded as a boolean by YAML 1.1 parsers;
  # quote it if the consumer expects the literal string "yes" - confirm.
  XDMOD_IS_CORE: yes
  XDMOD_INSTALL_DIR: /xdmod
  # A mapping key may appear only once. XDMOD_TEST_MODE follows the
  # test-mode parameter; install-type feeds SUPREMM_INSTALL_TYPE below.
  XDMOD_TEST_MODE: << parameters.test-mode >>
  SUPREMM_INSTALL_TYPE: << parameters.install-type >>
steps:
- checkout
- run:
name: Install System Dependencies
command: ./tests/ci/setup.sh
- run:
name: Create Test Result Directories
command: |
mkdir -p shippable/testresults
mkdir -p shippable/codecoverage
- attach_workspace:
at: .
- run:
name: Install Docker Compose
command: |
dnf install -y dnf-utils
dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo
dnf install -y docker-ce docker-ce-cli docker-compose-plugin
- setup_remote_docker
- run:
name: Build services
command: docker compose -f ./tests/ci/srv/services.yml build
- run:
name: Start services
command: docker compose -f ./tests/ci/srv/services.yml up -d
# Bootstrap step: a run step takes exactly one `command`. The script is
# given the install type that the job environment exports.
- run:
    name: Run Bootstrap
    command: ./tests/ci/test/bootstrap.sh $SUPREMM_INSTALL_TYPE
- run:
name: Run Integration Tests
command: ./tests/integration_tests/integration_test.bash
Expand All @@ -42,24 +74,15 @@ jobs:
- run:
name: Ingest Jobs
command: ingest_jobscripts.py -d
- run:
name: Remove Currently Installed SUPREMM
command: yum remove -y supremm
- run:
name: Install SUPREMM
command: python3 setup.py install --user --prefix=
- run:
name: Pylint
command: pylint-3 --errors-only supremm
- run:
name: Pytest
command: pytest-3 --junitxml=shippable/testresults/testreport.xml --cov=supremm --cov-report xml:shippable/codecoverage/coverage.xml
- run:
name: Summarize Jobs
command: /root/.local/bin/summarize_jobs.py -h > /dev/null
- run:
name: Index Archives
command: /root/.local/bin/indexarchives.py -h > /dev/null
name: Remove Currently Installed SUPREMM
command: dnf remove -y supremm
- store_test_results:
path: shippable/testresults
- store_artifacts:
Expand All @@ -70,7 +93,11 @@ jobs:
workflows:
  full-build:
    jobs:
      # The package is built once ...
      - build
      # ... and the test job is then expanded over every combination of
      # test-mode and install-type. Each expanded job waits for build.
      - test:
          matrix:
            parameters:
              test-mode: ["fresh_install", "upgrade"]
              install-type: ["rpm", "wheel", "src"]
          requires:
            - build
169 changes: 169 additions & 0 deletions config/prometheus/mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
{
"common": {
"params": ["host"],
"defaults": {"environment": "prod"}
},
"metrics": {
"cgroup.memory.usage": {
"name": "cgroup_memory_used_bytes",
"params": ["cgroup"],
"groupby": "cgroup"
},
"cgroup.memory.limit": {
"name": "cgroup_memory_total_bytes",
"params": ["cgroup"],
"groupby": "cgroup"
},
"disk.dev.read": {
"name": "node_disk_reads_completed_total",
"groupby": "device"
},
"disk.dev.read_bytes": {
"name": "node_disk_read_bytes_total",
"scaling": "0.0009765625",
"groupby": "device"
},
"disk.dev.write": {
"name": "node_disk_writes_completed_total",
"groupby": "device"
},
"disk.dev.write_bytes": {
"name": "node_disk_written_bytes_total",
"scaling": "0.0009765625",
"groupby": "device"
},
"infiniband.port.switch.in.bytes": {
"name": "node_infiniband_port_data_received_bytes_total",
"groupby": "port",
"out_fmt": ["{}:{}", "device", "port"]
},
"infiniband.port.switch.in.packets": {
"name": "node_infiniband_port_packets_received_total",
"groupby": "port",
"out_fmt": ["{}:{}", "device", "port"]
},
"infiniband.port.switch.out.bytes": {
"name": "node_infiniband_port_data_transmitted_bytes_total",
"groupby": "port",
"out_fmt": ["{}:{}", "device", "port"]
},
"infiniband.port.switch.out.packets": {
"name": "node_infiniband_port_packets_transmitted_total",
"groupby": "port",
"out_fmt": ["{}:{}", "device", "port"]
},
"ipmi.dcmi.power": {
"name": "ipmi_dcmi_power_consumption_watts",
"groupby": "host"
},
"kernel.all.load": {
"name": "node_load1",
"groupby": "host"
},
"kernel.percpu.cpu.user": {
"name": "node_cpu_seconds_total",
"defaults": {"mode" : "user"},
"scaling": "1000",
"groupby": "cpu",
"out_fmt": ["cpu{}", "cpu"]
},
"kernel.percpu.cpu.idle": {
"name": "node_cpu_seconds_total",
"defaults": {"mode" : "idle"},
"scaling": "1000",
"groupby": "cpu",
"out_fmt": ["cpu{}", "cpu"]
},
"kernel.percpu.cpu.nice": {
"name": "node_cpu_seconds_total",
"defaults": {"mode" : "nice"},
"scaling": "1000",
"groupby": "cpu",
"out_fmt": ["cpu{}", "cpu"]
},
"kernel.percpu.cpu.sys": {
"name": "node_cpu_seconds_total",
"defaults": {"mode" : "system"},
"scaling": "1000",
"groupby": "cpu",
"out_fmt": ["cpu{}", "cpu"]
},
"kernel.percpu.cpu.wait.total": {
"name": "node_cpu_seconds_total",
"defaults": {"mode" : "iowait"},
"scaling": "1000",
"groupby": "cpu",
"out_fmt": ["cpu{}", "cpu"]
},
"kernel.percpu.cpu.irq.hard": {
"name": "node_cpu_seconds_total",
"defaults": {"mode" : "irq"},
"scaling": "1000",
"groupby": "cpu",
"out_fmt": ["cpu{}", "cpu"]
},
"kernel.percpu.cpu.irq.soft": {
"name": "node_cpu_seconds_total",
"defaults": {"mode" : "softirq"},
"scaling": "1000",
"groupby": "cpu",
"out_fmt": ["cpu{}", "cpu"]
},
"mem.numa.util.filePages": {
"name": "node_memory_numa_FilePages",
"groupby": "node"
},
"mem.numa.util.slab": {
"name": "node_memory_numa_Slab",
"groupby": "node"
},
"mem.numa.util.used": {
"name": "node_memory_numa_MemUsed",
"groupby": "node"
},
"mem.freemem": {
"name": "node_memory_MemFree_bytes",
"scaling": "0.0009765625",
"groupby": "host"
},
"mem.physmem": {
"name": "node_memory_MemTotal_bytes",
"scaling": "0.0009765625",
"groupby": "host"
},
"network.interface.in.bytes": {
"name": "node_network_receive_bytes_total",
"groupby": "device"
},
"network.interface.out.bytes": {
"name": "node_network_transmit_bytes_total",
"groupby": "device"
},
"nvidia.gpuactive": {
"name": "DCGM_FI_DEV_GPU_UTIL",
"groupby": "gpu",
"out_fmt": ["gpu{}", "gpu"]
},
"nvidia.memused": {
"name": "DCGM_FI_DEV_FB_USED",
"groupby": "gpu",
"out_fmt": ["gpu{}", "gpu"]
},
"nvidia.powerused": {
"name": "DCGM_FI_DEV_POWER_USAGE",
"scaling": "1000",
"groupby": "gpu",
"out_fmt": ["gpu{}", "gpu"]
},
"prom:cgroup_cpu_info": {
"name": "cgroup_cpu_info",
"params": ["cgroup"],
"groupby": "cpus"
},
"prom:cgroup_process_exec_count": {
"name": "cgroup_process_exec_count",
"params": ["cgroup"],
"groupby": "exec"
}
}
}
7 changes: 5 additions & 2 deletions setup.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# Options for the bdist_rpm command. Each option is listed exactly once:
# strict INI parsers reject a section that repeats an option name.
[bdist_rpm]
release = 1.0-rc.3%%{?dist}
build_requires = python36-devel, pcp-libs-devel >= 5.3, pcp-libs-devel < 5.4
# Runtime dependencies of the RPM; python3-requests is included in this list.
requires = python3, python3-pymongo, python3-numpy, python3-scipy, python3-PyMySQL, python3-pcp >= 5.3, python3-pcp < 5.4, pcp-libs >= 5.3, pcp-libs < 5.4, python3-Cython, python3-pytz, python3-requests
install_script = .rpm_install_script.txt

# Options for the bdist_wheel command: built wheels get the py36 Python tag.
[bdist_wheel]
python-tag = py36
11 changes: 6 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,10 @@
packages=find_packages(where='src'),
package_data={
'supremm': ['assets/modw_supremm.sql', 'assets/mongo_setup.js', '*.pxd', '*.pyx'],
'supremm.pcpcinterface': ['*.pxd', '*.pyx']
'supremm.datasource.pcp.pcpcinterface': ['*.pxd', '*.pyx']
},
data_files=[
(confpath, ['config/config.json']),
(confpath, ['config/config.json', 'config/prometheus/mapping.json']),
('share/supremm/templates/slurm', ['config/templates/slurm/slurm-epilog', 'config/templates/slurm/slurm-prolog']),
('share/supremm/templates/hotproc', ['config/templates/hotproc/hotproc.conf']),
('share/supremm/templates/pmlogger', ['config/templates/pmlogger/control', 'config/templates/pmlogger/pmlogger-supremm.config'])
Expand All @@ -52,7 +52,7 @@
'gen-pmlogger-control.py = supremm.gen_pmlogger_control:main',
'summarize_jobs.py = supremm.summarize_jobs:main',
'summarize_mpi.py = supremm.summarize_mpi:main',
'indexarchives.py = supremm.indexarchives:runindexing',
'indexarchives.py = supremm.datasource.pcp.indexarchives:runindexing',
'account.py = supremm.account:runingest',
'supremmconf.py = supremm.supremmconf:main',
'supremm-setup = supremm.supremm_setup:main',
Expand All @@ -68,10 +68,11 @@
'Cython',
'scipy',
'pymongo',
'pytz'
'pytz',
'requests'
],
ext_modules=cythonize([
Extension("supremm.pcpcinterface.pcpcinterface", ["src/supremm/pcpcinterface/pcpcinterface.pyx"], libraries=["pcp"], include_dirs=[numpy.get_include()])
Extension("supremm.datasource.pcp.pcpcinterface.pcpcinterface", ["src/supremm/datasource/pcp/pcpcinterface/pcpcinterface.pyx"], libraries=["pcp"], include_dirs=[numpy.get_include()])
])
)

Expand Down
6 changes: 6 additions & 0 deletions src/supremm/Job.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ def __init__(self, job_pk_id, job_id, acct):
self.job_id = job_id
self.acct = acct
self._nodecount = acct['nodes']

self._start_datetime = datetimeconvert(acct['start_time'])
self._end_datetime = datetimeconvert(acct['end_time'])

Expand Down Expand Up @@ -130,6 +131,11 @@ def rawarchives(self):
if len(nodedata.rawarchives) > 0:
yield nodename, nodedata.rawarchives

def nodenames(self):
    """ generator that yields the name of every node the job ran on """
    yield from self._nodes.keys()

def nodearchives(self):
""" iterator for the combined archives for the nodes in the job """
for nodename, nodedata in self._nodes.items():
Expand Down
5 changes: 3 additions & 2 deletions src/supremm/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pkg_resources
import logging


def iscomment(line):
""" check is line is a c++ style comment """
if re.search(r"^\s*//", line):
Expand Down Expand Up @@ -47,7 +48,7 @@ def __str__(self):
return json.dumps(self._config, indent=4)

@staticmethod
def autodetectconfpath():
def autodetectconfpath(filename="config.json"):
""" search known paths for the configuration directory
List of paths support the three typical install locations
1) Environment variable SUPREMM_CONFIG_DIR
Expand All @@ -63,7 +64,7 @@ def autodetectconfpath():
]

for path in searchpaths:
if os.path.exists(os.path.join(path, "config.json")):
if os.path.exists(os.path.join(path, filename)):
return os.path.abspath(path)

return None
Expand Down
File renamed without changes.
Loading

0 comments on commit ddd97b1

Please sign in to comment.