From 0e4171b3ef3f97a008cf4ce6816a0c4065b73d5b Mon Sep 17 00:00:00 2001
From: Darrell Breeden
Date: Fri, 27 Mar 2020 08:37:59 -0400
Subject: [PATCH 1/4] feature: adding version command and injecting at build-time from goreleaser

---
Review notes (ignored by git-am; not part of the commit message):
  * cmd/version.go prints with fmt.Println so the version goes to standard
    output; the println builtin writes to standard error.
  * cmd/version.go is gofmt-formatted, documents the exported Version
    variable and ends with a newline; .goreleaser.yml ends with a newline too.
  * The blob hashes on the "index" lines of this patch predate these edits.
  * Line breaks and indentation of this series were restored from a
    whitespace-collapsed copy; confirm context-line indentation against the
    tree before applying.

 cmd/server/.goreleaser.yml | 25 ++++++++++++++-----------
 cmd/version.go             | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 11 deletions(-)
 create mode 100644 cmd/version.go

diff --git a/cmd/server/.goreleaser.yml b/cmd/server/.goreleaser.yml
index c1d49b6..9ddd176 100644
--- a/cmd/server/.goreleaser.yml
+++ b/cmd/server/.goreleaser.yml
@@ -7,16 +7,7 @@ nfpms:
     id: gridengine_prometheus
     # You can change the name of the package.
     # Default: `{{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }}{{ if .Arm }}v{{ .Arm }}{{ end }}`
-    name_template: "{{ .ProjectName }}_{{ .Version }}_{{ .Os }}_{{ .Arch }}"
-
-    # Replacements for GOOS and GOARCH in the package name.
-    # Keys should be valid GOOSs or GOARCHs.
-    # Values are the respective replacements.
-    # Default is empty.
-    replacements:
-      amd64: 64-bit
-      386: 32-bit
-      darwin: macOS
+    name_template: "{{ .ProjectName }}_{{ .Os }}_{{ .Arch }}"
 
     # Your app's vendor.
     # Default is empty.
@@ -74,4 +65,16 @@ nfpms:
     # Keys are the possible targets during the installation process
     # Values are the paths to the scripts which will be executed
     scripts:
-      postinstall: "scripts/postinstall.sh"
\ No newline at end of file
+      postinstall: "scripts/postinstall.sh"
+builds:
+  -
+    id: "default"
+    ldflags:
+      - -s -w -X 'github.com/metrumresearchgroup/gridengine_prometheus/cmd.Version={{ .Env.VERSION }}'
+
+archives:
+  -
+    builds:
+      - default
+    name_template: "{{ .ProjectName }}_{{ .Os }}_{{ .Arch }}"
+    format: tar.gz
diff --git a/cmd/version.go b/cmd/version.go
new file mode 100644
index 0000000..03b9c0b
--- /dev/null
+++ b/cmd/version.go
@@ -0,0 +1,32 @@
+package cmd
+
+import (
+	"fmt"
+
+	"github.com/spf13/cobra"
+)
+
+// Version is the version string reported by the "version" subcommand. It
+// defaults to "develop" and is overridden at build time by the -X linker
+// flag set in cmd/server/.goreleaser.yml.
+var Version = "develop"
+
+// versionCmd is the "version" subcommand; it is registered on RootCmd in init.
+var versionCmd = &cobra.Command{
+	Use:     "version",
+	Short:   "Display binary version",
+	Long:    "Display binary version",
+	Example: `gridengine_prometheus version`,
+	Run:     version,
+}
+
+// version is the Run handler of versionCmd. It writes Version to standard
+// output with fmt.Println; the println builtin writes to standard error and
+// is not guaranteed to stay in the language.
+func version(cmd *cobra.Command, args []string) {
+	fmt.Println(Version)
+}
+
+func init() {
+	RootCmd.AddCommand(versionCmd)
+}

From 51c2d7b99fe5e84d64c18716444f09b01535c1cb Mon Sep 17 00:00:00 2001
From: Darrell Breeden
Date: Fri, 27 Mar 2020 09:02:58 -0400
Subject: [PATCH 2/4] ci: Updating drone to always test

---
 .drone.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.drone.yml b/.drone.yml
index 651faea..a7d6dff 100644
--- a/.drone.yml
+++ b/.drone.yml
@@ -10,10 +10,6 @@ steps:
     image: golang
     commands:
       - go test ./... -covermode=count -coverprofile=coverage.out
-    when:
-      event:
-        exclude:
-          - pull_request
 
   - name: Coverall Work
     image: golang

From 5abd15ffcc666950950e6c2d8ac6977f9244b2b4 Mon Sep 17 00:00:00 2001
From: Darrell Breeden
Date: Fri, 27 Mar 2020 09:12:18 -0400
Subject: [PATCH 3/4] feature: Adding discrete state value to job related metrics

---
 grid.go      | 12 ++++++------
 grid_test.go |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/grid.go b/grid.go
index 6b5b5ec..d18e7dd 100644
--- a/grid.go
+++ b/grid.go
@@ -73,17 +73,17 @@ func NewGridEngine() *GridEngine {
 		JobState: prometheus.NewDesc(
 			"job_state_value",
 			"Indicates whether job is running (1) or not (0)",
-			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id"},
+			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
 			nil),
 		JobPriority: prometheus.NewDesc(
 			"job_priority_value",
 			"Qstat priority for given job",
-			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id"},
+			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
 			nil),
 		JobSlots: prometheus.NewDesc(
 			"job_slots_count",
 			"Number of slots on the selected job",
-			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id"},
+			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
 			nil),
 	}
 }
@@ -201,7 +201,7 @@ func processJob(j gogridengine.Job, ch chan<- prometheus.Metric, collector *Grid
 	number := strconv.FormatInt(j.JBJobNumber, 10)
 	taskID := strconv.Itoa(int(j.Tasks.TaskID))
 
-	ch <- prometheus.MustNewConstMetric(collector.JobState, prometheus.GaugeValue, float64(gogridengine.IsJobRunning(j)), hostname, queue, name, owner, number, taskID)
-	ch <- prometheus.MustNewConstMetric(collector.JobPriority, prometheus.GaugeValue, j.JATPriority, hostname, queue, name, owner, number, taskID)
-	ch <- prometheus.MustNewConstMetric(collector.JobSlots, prometheus.GaugeValue, float64(j.Slots), hostname, queue, name, owner, number, taskID)
+	ch <- prometheus.MustNewConstMetric(collector.JobState, prometheus.GaugeValue, float64(gogridengine.IsJobRunning(j)), hostname, queue, name, owner, number, taskID, j.State)
+	ch <- prometheus.MustNewConstMetric(collector.JobPriority, prometheus.GaugeValue, j.JATPriority, hostname, queue, name, owner, number, taskID, j.State)
+	ch <- prometheus.MustNewConstMetric(collector.JobSlots, prometheus.GaugeValue, float64(j.Slots), hostname, queue, name, owner, number, taskID, j.State)
 }
diff --git a/grid_test.go b/grid_test.go
index 3603814..ce4a8a1 100644
--- a/grid_test.go
+++ b/grid_test.go
@@ -64,17 +64,17 @@ func Test_newGridEngine(t *testing.T) {
 				JobState: prometheus.NewDesc(
 					"job_state_value",
 					"Indicates whether job is running (1) or not (0)",
-					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id"},
+					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
 					nil),
 				JobPriority: prometheus.NewDesc(
 					"job_priority_value",
 					"Qstat priority for given job",
-					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id"},
+					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
 					nil),
 				JobSlots: prometheus.NewDesc(
 					"job_slots_count",
 					"Number of slots on the selected job",
-					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id"},
+					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
 					nil),
 			},
 		},

From c1e24b46274dc65c7ad06c563e13ea659c6967f5 Mon Sep 17 00:00:00 2001
From: Darrell Breeden
Date: Fri, 27 Mar 2020 11:14:53 -0400
Subject: [PATCH 4/4] feature: adding error state evaluation to facilitate easier dashboarding

---
 go.mod       |  2 +-
 go.sum       |  2 ++
 grid.go      | 45 ++++++++++++++++++++++++++++++++++-----------
 grid_test.go |  8 ++++++++
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/go.mod b/go.mod
index 057bd49..bfab336 100644
--- a/go.mod
+++ b/go.mod
@@ -3,7 +3,7 @@ module github.com/metrumresearchgroup/gridengine_prometheus
 go 1.13
 
 require (
-	github.com/metrumresearchgroup/gogridengine v0.0.0-20191225174611-35c8b62cd3fe
+	github.com/metrumresearchgroup/gogridengine v0.0.1
 	github.com/prometheus/client_golang v1.2.1
 	github.com/sirupsen/logrus v1.4.2
 	github.com/spf13/cobra v0.0.5
diff --git a/go.sum b/go.sum
index fff5a39..2ba5e23 100644
--- a/go.sum
+++ b/go.sum
@@ -83,6 +83,8 @@ github.com/metrumresearchgroup/gogridengine v0.0.0-20191218182527-bc69784db233 h
 github.com/metrumresearchgroup/gogridengine v0.0.0-20191218182527-bc69784db233/go.mod h1:opfxgPaKkfTsV5EbwTkH3oUMAyJE7VL1twLdhEPXorg=
 github.com/metrumresearchgroup/gogridengine v0.0.0-20191225174611-35c8b62cd3fe h1:Gl5qwfu5Q0ZSaLN5Z0EEIXVhVDzH0M+QudJhabyfJQ0=
 github.com/metrumresearchgroup/gogridengine v0.0.0-20191225174611-35c8b62cd3fe/go.mod h1:opfxgPaKkfTsV5EbwTkH3oUMAyJE7VL1twLdhEPXorg=
+github.com/metrumresearchgroup/gogridengine v0.0.1 h1:QG+ynp0mC7ydrZOsIIRYl87bwzfb2UEFfB5KbUgHuv4=
+github.com/metrumresearchgroup/gogridengine v0.0.1/go.mod h1:opfxgPaKkfTsV5EbwTkH3oUMAyJE7VL1twLdhEPXorg=
 github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE=
diff --git a/grid.go b/grid.go
index d18e7dd..8f4d212 100644
--- a/grid.go
+++ b/grid.go
@@ -26,64 +26,86 @@ type GridEngine struct {
 	JobState       *prometheus.Desc
 	JobPriority    *prometheus.Desc
 	JobSlots       *prometheus.Desc
+	JobErrors      *prometheus.Desc
 }
 
 func NewGridEngine() *GridEngine {
+
+	hostLabels := []string{
+		"hostname",
+		"queue",
+	}
+
+	jobLabels := []string{
+		"hostname",
+		"queue",
+		"name",
+		"owner",
+		"job_number",
+		"task_id",
+		"state",
+	}
+
 	return &GridEngine{
 		TotalSlots: prometheus.NewDesc(
 			"total_slots_count",
 			"Total Number of slots available to the host",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		UsedSlots: prometheus.NewDesc(
 			"used_slots_count",
 			"Number of used slots on host",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		ReservedSlots: prometheus.NewDesc(
 			"reserved_slots_count",
 			"Number of reserved slots on host",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		LoadAverage: prometheus.NewDesc(
 			"sge_load_average",
 			"Load average of this specific SGE host",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		FreeMemory: prometheus.NewDesc(
 			"free_memory_bytes",
 			"Number of bytes in free memory",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		UsedMemory: prometheus.NewDesc(
 			"sge_used_memory_bytes",
 			"Number of bytes in used memory",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		TotalMemory: prometheus.NewDesc(
 			"sge_total_memory_bytes",
 			"Number of bytes in total memory",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		CPUUtilization: prometheus.NewDesc(
 			"sge_cpu_utilization_percent",
 			"Decimal representing total CPU utilization on host",
-			[]string{"hostname", "queue"},
+			hostLabels,
 			nil),
 		JobState: prometheus.NewDesc(
 			"job_state_value",
 			"Indicates whether job is running (1) or not (0)",
-			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
+			jobLabels,
 			nil),
 		JobPriority: prometheus.NewDesc(
 			"job_priority_value",
 			"Qstat priority for given job",
-			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
+			jobLabels,
 			nil),
 		JobSlots: prometheus.NewDesc(
 			"job_slots_count",
 			"Number of slots on the selected job",
-			[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
+			jobLabels,
+			nil),
+		JobErrors: prometheus.NewDesc(
+			"job_errors",
+			"Jobs that are reported in an errored or anomalous state",
+			jobLabels,
 			nil),
 	}
 }
@@ -204,4 +226,5 @@ func processJob(j gogridengine.Job, ch chan<- prometheus.Metric, collector *Grid
 	ch <- prometheus.MustNewConstMetric(collector.JobState, prometheus.GaugeValue, float64(gogridengine.IsJobRunning(j)), hostname, queue, name, owner, number, taskID, j.State)
 	ch <- prometheus.MustNewConstMetric(collector.JobPriority, prometheus.GaugeValue, j.JATPriority, hostname, queue, name, owner, number, taskID, j.State)
 	ch <- prometheus.MustNewConstMetric(collector.JobSlots, prometheus.GaugeValue, float64(j.Slots), hostname, queue, name, owner, number, taskID, j.State)
+	ch <- prometheus.MustNewConstMetric(collector.JobErrors, prometheus.GaugeValue, float64(gogridengine.IsJobInErrorState(j)), hostname, queue, name, owner, number, taskID, j.State)
 }
diff --git a/grid_test.go b/grid_test.go
index ce4a8a1..8c88290 100644
--- a/grid_test.go
+++ b/grid_test.go
@@ -76,6 +76,11 @@ func Test_newGridEngine(t *testing.T) {
 					"Number of slots on the selected job",
 					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
 					nil),
+				JobErrors: prometheus.NewDesc(
+					"job_errors",
+					"Jobs that are reported in an errored or anomalous state",
+					[]string{"hostname", "queue", "name", "owner", "job_number", "task_id", "state"},
+					nil),
 			},
 		},
 	}
@@ -175,6 +180,7 @@ func TestGridEngine_Collect(t *testing.T) {
 		JobState       *prometheus.Desc
 		JobPriority    *prometheus.Desc
 		JobSlots       *prometheus.Desc
+		JobErrors      *prometheus.Desc
 	}
 	type args struct {
 		ch chan<- prometheus.Metric
@@ -198,6 +204,7 @@ func TestGridEngine_Collect(t *testing.T) {
 				JobState:       description.JobState,
 				JobPriority:    description.JobPriority,
 				JobSlots:       description.JobSlots,
+				JobErrors:      description.JobErrors,
 			},
 			args: args{
 				ch: channel,
@@ -218,6 +225,7 @@ func TestGridEngine_Collect(t *testing.T) {
 				JobState:       tt.fields.JobState,
 				JobPriority:    tt.fields.JobPriority,
 				JobSlots:       tt.fields.JobSlots,
+				JobErrors:      tt.fields.JobErrors,
 			}
 			collector.Collect(tt.args.ch)
 		})