Skip to content

Commit

Permalink
Add Prometheus metrics for DarkBlue Service (#48)
Browse files Browse the repository at this point in the history
* Added loglevel method

* Added required libs

* fixed failing test case

* fixed failing test case

* refractored the metrics

* added unless check on nil

* removed unit test

* Refactored metrics

* Refactored metrics unittest

* Refactored metrics unittest

* Refactored metrics unittest

* Refactored metrics unittest

* Refactored metrics unittest

* Fixed style issue

* Fixed failing unittest

* renamed file

* commented out data to fix failing unit test

* commented out data to fix failing unit test

* Fixed the data setup

* Fixed the failing unit test

* refactored the code

* Allow for injection of registry; use registry mock to test gauge setup; move things around

* registry -> registry_mock; call registry_mock.verify

* refactored the code

* removed unused require

* updated the bag_identifier changes

* code clean up

* code clean up

---------

Co-authored-by: Samuel Sciolla <ssciolla@umich.edu>
  • Loading branch information
jayamala17 and ssciolla authored May 17, 2024
1 parent 92f6948 commit b4ca813
Show file tree
Hide file tree
Showing 14 changed files with 385 additions and 13 deletions.
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ gem "canister", "~> 0.9.2"
gem "faraday", "~> 2.9"
gem "faraday-retry", "~> 2.2"
gem "mysql2", "~> 0.5.6"
gem "prometheus-client", "~> 4.2.2"
gem "rexml", "~> 3.2.6"
gem "semantic_logger", "~> 4.15"
gem "sequel", "~> 5.77"
Expand Down
2 changes: 2 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ GEM
mysql2 (0.5.6)
net-http (0.4.1)
uri
prometheus-client (4.2.2)
pastel (0.8.0)
tty-color (~> 0.5)
rake (13.1.0)
Expand Down Expand Up @@ -73,6 +74,7 @@ DEPENDENCIES
faraday-retry (~> 2.2)
minitest (~> 5.20)
mysql2 (~> 0.5.6)
prometheus-client (~> 4.2.2)
rake (~> 13.1)
rexml (~> 3.2.6)
semantic_logger (~> 4.15)
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,3 +150,4 @@ docker compose run dark-blue rake test
- [aws-sdk-s3](https://docs.aws.amazon.com/sdk-for-ruby/v3/api/Aws/S3.html)
- [mlibrary/sftp](https://github.com/mlibrary/sftp)
- [Sequel](http://sequel.jeremyevans.net/)
- [Prometheus Client](https://github.com/prometheus/client_ruby/blob/main/README.md)
10 changes: 10 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,15 @@ services:
interval: 5s
timeout: 5s
retries: 3
prometheus:
image: prom/prometheus
ports:
- 9090:9090
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
pushgateway:
image: prom/pushgateway
ports:
- 9091:9091
volumes:
database:
3 changes: 3 additions & 0 deletions example.env
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,6 @@ APTRUST_REMOTE_SETTINGS_RESTORE_PATH=umich.edu
APTRUST_REMOTE_SETTINGS_BUCKET_REGION=
APTRUST_REMOTE_SETTINGS_AWS_ACCESS_KEY_ID=
APTRUST_REMOTE_SETTINGS_AWS_SECRET_ACCESS_KEY=

# Metrics push gateway url
PROMETHEUS_PUSH_GATEWAY=http://pushgateway:9091
10 changes: 9 additions & 1 deletion lib/config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -202,13 +202,20 @@ def get_subset_by_key_stem(stem)
keyword_init: true
)

Metrics = Struct.new(
"MetricsConfig",
:push_gateway_url,
keyword_init: true
)

Config = Struct.new(
"Config",
:settings,
:database,
:repository,
:dark_blue,
:aptrust,
:metrics,
keyword_init: true
)

Expand Down Expand Up @@ -310,7 +317,8 @@ def self.create_config(data)
base_url: data.get_value(key: "APTRUST_API_BASE_URL")
),
remote: create_remote_config(data.get_subset_by_key_stem("APTRUST_REMOTE_"))
)
),
metrics: Metrics.new(push_gateway_url: data.get_value(key: "PROMETHEUS_PUSH_GATEWAY"))
)
end

Expand Down
111 changes: 111 additions & 0 deletions lib/metrics.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
require "prometheus/client"
require "prometheus/client/push"
require "prometheus/client/registry"

require_relative "bag_status"

module Metrics
class PushGatewayClientError < StandardError; end

class Timer
def self.time_processing
start_time = Time.now.to_i
yield
end_time = Time.now.to_i
[start_time, end_time]
end
end

class MetricsProvider
def initialize(
status_event_repo:,
push_gateway_url:,
start_time:,
end_time:,
registry: nil
)
@start_time = start_time
@end_time = end_time
@status_event_repo = status_event_repo
@push_gateway_url = push_gateway_url
@registry = registry
end

def registry
@registry ||= Prometheus::Client::Registry.new
end
private :registry

def get_latest_bag_events_by_time
st_time = Time.at(@start_time)
@status_event_repo.get_latest_event_for_bags(start_time: st_time)
end

def get_success_count(events_by_time)
success_count = events_by_time.select { |e| e.status == BagStatus::DEPOSITED }
success_count.count
end

def get_failure_count(events_by_time)
failure_count = events_by_time.select { |e| e.status == BagStatus::FAILED }
failure_count.count
end

def set_success_count(events_by_time)
dark_blue_success_count = registry.gauge(
:dark_blue_success_count,
docstring: "Number of successful bag transfers"
)
dark_blue_success_count.set(get_success_count(events_by_time))
end

def set_failed_count(events_by_time)
dark_blue_failed_count = registry.gauge(
:dark_blue_failed_count,
docstring: "Number of failed bag transfers"
)
dark_blue_failed_count.set(get_failure_count(events_by_time))
end

def set_last_successful_run
dark_blue_last_successful_run = registry.gauge(
:dark_blue_last_successful_run,
docstring: "Timestamp of the last successful run of the cron job"
)
# converting starttime to milliseconds to support converting epoch time to datetime
# https://github.com/grafana/grafana/issues/6297
time_in_milli_sec = @start_time * 1000
dark_blue_last_successful_run.set(time_in_milli_sec)
end

def set_processing_duration
dark_blue_processing_duration = registry.gauge(
:dark_blue_processing_duration,
docstring: "Duration of processing in seconds for the cron job"
)
dark_blue_processing_duration.set(@end_time - @start_time)
end

def gateway
@gateway ||= Prometheus::Client::Push.new(
job: "DarkBlueMetric",
gateway: @push_gateway_url
)
end
private :gateway

def push_metrics
gateway.add(registry)
end
private :push_metrics

def set_all_metrics
set_last_successful_run
set_processing_duration
latest_events = get_latest_bag_events_by_time
set_success_count(latest_events)
set_failed_count(latest_events)
push_metrics
end
end
end
24 changes: 20 additions & 4 deletions lib/status_event_repository.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ def get_all_for_bag_identifier(identifier)
def get_latest_event_for_bag(bag_identifier:)
raise NotImplementedError
end

def get_latest_event_for_bags(start_time:)
raise NotImplementedError
end
end

class StatusEventInMemoryRepository < StatusEventRepositoryBase
Expand Down Expand Up @@ -69,10 +73,6 @@ def create(bag_identifier:, status:, timestamp:, note: nil)
@status_events << event
end

def get_by_id(id)
@status_events.find { |e| e.id == id }
end

def get_all
@status_events
end
Expand All @@ -87,6 +87,14 @@ def get_latest_event_for_bag(bag_identifier:)
.sort_by(&:timestamp).reverse
(events.length > 0) ? events[0] : nil
end

def get_latest_event_for_bags(start_time:)
@status_events.select { |e| e.timestamp >= start_time }
.group_by(&:bag_identifier)
.transform_values { |bag_identifier| bag_identifier.max_by(&:timestamp) }
.values
.compact
end
end

class StatusEventDatabaseRepository
Expand Down Expand Up @@ -142,6 +150,14 @@ def get_all_for_bag_identifier(identifier)
.map { |se| convert_to_struct(se) }
end

# https://sequel.jeremyevans.net/rdoc/classes/Sequel/SQL/Window.html
def get_latest_event_for_bags(start_time:)
base_query.where { timestamp >= start_time }
.select_append { row_number.function.over(partition: :bag_id, order: Sequel.desc(:timestamp)).as(:rn) }
.from_self.where(rn: 1)
.all.map { |se| convert_to_struct(se) }
end

def get_latest_event_for_bag(bag_identifier:)
event = base_query
.where(bag: DatabaseSchema::Bag.where(identifier: bag_identifier))
Expand Down
8 changes: 8 additions & 0 deletions prometheus.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
scrape_configs:
- job_name: darkbluejob
honor_labels: true
honor_timestamps: true
scrape_interval: 50ms
metrics_path: /metrics
static_configs:
- targets: ['pushgateway:9091']
25 changes: 18 additions & 7 deletions run_dark_blue.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@

require_relative "lib/archivematica"
require_relative "lib/bag_repository"
require_relative "lib/bag_validator"
require_relative "lib/data_transfer"
require_relative "lib/dispatcher"
require_relative "lib/metrics"
require_relative "lib/remote_client"
require_relative "lib/repository_package_repository"
require_relative "lib/status_event_repository"
require_relative "lib/bag_validator"

class DarkBlueError < StandardError
end
Expand All @@ -35,7 +36,7 @@ def initialize(config)
type: config.aptrust.remote.type,
settings: config.aptrust.remote.settings
),
status_event_repo: StatusEventRepository::StatusEventRepositoryFactory.for(use_db: DB),
status_event_repo: S.status_event_repo,
bag_repo: BagRepository::BagRepositoryFactory.for(use_db: DB)
)
@arch_configs = config.dark_blue.archivematicas
Expand Down Expand Up @@ -201,8 +202,18 @@ def self.parse(options)
dark_blue_job = DarkBlueJob.new(config)

options = DarkBlueParser.parse ARGV
if options.packages.length > 0
dark_blue_job.redeliver_packages(options.packages)
else
dark_blue_job.process
end

start_time, end_time = Metrics::Timer.time_processing {
if options.packages.length > 0
dark_blue_job.redeliver_packages(options.packages)
else
dark_blue_job.process
end
}
metrics = Metrics::MetricsProvider.new(
start_time: start_time,
end_time: end_time,
status_event_repo: S.status_event_repo,
push_gateway_url: config.metrics.push_gateway_url
)
metrics.set_all_metrics
5 changes: 5 additions & 0 deletions services.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,8 @@ def self.included(klass)
password: db_config.password,
fractional_seconds: true)
end

S.register(:status_event_repo) do
db = S.config.database && S.dbconnect
StatusEventRepository::StatusEventRepositoryFactory.for(use_db: db)
end
Loading

0 comments on commit b4ca813

Please sign in to comment.