From e3ee946743ff7d3e0955c01bbf36973fc677662d Mon Sep 17 00:00:00 2001 From: Johannes Haass Date: Wed, 16 Oct 2024 11:20:43 +0200 Subject: [PATCH] Clear job locks on generic workers after grace period is exceeded (#477) Introduces a configurable grace period after which the generic worker processes will be killed. This allows workers to finish their current job during e.g. an update without being killed after 15 seconds (bpm default). After the worker processes are stopped/killed pending locks will be cleared which allows other workers to pick up pending jobs. Before the locks would be only cleared after the job timeout has been exceeded (default 4 hours). This is based on the assumption that jobs processed on the generic workers are idempotent. --- .../templates/cloud_controller_ng.yml.erb | 3 - jobs/cloud_controller_worker/spec | 10 ++- .../templates/bin/cloud_controller_worker.erb | 6 +- .../templates/cloud_controller_ng.yml.erb | 3 - .../templates/drain.sh.erb | 14 +++- .../templates/shutdown_drain.rb.erb | 16 +++++ .../cloud_controller_ng_spec.rb | 11 ---- .../cloud_controller_worker_spec.rb | 11 ---- spec/cloud_controller_worker/drain_spec.rb | 66 +++++++++++++++++++ 9 files changed, 105 insertions(+), 35 deletions(-) create mode 100644 jobs/cloud_controller_worker/templates/shutdown_drain.rb.erb create mode 100644 spec/cloud_controller_worker/drain_spec.rb diff --git a/jobs/cloud_controller_ng/templates/cloud_controller_ng.yml.erb b/jobs/cloud_controller_ng/templates/cloud_controller_ng.yml.erb index c0b2367fc6..d9a30f521c 100644 --- a/jobs/cloud_controller_ng/templates/cloud_controller_ng.yml.erb +++ b/jobs/cloud_controller_ng/templates/cloud_controller_ng.yml.erb @@ -101,9 +101,6 @@ jobs: <% if_p("cc.jobs.priorities") do |priorities| %> priorities: <%= priorities.to_json %> <% end %> - <% if_p("cc.jobs.number_of_worker_threads") do |number_of_worker_threads| %> - number_of_worker_threads: <%= number_of_worker_threads %> - <% end %> cpu_weight_min_memory: <%= p("cc.cpu_weight_min_memory") %> diff --git a/jobs/cloud_controller_worker/spec b/jobs/cloud_controller_worker/spec index 102356474a..79a5cc3cd3 100644 --- a/jobs/cloud_controller_worker/spec +++ b/jobs/cloud_controller_worker/spec @@ -25,6 +25,7 @@ templates: droplets_ca_cert.pem.erb: config/certs/droplets_ca_cert.pem packages_ca_cert.pem.erb: config/certs/packages_ca_cert.pem resource_pool_ca_cert.pem.erb: config/certs/resource_pool_ca_cert.pem + shutdown_drain.rb.erb: bin/shutdown_drain mutual_tls_ca.crt.erb: config/certs/mutual_tls_ca.crt mutual_tls.crt.erb: config/certs/mutual_tls.crt mutual_tls.key.erb: config/certs/mutual_tls.key @@ -123,9 +124,6 @@ properties: description: "Maximum health check timeout (in seconds) that can be set for the app" cc.jobs.priorities: description: "List of hashes containing delayed jobs 'display_name' and its desired priority. This will overwrite the default priority of ccng" - cc.jobs.number_of_worker_threads: - description: "If set multiple delayed job workers will be started as threads in the same process. If not set there will be one delayed job worker per process." - cc.staging_upload_user: description: "User name used to access internal endpoints of Cloud Controller to upload files when staging" cc.staging_upload_password: @@ -418,6 +416,12 @@ properties: cc.jobs.generic.number_of_workers: default: 1 description: "Number of generic cloud_controller_worker workers" + cc.jobs.generic.number_of_worker_threads: + description: "Optional. Number of worker threads to start for each generic cloud_controller_worker worker process" + cc.jobs.generic.worker_grace_period_seconds: + description: "The number of seconds to wait for each generic cloud_controller_worker worker process to finish processing jobs before forcefully shutting it down" + default: 15 + uaa.clients.cc-service-dashboards.secret: description: "Used for generating SSO clients for service brokers." diff --git a/jobs/cloud_controller_worker/templates/bin/cloud_controller_worker.erb b/jobs/cloud_controller_worker/templates/bin/cloud_controller_worker.erb index d48eabbe85..f220aab114 100644 --- a/jobs/cloud_controller_worker/templates/bin/cloud_controller_worker.erb +++ b/jobs/cloud_controller_worker/templates/bin/cloud_controller_worker.erb @@ -14,4 +14,8 @@ export LD_PRELOAD=/var/vcap/packages/jemalloc/lib/libjemalloc.so <% end %> cd /var/vcap/packages/cloud_controller_ng/cloud_controller_ng -exec bundle exec rake jobs:generic[cc_global_worker.<%= spec.job.name %>.<%= spec.index %>.${INDEX}] + +<% num_threads = p("cc.jobs.generic.number_of_worker_threads", nil) %> +<% grace_period = p("cc.jobs.generic.worker_grace_period_seconds") %> + +exec bundle exec rake jobs:generic[cc_global_worker.<%= spec.job.name %>.<%= spec.index %>.${INDEX}<%= ",#{num_threads},#{grace_period.to_i - 1}" if num_threads %>] \ No newline at end of file diff --git a/jobs/cloud_controller_worker/templates/cloud_controller_ng.yml.erb b/jobs/cloud_controller_worker/templates/cloud_controller_ng.yml.erb index f6c116fa2e..62053c7d21 100644 --- a/jobs/cloud_controller_worker/templates/cloud_controller_ng.yml.erb +++ b/jobs/cloud_controller_worker/templates/cloud_controller_ng.yml.erb @@ -55,9 +55,6 @@ jobs: <% if_p("cc.jobs.priorities") do |priorities| %> priorities: <%= priorities.to_json %> <% end %> - <% if_p("cc.jobs.number_of_worker_threads") do |number_of_worker_threads| %> - number_of_worker_threads: <%= number_of_worker_threads %> - <% end %> default_app_memory: <%= p("cc.default_app_memory") %> default_app_disk_in_mb: <%= p("cc.default_app_disk_in_mb") %> diff --git a/jobs/cloud_controller_worker/templates/drain.sh.erb b/jobs/cloud_controller_worker/templates/drain.sh.erb index a24017af64..63dbcf270d 100755 --- a/jobs/cloud_controller_worker/templates/drain.sh.erb +++ b/jobs/cloud_controller_worker/templates/drain.sh.erb @@ -1,8 +1,16 @@ #!/usr/bin/env bash -for i in {1..<%=p("cc.jobs.generic.number_of_workers")%>}; do - /var/vcap/jobs/bpm/bin/bpm stop cloud_controller_worker -p "worker_${i}" 1>&2 -done +source /var/vcap/jobs/cloud_controller_worker/bin/ruby_version.sh +export CLOUD_CONTROLLER_NG_CONFIG=/var/vcap/jobs/cloud_controller_worker/config/cloud_controller_ng.yml + +/var/vcap/jobs/cloud_controller_worker/bin/shutdown_drain 1>&2 + +pushd /var/vcap/packages/cloud_controller_ng/cloud_controller_ng > /dev/null || exit 1 + for i in {1..<%=p("cc.jobs.generic.number_of_workers")%>}; do + # shellcheck disable=SC2093 + bundle exec rake jobs:clear_pending_locks[cc_global_worker.<%= spec.job.name %>.<%= spec.index %>."${i}"] 1>&2 + done +popd > /dev/null || exit 1 echo 0 # tell bosh not wait for anything exit 0 diff --git a/jobs/cloud_controller_worker/templates/shutdown_drain.rb.erb b/jobs/cloud_controller_worker/templates/shutdown_drain.rb.erb new file mode 100644 index 0000000000..6a1c77fd4a --- /dev/null +++ b/jobs/cloud_controller_worker/templates/shutdown_drain.rb.erb @@ -0,0 +1,16 @@ +#!/var/vcap/packages/ruby-3.2/bin/ruby --disable-all + +$LOAD_PATH.unshift('/var/vcap/packages/cloud_controller_ng/cloud_controller_ng/app') +$LOAD_PATH.unshift('/var/vcap/packages/cloud_controller_ng/cloud_controller_ng/lib') + +require 'cloud_controller/drain' + +@threads = [] +@grace_period = <%= p("cc.jobs.generic.worker_grace_period_seconds") %> +@drain = VCAP::CloudController::Drain.new('/var/vcap/sys/log/cloud_controller_worker') + +(1..<%= p("cc.jobs.generic.number_of_workers") %>).each do |i| + @threads << Thread.new { @drain.shutdown_delayed_worker("/var/vcap/sys/run/bpm/cloud_controller_worker/worker_#{i}.pid", @grace_period.to_i) } +end + +@threads.each(&:join) \ No newline at end of file diff --git a/spec/cloud_controller_ng/cloud_controller_ng_spec.rb b/spec/cloud_controller_ng/cloud_controller_ng_spec.rb index 20bf3b7185..4c8e0e0d47 100644 --- a/spec/cloud_controller_ng/cloud_controller_ng_spec.rb +++ b/spec/cloud_controller_ng/cloud_controller_ng_spec.rb @@ -711,17 +711,6 @@ module Test end end - describe 'cc_jobs_number_of_worker_threads' do - context "when 'cc.jobs.number_of_worker_threads' is set" do - before { merged_manifest_properties['cc']['jobs'] = { 'number_of_worker_threads' => 7 } } - - it 'renders the correct value into the ccng config' do - template_hash = YAML.safe_load(template.render(merged_manifest_properties, consumes: links)) - expect(template_hash['jobs']['number_of_worker_threads']).to eq(7) - end - end - end - describe 'cc_jobs_queues' do context 'when cc.jobs.queues is not set' do it 'does not render ccng config' do diff --git a/spec/cloud_controller_worker/cloud_controller_worker_spec.rb b/spec/cloud_controller_worker/cloud_controller_worker_spec.rb index 28b1b5505c..8570269cb9 100644 --- a/spec/cloud_controller_worker/cloud_controller_worker_spec.rb +++ b/spec/cloud_controller_worker/cloud_controller_worker_spec.rb @@ -250,17 +250,6 @@ module Test end end - describe 'cc_jobs_number_of_worker_threads' do - context "when 'cc.jobs.number_of_worker_threads' is set" do - before { manifest_properties['cc']['jobs'] = { 'number_of_worker_threads' => 7 } } - - it 'renders the correct value into the ccng config' do - template_hash = YAML.safe_load(template.render(manifest_properties, consumes: links)) - expect(template_hash['jobs']['number_of_worker_threads']).to eq(7) - end - end - end - describe 'cc_jobs_queues' do context 'when cc.jobs.queues is not set' do it 'does not render ccng config' do diff --git a/spec/cloud_controller_worker/drain_spec.rb b/spec/cloud_controller_worker/drain_spec.rb new file mode 100644 index 0000000000..87390cb022 --- /dev/null +++ b/spec/cloud_controller_worker/drain_spec.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +require 'rspec' +require 'bosh/template/test' + +module Bosh + module Template + module Test + describe 'drain template rendering' do + let(:release_path) { File.join(File.dirname(__FILE__), '../..') } + let(:release) { ReleaseDir.new(release_path) } + let(:job) { release.job('cloud_controller_worker') } + + describe 'bin/shutdown_drain' do + let(:template) { job.template('bin/shutdown_drain') } + + it 'renders the default value' do + rendered_file = template.render({}, consumes: {}) + expect(rendered_file).to include('@grace_period = 15') + end + + context "when 'worker_grace_period_seconds' is provided" do + it 'renders the provided value' do + rendered_file = template.render({ 'cc' => { 'jobs' => { 'generic' => { 'worker_grace_period_seconds' => 60 } } } }, consumes: {}) + expect(rendered_file).to include('@grace_period = 60') + end + end + + it 'renders the default number of workers' do + rendered_file = template.render({}, consumes: {}) + expect(rendered_file).to include('(1..1).each do |i|') + end + + context "when 'number_of_workers' is provided" do + it 'renders the provided number of workers' do + rendered_file = template.render({ 'cc' => { 'jobs' => { 'generic' => { 'number_of_workers' => 5 } } } }, consumes: {}) + expect(rendered_file).to include('(1..5).each do |i|') + end + end + end + + describe 'bin/drain' do + let(:template) { job.template('bin/drain') } + + it 'renders the default number of workers' do + rendered_file = template.render({}, consumes: {}) + expect(rendered_file).to include('for i in {1..1}; do') + end + + context "when 'number_of_workers' is provided" do + it 'renders the provided number of workers' do + rendered_file = template.render({ 'cc' => { 'jobs' => { 'generic' => { 'number_of_workers' => 5 } } } }, consumes: {}) + expect(rendered_file).to include('for i in {1..5}; do') + end + end + + it 'renders the job name and index' do + rendered_file = template.render({ 'job_name' => 'cc-worker' }, consumes: {}) + # Default job name is 'me' in tests (bosh-template) + expect(rendered_file).to include('bundle exec rake jobs:clear_pending_locks[cc_global_worker.me.0."${i}"]') + end + end + end + end + end +end