From 662e6aef88127f886e08c0bd34c9c02427163f0f Mon Sep 17 00:00:00 2001
From: carolyncole <1599081+carolyncole@users.noreply.github.com>
Date: Tue, 12 Dec 2023 08:27:33 -0500
Subject: [PATCH] Adding a method to just count the files instead of fully
 processing them (#1632)

also cache responses so that counting and getting files can utilize the
same AWS data
---
 app/models/work.rb                     |  4 ++
 app/services/s3_query_service.rb       | 63 +++++++++++-------
 app/views/works/_s3_resources.html.erb |  4 +-
 app/views/works/show.html.erb          |  2 +-
 spec/services/s3_query_service_spec.rb | 91 +++++++++++++++++++-------
 spec/support/s3_query_service_specs.rb |  2 +-
 spec/system/provenance_note_spec.rb    |  4 ++
 7 files changed, 120 insertions(+), 50 deletions(-)

diff --git a/app/models/work.rb b/app/models/work.rb
index 3399f0a1..3db42497 100644
--- a/app/models/work.rb
+++ b/app/models/work.rb
@@ -472,6 +472,10 @@ def embargoed?
     embargo_date >= current_date
   end

+  def upload_count
+    @upload_count ||= s3_query_service.count_objects
+  end
+
   protected

     def work_validator
diff --git a/app/services/s3_query_service.rb b/app/services/s3_query_service.rb
index 8cc39376..e3e2e6ee 100644
--- a/app/services/s3_query_service.rb
+++ b/app/services/s3_query_service.rb
@@ -40,6 +40,7 @@ def initialize(model, mode = "precuration")
     @mode = mode
     @part_size = 5_368_709_120 # 5GB is the maximum part size for AWS
     @last_response = nil
+    @s3_responses = {}
   end

   def config
@@ -166,13 +167,19 @@ def find_s3_file(filename:)
   # Retrieve the S3 resources uploaded to the S3 Bucket
   # @return [Array]
   def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix, ignore_directories: true)
-    @client_s3_files = nil if reload # force a reload
+    if reload # force a reload
+      @client_s3_files = nil
+      clear_s3_responses(bucket_name:, prefix:)
+    end
     @client_s3_files ||= get_s3_objects(bucket_name:, prefix:, ignore_directories:)
   end

   def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
-    @client_s3_empty_files = nil if reload # force a reload
-    @client_s3_empty_files = begin
+    if reload # force a reload
+      @client_s3_empty_files = nil
+      clear_s3_responses(bucket_name:, prefix:)
+    end
+    @client_s3_empty_files ||= begin
       files_and_directories = get_s3_objects(bucket_name:, prefix:, ignore_directories: false)
       files_and_directories.select { |object| !object.filename.ends_with?("/") && object.empty? }
     end
@@ -322,14 +329,41 @@ def md5(io:)
     md5.base64digest
   end

+  def count_objects(bucket_name: self.bucket_name, prefix: self.prefix)
+    responses = s3_responses(bucket_name:, prefix:)
+    responses.reduce(0) { |total, resp| total + resp.key_count }
+  end
+
   private

+    def clear_s3_responses(bucket_name:, prefix:)
+      key = "#{bucket_name} #{prefix}"
+      @s3_responses[key] = nil
+    end
+
+    def s3_responses(bucket_name:, prefix:)
+      key = "#{bucket_name} #{prefix}"
+      responses = @s3_responses[key]
+      if responses.nil?
+        resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
+        responses = [resp]
+        while resp.is_truncated
+          resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: resp.next_continuation_token })
+          responses << resp
+        end
+        @s3_responses[key] = responses
+      end
+      responses
+    end
+
     def get_s3_objects(bucket_name:, prefix:, ignore_directories:)
       start = Time.zone.now
-      resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
-      resp_hash = resp.to_h
-      objects = parse_objects(resp_hash, ignore_directories:)
-      objects += parse_continuation(resp_hash, bucket_name:, prefix:, ignore_directories:)
+      responses = s3_responses(bucket_name:, prefix:)
+      objects = responses.reduce([]) do |all_objects, resp|
+        resp_hash = resp.to_h
+        resp_objects = parse_objects(resp_hash, ignore_directories:)
+        all_objects + resp_objects
+      end
       elapsed = Time.zone.now - start
       Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
       objects
@@ -347,21 +381,6 @@ def parse_objects(resp, ignore_directories: true)
       objects
     end

-    def parse_continuation(resp_hash, bucket_name: self.bucket_name, prefix: self.prefix, ignore_directories: true)
-      objects = []
-      while resp_hash[:is_truncated]
-        token = resp_hash[:next_continuation_token]
-        resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: token })
-        resp_hash = resp.to_h
-        objects += parse_objects(resp_hash, ignore_directories:)
-      end
-      objects
-    rescue Aws::Errors::ServiceError => aws_service_error
-      message = "An error was encountered when requesting to list the AWS S3 Objects in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}"
-      Rails.logger.error(message)
-      raise aws_service_error
-    end
-
     def upload_multipart_file(target_bucket:, target_key:, size:, io:)
       multi = client.create_multipart_upload(bucket: target_bucket, key: target_key)
       part_num = 0
diff --git a/app/views/works/_s3_resources.html.erb b/app/views/works/_s3_resources.html.erb
index ef583704..d9b83904 100644
--- a/app/views/works/_s3_resources.html.erb
+++ b/app/views/works/_s3_resources.html.erb
@@ -2,7 +2,7 @@
-    <%= "#{@work.uploads.count} #{'File'.pluralize(@work.uploads.count)}"%> in
+    <%= "#{@work.upload_count} #{'File'.pluralize(@work.upload_count)} or #{'Directory'.pluralize(@work.upload_count)}"%> in
     <%= @work.approved? ? "post-curation" : "pre-curation" %> storage
@@ -15,7 +15,7 @@
-      <% if @work.uploads.empty? %>
+      <% if @work.upload_count.zero? %>
         <%= t('works.form.curation_uploads.empty') %>
diff --git a/app/views/works/show.html.erb b/app/views/works/show.html.erb
index 973818ea..bfceccfa 100644
--- a/app/views/works/show.html.erb
+++ b/app/views/works/show.html.erb
@@ -270,7 +270,7 @@
       Location
-      <% if @work.pre_curation_uploads_fast.present? %>
+      <% if @work.upload_count > 0 %>
         Amazon S3 Curation Storage
       <% elsif @work.files_location_cluster? %>
         PUL Research Cluster
diff --git a/spec/services/s3_query_service_spec.rb b/spec/services/s3_query_service_spec.rb
index 723d08c8..59dea011 100644
--- a/spec/services/s3_query_service_spec.rb
+++ b/spec/services/s3_query_service_spec.rb
@@ -110,7 +110,7 @@
   it "takes a DOI and returns information about that DOI in S3" do
     fake_aws_client = double(Aws::S3::Client)
     s3_query_service.stub(:client).and_return(fake_aws_client)
-    fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
+    fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, is_truncated: false)
     fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
     fake_s3_resp.stub(:to_h).and_return(s3_hash)
@@ -127,11 +127,10 @@
   it "takes a DOI and returns information about that DOI in S3 with pagination" do
     fake_aws_client = double(Aws::S3::Client)
     s3_query_service.stub(:client).and_return(fake_aws_client)
-    fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
+    fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, next_continuation_token: "abc123")
     fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
-    s3_hash_truncated = s3_hash.clone
-    s3_hash_truncated[:is_truncated] = true
-    fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
+    fake_s3_resp.stub(:is_truncated).and_return(true, false)
+    fake_s3_resp.stub(:to_h).and_return(s3_hash)

     data_profile = s3_query_service.data_profile
     expect(data_profile[:objects]).to be_instance_of(Array)
@@ -169,7 +168,7 @@
     let(:fake_multi) { instance_double(Aws::S3::Types::CreateMultipartUploadOutput, key: "abc", upload_id: "upload id", bucket: "bucket") }
     let(:fake_parts) { instance_double(Aws::S3::Types::CopyPartResult, etag: "etag123abc", checksum_sha256: "sha256abc123") }
     let(:fake_upload) { instance_double(Aws::S3::Types::UploadPartCopyOutput, copy_part_result: fake_parts) }
-    let(:fake_s3_resp) { double(Aws::S3::Types::ListObjectsV2Output) }
+    let(:fake_s3_resp) { double(Aws::S3::Types::ListObjectsV2Output, is_truncated: false) }
     let(:preservation_service) { instance_double(WorkPreservationService) }

     before do
@@ -373,6 +372,28 @@
         end
       end
     end
+
+    describe "#count_objects" do
+      before do
+        fake_s3_resp.stub(:key_count).and_return(s3_hash[:contents].count)
+        fake_s3_resp.stub(:is_truncated).and_return(false)
+      end
+
+      it "returns all the objects" do
+        expect(s3_query_service.count_objects).to eq(3)
+      end
+
+      context "truncated results" do
+        before do
+          fake_s3_resp.stub(:key_count).and_return(s3_hash[:contents].count)
+          fake_s3_resp.stub(:is_truncated).and_return(true, false)
+          fake_s3_resp.stub(:next_continuation_token).and_return("abc")
+        end
+        it "returns all the objects" do
+          expect(s3_query_service.count_objects).to eq(6)
+        end
+      end
+    end
   end

   context "post curated" do
@@ -381,7 +402,7 @@
     it "keeps precurated and post curated items separate" do
       fake_aws_client = double(Aws::S3::Client)
       s3_query_service.stub(:client).and_return(fake_aws_client)
-      fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
+      fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, is_truncated: false)
       fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
       fake_s3_resp.stub(:to_h).and_return(s3_hash)
@@ -666,11 +687,10 @@
     before do
       s3_query_service.stub(:client).and_return(fake_aws_client)
-      fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
+      fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, next_continuation_token: "abc123")
       fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
-      s3_hash_truncated = s3_hash.clone
-      s3_hash_truncated[:is_truncated] = true
-      fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
+      fake_s3_resp.stub(:is_truncated).and_return(true, false, true, false)
+      fake_s3_resp.stub(:to_h).and_return(s3_hash)
     end

     it "it retrieves the files for the work" do
@@ -681,18 +701,31 @@
       files = s3_query_service.client_s3_files
       expect(files.count).to eq 4
       expect(files.first.filename).to match(/README/)
       expect(files[1].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
       expect(files[2].filename).to match(/README/)
       expect(files[3].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
       expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
-      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: nil, max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
+    end
+
+    it "it retrieves the files for a bucket and prefix once" do
+      s3_query_service.client_s3_files(bucket_name: "other-bucket", prefix: "new-prefix")
+      files = s3_query_service.client_s3_files(bucket_name: "other-bucket", prefix: "new-prefix")
+      expect(files.count).to eq 4
+      expect(files.first.filename).to match(/README/)
+      expect(files[1].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
+      expect(files[2].filename).to match(/README/)
+      expect(files[3].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").once
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").once
     end

-    it "it retrieves the files for a bucket and prefix" do
+    it "it retrieves the files for a bucket and prefix any time reload is true" do
+      s3_query_service.client_s3_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
       files = s3_query_service.client_s3_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
       expect(files.count).to eq 4
       expect(files.first.filename).to match(/README/)
       expect(files[1].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
       expect(files[2].filename).to match(/README/)
       expect(files[3].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
-      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
-      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").twice
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").twice
     end

     it "retrieves the directories if requested" do
       files = s3_query_service.client_s3_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix", ignore_directories: false)
       expect(files.count).to eq 6
@@ -705,7 +738,7 @@
       expect(files[4].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
       expect(files[5].filename).to match(/directory/)
       expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
-      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix")
     end
   end

@@ -715,11 +748,10 @@
     before do
       s3_query_service.stub(:client).and_return(fake_aws_client)
-      fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
+      fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, next_continuation_token: "abc123")
       fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
-      s3_hash_truncated = s3_hash.clone
-      s3_hash_truncated[:is_truncated] = true
-      fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
+      fake_s3_resp.stub(:is_truncated).and_return(true, false, true, false)
+      fake_s3_resp.stub(:to_h).and_return(s3_hash)
     end

     it "it retrieves the files for the work" do
@@ -728,16 +760,27 @@
       files = s3_query_service.client_s3_empty_files
       expect(files.count).to eq 2
       expect(files.first.filename).to eq(s3_key2)
       expect(files[1].filename).to match(s3_key2)
       expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
-      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: nil, max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
     end

-    it "it retrieves the files for a bucket and prefix" do
+    it "it retrieves the files for a bucket and prefix once" do
+      s3_query_service.client_s3_empty_files(bucket_name: "other-bucket", prefix: "new-prefix")
+      files = s3_query_service.client_s3_empty_files(bucket_name: "other-bucket", prefix: "new-prefix")
+      expect(files.count).to eq 2
+      expect(files.first.filename).to eq(s3_key2)
+      expect(files[1].filename).to eq(s3_key2)
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").once
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").once
+    end
+
+    it "it retrieves the files for a bucket and prefix any time reload is true" do
+      s3_query_service.client_s3_empty_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
       files = s3_query_service.client_s3_empty_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
       expect(files.count).to eq 2
       expect(files.first.filename).to eq(s3_key2)
       expect(files[1].filename).to eq(s3_key2)
-      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
-      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").twice
+      expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").twice
     end
   end
diff --git a/spec/support/s3_query_service_specs.rb b/spec/support/s3_query_service_specs.rb
index dde4dda6..79a232cd 100644
--- a/spec/support/s3_query_service_specs.rb
+++ b/spec/support/s3_query_service_specs.rb
@@ -7,7 +7,7 @@ def stub_s3(data: [], bucket_url: nil, prefix: "10.34770/123-abc/1", bucket_name
     allow(@s3_client).to receive(:put_object)
     fake_s3_query = instance_double(S3QueryService, data_profile: { objects: data, ok: true }, client: @s3_client,
-                                                    client_s3_files: data, prefix:)
+                                                    client_s3_files: data, prefix:, count_objects: data.count)
     mock_methods(fake_s3_query, data, bucket_name)
     allow(S3QueryService).to receive(:new).and_return(fake_s3_query)
diff --git a/spec/system/provenance_note_spec.rb b/spec/system/provenance_note_spec.rb
index 882214f7..b665d3bc 100644
--- a/spec/system/provenance_note_spec.rb
+++ b/spec/system/provenance_note_spec.rb
@@ -2,6 +2,10 @@
 require "rails_helper"

 RSpec.describe "Adding a Provenance note", type: :system, js: true do
+  before do
+    stub_s3
+  end
+
   context "A user that does not have permissions to edit the provenance" do
     let(:any_user) { FactoryBot.create(:user) }
     let(:work) { FactoryBot.create(:draft_work, created_by_user_id: any_user.id) }
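
A minimal standalone sketch of the caching pattern this patch introduces
(the class and method names below are hypothetical; list_objects_v2,
is_truncated, next_continuation_token, key_count, and contents are real
AWS SDK for Ruby APIs): ListObjectsV2 response pages are fetched once per
bucket/prefix pair, so counting objects and building the file list reuse
the same data.

    require "aws-sdk-s3"

    # Hypothetical stand-in for S3QueryService#s3_responses and #count_objects.
    class CachedObjectLister
      def initialize(client: Aws::S3::Client.new)
        @client = client
        @pages = {} # ListObjectsV2 response pages cached per "bucket prefix" key
      end

      # Follows continuation tokens until the listing is no longer truncated,
      # then caches every page for this bucket/prefix pair.
      def pages_for(bucket:, prefix:)
        @pages["#{bucket} #{prefix}"] ||= begin
          page = @client.list_objects_v2(bucket: bucket, prefix: prefix, max_keys: 1000)
          pages = [page]
          while page.is_truncated
            page = @client.list_objects_v2(bucket: bucket, prefix: prefix, max_keys: 1000,
                                           continuation_token: page.next_continuation_token)
            pages << page
          end
          pages
        end
      end

      # Cheap count: sums the per-page key_count without materializing file objects.
      def count_objects(bucket:, prefix:)
        pages_for(bucket:, prefix:).sum(&:key_count)
      end

      # The full listing reuses the same cached pages.
      def objects(bucket:, prefix:)
        pages_for(bucket:, prefix:).flat_map(&:contents)
      end
    end

Under a scheme like this, a view that renders both the file count and the
file list, as show.html.erb and _s3_resources.html.erb do in this patch,
triggers only one round of ListObjectsV2 calls per bucket/prefix pair.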