Skip to content

Commit

Permalink
Adding a method to just count the files instead of fully processing them (#1632)
Browse files Browse the repository at this point in the history

also cache responses so that counting and getting files can utilize the same AWS data
  • Loading branch information
carolyncole authored Dec 12, 2023
1 parent 1f52c04 commit 662e6ae
Show file tree
Hide file tree
Showing 7 changed files with 120 additions and 50 deletions.
4 changes: 4 additions & 0 deletions app/models/work.rb
Original file line number Diff line number Diff line change
Expand Up @@ -472,6 +472,10 @@ def embargoed?
embargo_date >= current_date
end

# The number of objects stored in S3 for this work.
# Memoized so repeated calls (e.g. several uses in one view render) do not
# trigger another S3 query; delegates the counting to the query service.
# @return [Integer]
def upload_count
  @upload_count = s3_query_service.count_objects if @upload_count.nil?
  @upload_count
end

protected

def work_validator
Expand Down
63 changes: 41 additions & 22 deletions app/services/s3_query_service.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def initialize(model, mode = "precuration")
@mode = mode
@part_size = 5_368_709_120 # 5GB is the maximum part size for AWS
@last_response = nil
@s3_responses = {}
end

def config
Expand Down Expand Up @@ -166,13 +167,19 @@ def find_s3_file(filename:)
# Retrieve the S3 resources uploaded to the S3 Bucket, memoized per service
# instance.
# @param reload [Boolean] when true, discard both the memoized file list and
#   the cached raw AWS responses so the objects are fetched from S3 again
# @param bucket_name [String] bucket to list (defaults to the service's bucket)
# @param prefix [String] key prefix to list under (defaults to the service's prefix)
# @param ignore_directories [Boolean] when true, directory placeholder keys are excluded
# @return [Array<S3File>]
def client_s3_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix, ignore_directories: true)
  if reload # force a reload
    @client_s3_files = nil
    clear_s3_responses(bucket_name:, prefix:)
  end
  @client_s3_files ||= get_s3_objects(bucket_name:, prefix:, ignore_directories:)
end

def client_s3_empty_files(reload: false, bucket_name: self.bucket_name, prefix: self.prefix)
@client_s3_empty_files = nil if reload # force a reload
@client_s3_empty_files = begin
if reload # force a reload
@client_s3_empty_files = nil
clear_s3_responses(bucket_name:, prefix:)
end
@client_s3_empty_files ||= begin
files_and_directories = get_s3_objects(bucket_name:, prefix:, ignore_directories: false)
files_and_directories.select { |object| !object.filename.ends_with?("/") && object.empty? }
end
Expand Down Expand Up @@ -322,14 +329,41 @@ def md5(io:)
md5.base64digest
end

# Count the S3 objects under the given bucket/prefix without materializing
# S3File instances: each cached list_objects_v2 page reports its own
# key_count, so the total is just the sum over the pages.
# @param bucket_name [String]
# @param prefix [String]
# @return [Integer] total number of keys across all pages
def count_objects(bucket_name: self.bucket_name, prefix: self.prefix)
  s3_responses(bucket_name:, prefix:).sum(&:key_count)
end

private

# Drop the cached list_objects_v2 responses for the given bucket/prefix pair
# so the next lookup re-fetches the pages from AWS.
def clear_s3_responses(bucket_name:, prefix:)
  @s3_responses["#{bucket_name} #{prefix}"] = nil
end

# Fetch (and cache) the raw list_objects_v2 responses for a bucket/prefix
# pair, following continuation tokens until the listing is complete. The
# cached pages let #count_objects and #get_s3_objects share the same AWS
# data instead of re-listing the bucket.
# @param bucket_name [String]
# @param prefix [String]
# @return [Array] one response object per fetched page
# @raise [Aws::Errors::ServiceError] re-raised after logging
def s3_responses(bucket_name:, prefix:)
  key = "#{bucket_name} #{prefix}"
  responses = @s3_responses[key]
  if responses.nil?
    resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
    responses = [resp]
    while resp.is_truncated
      resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: resp.next_continuation_token })
      responses << resp
    end
    @s3_responses[key] = responses
  end
  responses
rescue Aws::Errors::ServiceError => aws_service_error
  # Restores the logging/re-raise that lived in the removed #parse_continuation,
  # so AWS listing failures are still recorded with bucket/prefix context.
  message = "An error was encountered when requesting to list the AWS S3 Objects in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}"
  Rails.logger.error(message)
  raise aws_service_error
end

def get_s3_objects(bucket_name:, prefix:, ignore_directories:)
start = Time.zone.now
resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix: })
resp_hash = resp.to_h
objects = parse_objects(resp_hash, ignore_directories:)
objects += parse_continuation(resp_hash, bucket_name:, prefix:, ignore_directories:)
responses = s3_responses(bucket_name:, prefix:)
objects = responses.reduce([]) do |all_objects, resp|
resp_hash = resp.to_h
resp_objects = parse_objects(resp_hash, ignore_directories:)
all_objects + resp_objects
end
elapsed = Time.zone.now - start
Rails.logger.info("Loading S3 objects. Bucket: #{bucket_name}. Prefix: #{prefix}. Elapsed: #{elapsed} seconds")
objects
Expand All @@ -347,21 +381,6 @@ def parse_objects(resp, ignore_directories: true)
objects
end

# Follow list_objects_v2 pagination: while the response hash is flagged
# truncated, request the next page with its continuation token and collect
# the parsed S3File objects from every page.
# NOTE(review): this commit removes this method; #s3_responses now caches the
# raw page responses instead, and parsing happens in #get_s3_objects.
# @param resp_hash [Hash] the #to_h of the first list_objects_v2 response
# @return [Array<S3File>] objects from all continuation pages (not the first page)
# @raise [Aws::Errors::ServiceError] re-raised after logging
def parse_continuation(resp_hash, bucket_name: self.bucket_name, prefix: self.prefix, ignore_directories: true)
objects = []
while resp_hash[:is_truncated]
token = resp_hash[:next_continuation_token]
resp = client.list_objects_v2({ bucket: bucket_name, max_keys: 1000, prefix:, continuation_token: token })
# replace the hash so the loop condition tracks the newest page
resp_hash = resp.to_h
objects += parse_objects(resp_hash, ignore_directories:)
end
objects
rescue Aws::Errors::ServiceError => aws_service_error
message = "An error was encountered when requesting to list the AWS S3 Objects in the bucket #{bucket_name} with the key #{prefix}: #{aws_service_error}"
Rails.logger.error(message)
raise aws_service_error
end

def upload_multipart_file(target_bucket:, target_key:, size:, io:)
multi = client.create_multipart_upload(bucket: target_bucket, key: target_key)
part_num = 0
Expand Down
4 changes: 2 additions & 2 deletions app/views/works/_s3_resources.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
<div class="lux">
<div class="card">
<div class="files card-body">
<h3><%= "#{@work.uploads.count} #{'File'.pluralize(@work.uploads.count)}"%> in
<h3><%= "#{@work.upload_count} #{'File'.pluralize(@work.upload_count)} or #{'Directory'.pluralize(@work.upload_count)}"%> in
<%= @work.approved? ? "post-curation" : "pre-curation" %> storage
</h3>
<table id="files-table" class="table">
Expand All @@ -15,7 +15,7 @@
</tr>
</thead>
<tbody>
<% if @work.uploads.empty? %>
<% if @work.upload_count.zero? %>
<tr class="files">
<td><span><%= t('works.form.curation_uploads.empty') %></span></td>
<td><span></span></td>
Expand Down
2 changes: 1 addition & 1 deletion app/views/works/show.html.erb
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@

<dt>Location</dt>
<dd>
<% if @work.pre_curation_uploads_fast.present? %>
<% if @work.upload_count > 0 %>
Amazon S3 Curation Storage
<% elsif @work.files_location_cluster? %>
PUL Research Cluster
Expand Down
91 changes: 67 additions & 24 deletions spec/services/s3_query_service_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
it "takes a DOI and returns information about that DOI in S3" do
fake_aws_client = double(Aws::S3::Client)
s3_query_service.stub(:client).and_return(fake_aws_client)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, is_truncated: false)
fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
fake_s3_resp.stub(:to_h).and_return(s3_hash)

Expand All @@ -127,11 +127,10 @@
it "takes a DOI and returns information about that DOI in S3 with pagination" do
fake_aws_client = double(Aws::S3::Client)
s3_query_service.stub(:client).and_return(fake_aws_client)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, next_continuation_token: "abc123")
fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
s3_hash_truncated = s3_hash.clone
s3_hash_truncated[:is_truncated] = true
fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
fake_s3_resp.stub(:is_truncated).and_return(true, false)
fake_s3_resp.stub(:to_h).and_return(s3_hash)

data_profile = s3_query_service.data_profile
expect(data_profile[:objects]).to be_instance_of(Array)
Expand Down Expand Up @@ -169,7 +168,7 @@
let(:fake_multi) { instance_double(Aws::S3::Types::CreateMultipartUploadOutput, key: "abc", upload_id: "upload id", bucket: "bucket") }
let(:fake_parts) { instance_double(Aws::S3::Types::CopyPartResult, etag: "etag123abc", checksum_sha256: "sha256abc123") }
let(:fake_upload) { instance_double(Aws::S3::Types::UploadPartCopyOutput, copy_part_result: fake_parts) }
let(:fake_s3_resp) { double(Aws::S3::Types::ListObjectsV2Output) }
let(:fake_s3_resp) { double(Aws::S3::Types::ListObjectsV2Output, is_truncated: false) }
let(:preservation_service) { instance_double(WorkPreservationService) }

before do
Expand Down Expand Up @@ -373,6 +372,28 @@
end
end
end

describe "#count_objects" do
before do
fake_s3_resp.stub(:key_count).and_return(s3_hash[:contents].count)
fake_s3_resp.stub(:is_truncated).and_return(false)
end

it "returns all the objects" do
expect(s3_query_service.count_objects).to eq(3)
end

context "truncated results" do
before do
fake_s3_resp.stub(:key_count).and_return(s3_hash[:contents].count)
fake_s3_resp.stub(:is_truncated).and_return(true, false)
fake_s3_resp.stub(:next_continuation_token).and_return("abc")
end
it "returns all the objects" do
expect(s3_query_service.count_objects).to eq(6)
end
end
end
end

context "post curated" do
Expand All @@ -381,7 +402,7 @@
it "keeps precurated and post curated items separate" do
fake_aws_client = double(Aws::S3::Client)
s3_query_service.stub(:client).and_return(fake_aws_client)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, is_truncated: false)
fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
fake_s3_resp.stub(:to_h).and_return(s3_hash)

Expand Down Expand Up @@ -666,11 +687,10 @@

before do
s3_query_service.stub(:client).and_return(fake_aws_client)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, next_continuation_token: "abc123")
fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
s3_hash_truncated = s3_hash.clone
s3_hash_truncated[:is_truncated] = true
fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
fake_s3_resp.stub(:is_truncated).and_return(true, false, true, false)
fake_s3_resp.stub(:to_h).and_return(s3_hash)
end

it "it retrieves the files for the work" do
Expand All @@ -681,18 +701,31 @@
expect(files[2].filename).to match(/README/)
expect(files[3].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: nil, max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
end

it "it retrieves the files for a bucket and prefix once" do
s3_query_service.client_s3_files(bucket_name: "other-bucket", prefix: "new-prefix")
files = s3_query_service.client_s3_files(bucket_name: "other-bucket", prefix: "new-prefix")
expect(files.count).to eq 4
expect(files.first.filename).to match(/README/)
expect(files[1].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
expect(files[2].filename).to match(/README/)
expect(files[3].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").once
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").once
end

it "it retrieves the files for a bucket and prefix" do
it "it retrieves the files for a bucket and prefix any time reload is true" do
s3_query_service.client_s3_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
files = s3_query_service.client_s3_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
expect(files.count).to eq 4
expect(files.first.filename).to match(/README/)
expect(files[1].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
expect(files[2].filename).to match(/README/)
expect(files[3].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").twice
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").twice
end

it "retrieves the directories if requested" do
Expand All @@ -705,7 +738,7 @@
expect(files[4].filename).to match(/SCoData_combined_v1_2020-07_datapackage.json/)
expect(files[5].filename).to match(/directory/)
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix")
end
end

Expand All @@ -715,11 +748,10 @@

before do
s3_query_service.stub(:client).and_return(fake_aws_client)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output)
fake_s3_resp = double(Aws::S3::Types::ListObjectsV2Output, next_continuation_token: "abc123")
fake_aws_client.stub(:list_objects_v2).and_return(fake_s3_resp)
s3_hash_truncated = s3_hash.clone
s3_hash_truncated[:is_truncated] = true
fake_s3_resp.stub(:to_h).and_return(s3_hash_truncated, s3_hash)
fake_s3_resp.stub(:is_truncated).and_return(true, false, true, false)
fake_s3_resp.stub(:to_h).and_return(s3_hash)
end

it "it retrieves the files for the work" do
Expand All @@ -728,16 +760,27 @@
expect(files.first.filename).to eq(s3_key2)
expect(files[1].filename).to match(s3_key2)
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: nil, max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "example-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "10.34770/pe9w-x904/#{work.id}/")
end

it "it retrieves the files for a bucket and prefix" do
it "it retrieves the files for a bucket and prefix once" do
s3_query_service.client_s3_empty_files(bucket_name: "other-bucket", prefix: "new-prefix")
files = s3_query_service.client_s3_empty_files(bucket_name: "other-bucket", prefix: "new-prefix")
expect(files.count).to eq 2
expect(files.first.filename).to eq(s3_key2)
expect(files[1].filename).to eq(s3_key2)
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").once
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").once
end

it "it retrieves the files for a bucket and prefix any time reload is true" do
s3_query_service.client_s3_empty_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
files = s3_query_service.client_s3_empty_files(reload: true, bucket_name: "other-bucket", prefix: "new-prefix")
expect(files.count).to eq 2
expect(files.first.filename).to eq(s3_key2)
expect(files[1].filename).to eq(s3_key2)
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: nil, max_keys: 1000, prefix: "new-prefix")
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", max_keys: 1000, prefix: "new-prefix").twice
expect(fake_aws_client).to have_received(:list_objects_v2).with(bucket: "other-bucket", continuation_token: "abc123", max_keys: 1000, prefix: "new-prefix").twice
end
end

Expand Down
2 changes: 1 addition & 1 deletion spec/support/s3_query_service_specs.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ def stub_s3(data: [], bucket_url: nil, prefix: "10.34770/123-abc/1", bucket_name
allow(@s3_client).to receive(:put_object)

fake_s3_query = instance_double(S3QueryService, data_profile: { objects: data, ok: true }, client: @s3_client,
client_s3_files: data, prefix:)
client_s3_files: data, prefix:, count_objects: data.count)
mock_methods(fake_s3_query, data, bucket_name)
allow(S3QueryService).to receive(:new).and_return(fake_s3_query)

Expand Down
4 changes: 4 additions & 0 deletions spec/system/provenance_note_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
require "rails_helper"

RSpec.describe "Adding a Provenance note", type: :system, js: true do
before do
stub_s3
end

context "A user that does not have permissions to edit the provenance" do
let(:any_user) { FactoryBot.create(:user) }
let(:work) { FactoryBot.create(:draft_work, created_by_user_id: any_user.id) }
Expand Down

0 comments on commit 662e6ae

Please sign in to comment.