Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an option to record file content size #201

Merged
Merged 2 commits on Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion rocrate/model/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ def write(self, base_path):
out_file_path.parent.mkdir(parents=True, exist_ok=True)
mode = 'w' + ('b' if isinstance(self.source, BytesIO) else 't')
with open(out_file_path, mode) as out_file:
out_file.write(self.source.getvalue())
content = self.source.getvalue()
out_file.write(content)
if self.record_size:
self._jsonld['contentSize'] = str(len(content))
elif is_url(str(self.source)):
if self.fetch_remote or self.validate_url:
if self.validate_url:
Expand All @@ -62,10 +65,14 @@ def write(self, base_path):
out_file_path.parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(self.source, out_file_path)
self._jsonld['contentUrl'] = str(self.source)
if self.record_size:
self._jsonld['contentSize'] = str(out_file_path.stat().st_size)
elif self.source is None:
# Allows to record a File entity whose @id does not exist, see #73
warnings.warn(f"No source for {self.id}")
else:
out_file_path.parent.mkdir(parents=True, exist_ok=True)
if not out_file_path.exists() or not out_file_path.samefile(self.source):
shutil.copy(self.source, out_file_path)
if self.record_size:
self._jsonld['contentSize'] = str(out_file_path.stat().st_size)
3 changes: 2 additions & 1 deletion rocrate/model/file_or_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,12 @@
class FileOrDir(DataEntity):

def __init__(self, crate, source=None, dest_path=None, fetch_remote=False,
validate_url=False, properties=None):
validate_url=False, properties=None, record_size=False):
if properties is None:
properties = {}
self.fetch_remote = fetch_remote
self.validate_url = validate_url
self.record_size = record_size
self.source = source
if dest_path:
dest_path = Path(dest_path)
Expand Down
17 changes: 10 additions & 7 deletions rocrate/rocrate.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,15 +342,17 @@ def add_file(
dest_path=None,
fetch_remote=False,
validate_url=False,
properties=None
properties=None,
record_size=False
):
return self.add(File(
self,
source=source,
dest_path=dest_path,
fetch_remote=fetch_remote,
validate_url=validate_url,
properties=properties
properties=properties,
record_size=record_size
))

def add_dataset(
Expand Down Expand Up @@ -478,11 +480,12 @@ def write_zip(self, out_path):

def add_workflow(
self, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None,
main=False, lang="cwl", lang_version=None, gen_cwl=False, cls=ComputationalWorkflow
main=False, lang="cwl", lang_version=None, gen_cwl=False, cls=ComputationalWorkflow,
record_size=False
):
workflow = self.add(cls(
self, source=source, dest_path=dest_path, fetch_remote=fetch_remote,
validate_url=validate_url, properties=properties
validate_url=validate_url, properties=properties, record_size=record_size
))
if isinstance(lang, ComputerLanguage):
assert lang.crate is self
Expand All @@ -503,7 +506,7 @@ def add_workflow(
cwl_dest_path = Path(source).with_suffix(".cwl").name
cwl_workflow = self.add_workflow(
source=cwl_source, dest_path=cwl_dest_path, fetch_remote=fetch_remote, properties=properties,
main=False, lang="cwl", gen_cwl=False, cls=WorkflowDescription
main=False, lang="cwl", gen_cwl=False, cls=WorkflowDescription, record_size=record_size
)
workflow.subjectOf = cwl_workflow
return workflow
Expand Down Expand Up @@ -542,12 +545,12 @@ def add_test_instance(self, suite, url, resource="", service="jenkins", identifi

def add_test_definition(
self, suite, source=None, dest_path=None, fetch_remote=False, validate_url=False, properties=None,
engine="planemo", engine_version=None
engine="planemo", engine_version=None, record_size=False
):
suite = self.__validate_suite(suite)
definition = self.add(
TestDefinition(self, source=source, dest_path=dest_path, fetch_remote=fetch_remote,
validate_url=validate_url, properties=properties)
validate_url=validate_url, properties=properties, record_size=record_size)
)
if isinstance(engine, SoftwareApplication):
assert engine.crate is self
Expand Down
18 changes: 14 additions & 4 deletions test/test_write.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def test_file_writing(test_data_dir, tmpdir, helpers, gen_preview, to_zip):
file_subdir_id = 'sample_file_subdir.txt'

sample_file = test_data_dir / sample_file_id
file_returned = crate.add_file(sample_file)
file_returned = crate.add_file(sample_file, record_size=True)
assert file_returned.id == sample_file_id
file_returned_subdir = crate.add_file(sample_file, sample_file2_id)
assert file_returned_subdir.id == sample_file2_id
Expand Down Expand Up @@ -99,15 +99,17 @@ def test_file_writing(test_data_dir, tmpdir, helpers, gen_preview, to_zip):
assert json_entities[formatted_creator_id]["name"] == creator_name
if gen_preview:
assert helpers.PREVIEW_FILE_NAME in json_entities
file_entity = json_entities[sample_file_id]
assert file_entity["contentSize"] == str(file1.stat().st_size)


@pytest.mark.parametrize("stream_cls", [io.BytesIO, io.StringIO])
def test_in_mem_stream(stream_cls, tmpdir, helpers):
crate = ROCrate()

test_file_id = 'a/b/test_file.txt'
file_content = b'\x00\x01' if stream_cls is io.BytesIO else 'foo\n'
file_returned = crate.add_file(stream_cls(file_content), test_file_id)
file_content = b'\x00\x01\x02' if stream_cls is io.BytesIO else 'foo'
file_returned = crate.add_file(stream_cls(file_content), test_file_id, record_size=True)
assert file_returned.id == test_file_id

out_path = tmpdir / 'ro_crate_out'
Expand All @@ -121,6 +123,9 @@ def test_in_mem_stream(stream_cls, tmpdir, helpers):
mode = 'r' + ('b' if stream_cls is io.BytesIO else 't')
with open(file1, mode) as f:
assert f.read() == file_content
json_entities = helpers.read_json_entities(out_path)
file_entity = json_entities[test_file_id]
assert file_entity['contentSize'] == '3'


@pytest.mark.parametrize(
Expand All @@ -132,7 +137,11 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip):
url = ('https://raw.githubusercontent.com/ResearchObject/ro-crate-py/'
'master/test/test-data/sample_file.txt')
relpath = "a/b/sample_file.txt"
kw = {"fetch_remote": fetch_remote, "validate_url": validate_url}
kw = {
"fetch_remote": fetch_remote,
"validate_url": validate_url,
"record_size": True,
}
if fetch_remote:
file_ = crate.add_file(url, relpath, **kw)
assert file_.id == relpath
Expand All @@ -154,6 +163,7 @@ def test_remote_uri(tmpdir, helpers, fetch_remote, validate_url, to_zip):
out_file = out_crate.dereference(file_.id)
assert (out_path / relpath).is_file()
assert out_file["contentUrl"] == url
assert out_file["contentSize"] == str((out_path / file_.id).stat().st_size)
else:
out_file = out_crate.dereference(url)
assert not (out_path / relpath).exists()
Expand Down
Loading