Skip to content

Commit

Permalink
Allow naming the downloaded files according to formReference (#178)
Browse files Browse the repository at this point in the history
* Allow naming the downloaded files according to formReference; fixed a compatibility issue with the media type audio/wav, which is now treated as audio/x-wav

* added test

---------

Co-authored-by: Robert Forkel <xrotwang@googlemail.com>
  • Loading branch information
Bibiko and xrotwang committed Sep 9, 2024
1 parent f8cd20a commit afda5ea
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 3 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

The `pycldf` package adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).


## Unreleased

- Added option `--use-form-id` to the `downloadmedia` subcommand to name downloaded files after the value in `formReference` instead of the media item's ID.


## [1.38.1] - 2024-05-06

- Fixed bug whereby foreign keys for the ParameterNetwork component were not added correctly because
Expand Down
7 changes: 6 additions & 1 deletion src/pycldf/commands/downloadmedia.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@

def register(parser):
add_dataset(parser)
parser.add_argument(
'--use-form-id',
help='Use the value in formReference as name of downloaded file',
action='store_true',
default=False)
parser.add_argument(
'output',
help='Existing local directory to download the files to',
Expand All @@ -27,6 +32,6 @@ def run(args):
for s in args.filters:
col, _, substring = s.partition('=')
filters.append((col, substring))
for item in MediaTable(get_dataset(args)):
for item in MediaTable(get_dataset(args), args.use_form_id):
if all(substring in item[col] for col, substring in filters):
item.save(args.output)
11 changes: 9 additions & 2 deletions src/pycldf/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ class File:
"""
def __init__(self, media: 'MediaTable', row: dict):
self.row = row
self.id = row[media.id_col.name]
self.id = row[media.filename_col.name]
self._mimetype = row[media.mimetype_col.name]
self.url = None
self.scheme = None
Expand Down Expand Up @@ -184,7 +184,7 @@ class MediaTable(pycldf.ComponentWithValidation):
"""
Container class for a `Dataset`'s media items.
"""
def __init__(self, ds: pycldf.Dataset):
def __init__(self, ds: pycldf.Dataset, use_form_id: bool = False):
super().__init__(ds)
self.url_col = ds.get(('MediaTable', 'http://cldf.clld.org/v1.0/terms.rdf#downloadUrl'))
self.path_in_zip_col = ds.get(
Expand All @@ -196,6 +196,8 @@ def __init__(self, ds: pycldf.Dataset):
self.url_col = col
break
self.id_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#id']
self.filename_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#formReference']\
if use_form_id else self.id_col
self.mimetype_col = ds[self.component, 'http://cldf.clld.org/v1.0/terms.rdf#mediaType']

@functools.cached_property
Expand Down Expand Up @@ -271,6 +273,11 @@ def __init__(self, s):
mtype, _, param = self.string.partition(';')
param = param.strip()
self.type, _, self.subtype = mtype.partition('/')

# For compatibility reasons, normalize the subtype of audio/wav to x-wav.
if self.type == 'audio' and self.subtype.lower() in {'wav'}:
self.subtype = 'x-wav'

if param.startswith('charset='):
self.encoding = param.replace('charset=', '').strip()
else:
Expand Down
4 changes: 4 additions & 0 deletions tests/test_media.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@ def test_read_data_url(url, data):
urllib.parse.urlparse(url), Mimetype('text/plain;charset=US-ASCII')) == data


def test_fixed_mimetypes():
    """A `Mimetype` built from 'audio/wav' must report the '.wav' extension."""
    wav = Mimetype('audio/wav')
    assert wav.extension == '.wav'


def test_read_file_url(tmp_path):
p = tmp_path / 'test.txt'
p.write_text('äöü', encoding='utf16')
Expand Down

0 comments on commit afda5ea

Please sign in to comment.