Skip to content

Commit

Permalink
Merge pull request #70 from steppi/boto
Browse files (browse the repository at this point in the history)
Replace wget with boto3
  • Loading branch information
Albert Steppi authored May 21, 2022
2 parents d5b0948 + 5af8296 commit 92a3a14
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 56 deletions.
135 changes: 84 additions & 51 deletions adeft/download/download.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
import adeft.locations as loc
import boto3
import botocore
import os
import gzip
import json
import wget
import shutil
import logging
import requests


from adeft.locations import ADEFT_MODELS_PATH, S3_BUCKET_URL, \
RESOURCES_PATH, TEST_RESOURCES_PATH
from botocore import UNSIGNED
from botocore.config import Config


logger = logging.getLogger(__file__)
Expand All @@ -17,9 +17,9 @@
def setup_models_folder():
    """Recreate the models folder from scratch and download models.

    Any existing folder at ``loc.ADEFT_MODELS_PATH`` is removed together
    with its contents before an empty folder is created in its place and
    ``download_models`` is called to populate it.
    """
    # Paths are read through the ``loc`` module at call time rather than
    # through names imported from it.
    if os.path.isdir(loc.ADEFT_MODELS_PATH):
        shutil.rmtree(loc.ADEFT_MODELS_PATH)
    os.mkdir(loc.ADEFT_MODELS_PATH)
    download_models()
    return

Expand Down Expand Up @@ -51,41 +51,48 @@ def download_models(models=None):
models = set(models) & set(s3_models)
for model in models:
# create model directory if it does not currently exist
if not os.path.exists(os.path.join(ADEFT_MODELS_PATH, model)):
os.makedirs(os.path.join(ADEFT_MODELS_PATH, model))
for resource in (model + '_grounding_dict.json',
model + '_names.json',
model + '_model.gz'):
resource_path = os.path.join(ADEFT_MODELS_PATH, model, resource)
# if resource already exists, remove it since wget will not
# overwrite existing files, choosing a new name instead
if not os.path.exists(os.path.join(loc.ADEFT_MODELS_PATH, model)):
os.makedirs(os.path.join(loc.ADEFT_MODELS_PATH, model))
for resource in (
model + '_grounding_dict.json',
model + '_names.json',
model + '_model.gz'
):
resource_path = os.path.join(
loc.ADEFT_MODELS_PATH, model, resource
)
# if resource already exists, remove it. Ensures that
# models path stays in a consistent state.
_remove_if_exists(resource_path)
wget.download(url='/'.join((S3_BUCKET_URL, 'Models',
model, resource)),
out=resource_path)
download_adeft_object(
'Models', model, resource, outpath=resource_path
)


def setup_resources_folder():
    """Make resources folder and download resources.

    Replaces content in existing resources folder if it already exists:
    the folder at ``loc.RESOURCES_PATH`` is deleted, recreated empty, and
    then filled by ``download_resources``.
    """
    if os.path.isdir(loc.RESOURCES_PATH):
        shutil.rmtree(loc.RESOURCES_PATH)
    os.mkdir(loc.RESOURCES_PATH)
    download_resources()


def download_resources():
    """Download the gzipped resource files from S3 and decompress them.

    For each resource, ``<resource>.gz`` is fetched from the ``Resources``
    prefix into ``loc.RESOURCES_PATH``, decompressed to ``<resource>`` in
    the same folder, and the downloaded archive is then deleted.
    """
    resources = ['groundings.csv']
    for resource in resources:
        resource_path = os.path.join(loc.RESOURCES_PATH, f'{resource}.gz')
        # Clear any stale archive left over from an earlier run before
        # downloading a fresh copy.
        _remove_if_exists(resource_path)
        download_adeft_object(
            'Resources', f'{resource}.gz',
            outpath=resource_path
        )
        with gzip.open(resource_path, 'rb') as f_in:
            with open(
                os.path.join(loc.RESOURCES_PATH, resource), 'wb'
            ) as f_out:
                shutil.copyfileobj(f_in, f_out)
        # Only the decompressed file is kept.
        os.remove(resource_path)

Expand All @@ -96,12 +103,12 @@ def setup_test_resource_folder():
Replaces content in existing test_resource_folders if they already
exist.
"""
if os.path.isdir(TEST_RESOURCES_PATH):
shutil.rmtree(TEST_RESOURCES_PATH)
os.mkdir(TEST_RESOURCES_PATH)
os.mkdir(os.path.join(TEST_RESOURCES_PATH, 'test_model'))
os.mkdir(os.path.join(TEST_RESOURCES_PATH, 'scratch'))
os.mkdir(os.path.join(TEST_RESOURCES_PATH, 'test_model', 'IR'))
if os.path.isdir(loc.TEST_RESOURCES_PATH):
shutil.rmtree(loc.TEST_RESOURCES_PATH)
os.mkdir(loc.TEST_RESOURCES_PATH)
os.mkdir(os.path.join(loc.TEST_RESOURCES_PATH, 'test_model'))
os.mkdir(os.path.join(loc.TEST_RESOURCES_PATH, 'scratch'))
os.mkdir(os.path.join(loc.TEST_RESOURCES_PATH, 'test_model', 'IR'))
download_test_resources()
return

Expand All @@ -115,23 +122,27 @@ def download_test_resources():
not already exist they will be created when running
python -m adeft.download
"""
test_model_path = os.path.join(TEST_RESOURCES_PATH, 'test_model', 'IR')
test_model_path = os.path.join(loc.TEST_RESOURCES_PATH, 'test_model', 'IR')
if not os.path.exists(test_model_path):
os.mkdir(test_model_path)
for resource in ('IR_grounding_dict.json', 'IR_names.json', 'IR_model.gz'):
if not os.path.exists(os.path.join(test_model_path, resource)):
wget.download(url='/'.join((S3_BUCKET_URL, 'Test', 'IR',
resource)),
out=os.path.join(test_model_path, resource))
if not os.path.exists(os.path.join(TEST_RESOURCES_PATH,
download_adeft_object(
'Test', 'IR', resource,
outpath=os.path.join(test_model_path, resource),
)
if not os.path.exists(os.path.join(loc.TEST_RESOURCES_PATH,
'example_training_data.json')):
wget.download(url='/'.join((S3_BUCKET_URL, 'Test',
'example_training_data.json')),
out=os.path.join(TEST_RESOURCES_PATH,
'example_training_data.json'))
download_adeft_object(
'Test', 'example_training_data.json',
outpath=os.path.join(
loc.TEST_RESOURCES_PATH,
'example_training_data.json'
),
)


def get_available_models(path=ADEFT_MODELS_PATH):
def get_available_models(path=loc.ADEFT_MODELS_PATH):
"""Returns set of all models currently in models folder"""
if not os.path.exists(path):
return {}
Expand All @@ -157,15 +168,37 @@ def get_available_models(path=ADEFT_MODELS_PATH):

def get_s3_models():
    """Return the models currently available on S3.

    Reads ``Models/s3_models.json`` (under the versioned key prefix) from
    the Adeft bucket with an anonymous request and parses it as JSON.

    Returns
    -------
    dict
        Parsed content of ``s3_models.json``. If the listing cannot be
        fetched, is not valid JSON, or is not a JSON object, a warning is
        logged and an empty dict is returned, so callers always receive a
        collection rather than ``None``.
    """
    try:
        response = _read_s3_content(
            bucket=loc.S3_BUCKET,
            key=_get_s3_key('Models', 's3_models.json'),
        )
        output = json.loads(response["Body"].read())
    except (
        # Error responses from S3 (missing key, access denied, ...).
        botocore.exceptions.ClientError,
        # Client-side failures such as no network connection.
        botocore.exceptions.BotoCoreError,
        # Object was fetched but does not contain valid JSON.
        json.JSONDecodeError,
    ):
        logger.warning("Online Adeft models not available.")
        return {}
    if not isinstance(output, dict):
        logger.warning("Online Adeft models not available.")
        return {}
    return output


def download_adeft_object(*args, outpath):
    """Download one object from the Adeft S3 bucket to a local file.

    Parameters
    ----------
    *args : str
        Path components of the object below the key prefix, e.g.
        ``'Models', model, resource``. They are joined with ``/``.
    outpath : str
        Keyword-only. Local path the object is written to.

    Returns
    -------
    The return value of ``_anonymous_s3_download``.
    """
    object_path = '/'.join(args)
    logger.info(f"Downloading {object_path}")
    s3_key = _get_s3_key(*args)
    return _anonymous_s3_download(loc.S3_BUCKET, s3_key, outpath)


def _get_s3_key(*args):
    """Build an S3 key by joining ``loc.S3_KEY_PREFIX`` and args with '/'."""
    components = [loc.S3_KEY_PREFIX, *args]
    return '/'.join(components)


def _read_s3_content(bucket, key):
    """Fetch an S3 object with an unsigned (anonymous) request.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    key : str
        Key of the object within the bucket.

    Returns
    -------
    The response of the client's ``get_object`` call; callers in this
    module read the payload from ``response["Body"]``.
    """
    # UNSIGNED disables request signing, so no AWS credentials are needed.
    config = Config(signature_version=UNSIGNED)
    # Take the region from the shared locations module instead of
    # repeating the literal here.
    s3 = boto3.client('s3', config=config, region_name=loc.BUCKET_REGION)
    response = s3.get_object(Bucket=bucket, Key=key)
    return response


def _anonymous_s3_download(bucket, key, outpath):
    """Download an S3 object to a local file with an unsigned request.

    Parameters
    ----------
    bucket : str
        Name of the S3 bucket.
    key : str
        Key of the object within the bucket.
    outpath : str
        Local path the object is written to.
    """
    # UNSIGNED disables request signing, so no AWS credentials are needed.
    config = Config(signature_version=UNSIGNED)
    # Take the region from the shared locations module instead of
    # repeating the literal here.
    s3 = boto3.client('s3', config=config, region_name=loc.BUCKET_REGION)
    s3.download_file(bucket, key, outpath)


def _remove_if_exists(path):
Expand Down
4 changes: 3 additions & 1 deletion adeft/locations.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,6 @@
RESOURCES_PATH = os.path.join(ADEFT_PATH, 'resources')
GROUNDINGS_FILE_PATH = os.path.join(RESOURCES_PATH, 'groundings.csv')
TEST_RESOURCES_PATH = os.path.join(ADEFT_PATH, 'test_resources')
S3_BUCKET_URL = f'https://adeft.s3.amazonaws.com/{__version__}'
S3_BUCKET = "adeft"
BUCKET_REGION = "us-east-1"
S3_KEY_PREFIX = __version__
3 changes: 1 addition & 2 deletions doc/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ sphinx
sphinx_rtd_theme
scikit-learn>=0.20.0
nltk
wget
requests
boto3
flask
appdirs
5 changes: 3 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@
'Programming Language :: Python :: 3.9'
],
packages=find_packages(),
install_requires=['nltk', 'scikit-learn>=0.20.0', 'wget',
'requests', 'flask', 'appdirs'],
install_requires=[
'nltk', 'scikit-learn>=0.20.0', 'boto3', 'flask', 'appdirs'
],
extras_require={'test': ['pytest', 'pytest-cov']},
keywords=['nlp', 'biology', 'disambiguation'],
ext_modules=extensions,
Expand Down

0 comments on commit 92a3a14

Please sign in to comment.