Download the dataset to a temporary folder by default
Also, add some error handling for missing folders, urls, etc.
rolandgeider committed May 20, 2024
1 parent 248f28b commit 53230ac
Showing 1 changed file with 58 additions and 11 deletions.
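
The commit message above describes the new default behaviour: the dataset is downloaded into a temporary folder unless a folder is passed explicitly, and missing folders or unreachable URLs are reported instead of crashing. Below is a minimal, standalone sketch of that pattern, assuming only the requests library; the function name fetch_dataset and the placeholder handling of the URL are illustrative and are not part of the wger code.

    import os
    import tempfile

    import requests


    def fetch_dataset(url: str, folder: str = '') -> bool:
        """Download url into folder, or into a throw-away temporary folder by default."""
        tmp_dir = None
        if folder:
            # A user-supplied folder must already exist
            if not os.path.exists(folder):
                print(f'Folder {folder} does not exist!')
                return False
        else:
            tmp_dir = tempfile.TemporaryDirectory()
            folder = tmp_dir.name

        zip_file = os.path.join(folder, url.rsplit('/', 1)[-1])
        if os.path.exists(zip_file):
            print(f'File already downloaded {zip_file}, not downloading it again')
        else:
            response = requests.get(url, stream=True)
            if response.status_code == 404:
                print(f'Could not open {url}!')
                return False
            with open(zip_file, 'wb') as fid:
                for chunk in response.iter_content(chunk_size=50 * 1024):
                    fid.write(chunk)

        # ... extract and process the dataset here ...

        # Only clean up folders we created ourselves
        if tmp_dir:
            tmp_dir.cleanup()
        return True
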
wger/nutrition/management/commands/import-usda-products.py (69 changes: 58 additions & 11 deletions)
@@ -35,22 +35,54 @@

logger = logging.getLogger(__name__)

-# Check https://fdc.nal.usda.gov/download-datasets.html for current file names
-# current_file = 'FoodData_Central_foundation_food_json_2024-04-18.zip'
-current_file = 'FoodData_Central_branded_food_json_2024-04-18.zip'
-download_folder = '/Users/roland/Entwicklung/wger/server/extras/usda'


class Command(ImportProductCommand):
"""
Import an Open Food facts Dump
Import an USDA dataset
"""

+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+
+        parser.add_argument(
+            '--use-folder',
+            action='store',
+            default='',
+            dest='tmp_folder',
+            type=str,
+            help='Controls whether to use a temporary folder created by python (the default) or '
+            'the path provided for storing the downloaded dataset. If there are already '
+            'downloaded or extracted files here, they will be used instead of fetching them '
+            'again.'
+        )
+
+        parser.add_argument(
+            '--dataset-name',
+            action='store',
+            default='FoodData_Central_branded_food_json_2024-04-18.zip',
+            dest='dataset_name',
+            type=str,
+            help='What dataset to download, this value will be appended to '
+            '"https://fdc.nal.usda.gov/fdc-datasets/". Consult '
+            'https://fdc.nal.usda.gov/download-datasets.html for current file names',
+        )

    def handle(self, **options):
        if options['mode'] == 'insert':
            self.mode = Mode.INSERT

-        usda_url = f'https://fdc.nal.usda.gov/fdc-datasets/{current_file}'
+        usda_url = f'https://fdc.nal.usda.gov/fdc-datasets/{options["dataset_name"]}'

+        if options['tmp_folder']:
+            download_folder = options['tmp_folder']
+
+            # Check whether the folder exists
+            if not os.path.exists(download_folder):
+                self.stdout.write(self.style.ERROR(f'Folder {download_folder} does not exist!'))
+                return
+        else:
+            tmp_folder = tempfile.TemporaryDirectory()
+            download_folder = tmp_folder.name

        self.stdout.write('Importing entries from USDA')
        self.stdout.write(f' - {self.mode}')
@@ -61,12 +93,17 @@ def handle(self, **options):
        english = Language.objects.get(short_name=ENGLISH_SHORT_NAME)

        # Download the dataset
-        zip_file = os.path.join(download_folder, current_file)
+        zip_file = os.path.join(download_folder, options['dataset_name'])
        if os.path.exists(zip_file):
            self.stdout.write(f'File already downloaded {zip_file}, not downloading it again')
        else:
-            self.stdout.write(f'Downloading {zip_file}... (this may take a while)')
+            self.stdout.write(f'Downloading {usda_url}... (this may take a while)')
            req = requests.get(usda_url, stream=True, headers=wger_headers())

+            if req.status_code == 404:
+                self.stdout.write(self.style.ERROR(f'Could not open {usda_url}!'))
+                return
+
            with open(zip_file, 'wb') as fid:
                for chunk in req.iter_content(chunk_size=50 * 1024):
                    fid.write(chunk)
@@ -80,8 +117,14 @@ def handle(self, **options):
                raise Exception('No files found in the ZIP archive')

            first_file = file_list[0]
-            self.stdout.write(f'Extracting {first_file}...')
-            extracted_file_path = zip_ref.extract(first_file, path=download_folder)
+
+            # If the file was already extracted to the download folder, skip
+            extracted_file_path = os.path.join(download_folder, first_file)
+            if os.path.exists(extracted_file_path):
+                self.stdout.write(f'File {first_file} already extracted, skipping...')
+            else:
+                self.stdout.write(f'Extracting {first_file}...')
+                extracted_file_path = zip_ref.extract(first_file, path=download_folder)

        # Since the file is almost JSONL, just process each line individually
        with open(extracted_file_path, 'r') as extracted_file:
@@ -108,5 +151,9 @@ def handle(self, **options):
                    # pass
                self.handle_data(ingredient_data)

+        if not options['tmp_folder']:
+            self.stdout.write(f'Removing temporary folder {download_folder}')
+            tmp_folder.cleanup()

        self.stdout.write(self.style.SUCCESS('Finished!'))
        self.stdout.write(self.style.SUCCESS(str(self.counter)))
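
With these options in place, a typical invocation of the importer would presumably look like

    python manage.py import-usda-products --use-folder /tmp/usda --dataset-name FoodData_Central_branded_food_json_2024-04-18.zip

where both flags are optional and the folder path /tmp/usda is only an example: without --dataset-name the 2024-04-18 branded-food dump is fetched, and without --use-folder everything is downloaded into a temporary directory that is removed again at the end of the run, while a reused folder keeps already downloaded or extracted files between runs.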
