Download the dataset to a temporary folder by default
Also, add some error handling for missing folders, urls, etc.
rolandgeider committed May 20, 2024
1 parent 248f28b commit 53230ac
Showing 1 changed file with 58 additions and 11 deletions.
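
The commit message above describes the new default behaviour: the dataset is downloaded into a temporary folder unless a folder is passed explicitly, and missing folders or unreachable URLs are reported instead of crashing. Below is a minimal, standalone sketch of that pattern, assuming only the requests library; the function name fetch_dataset and the placeholder handling of the URL are illustrative and are not part of the wger code.

    import os
    import tempfile

    import requests


    def fetch_dataset(url: str, folder: str = '') -> bool:
        """Download url into folder, or into a throw-away temporary folder by default."""
        tmp_dir = None
        if folder:
            # A user-supplied folder must already exist
            if not os.path.exists(folder):
                print(f'Folder {folder} does not exist!')
                return False
        else:
            tmp_dir = tempfile.TemporaryDirectory()
            folder = tmp_dir.name

        zip_file = os.path.join(folder, url.rsplit('/', 1)[-1])
        if os.path.exists(zip_file):
            print(f'File already downloaded {zip_file}, not downloading it again')
        else:
            response = requests.get(url, stream=True)
            if response.status_code == 404:
                print(f'Could not open {url}!')
                return False
            with open(zip_file, 'wb') as fid:
                for chunk in response.iter_content(chunk_size=50 * 1024):
                    fid.write(chunk)

        # ... extract and process the dataset here ...

        # Only clean up folders we created ourselves
        if tmp_dir:
            tmp_dir.cleanup()
        return True
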
wger/nutrition/management/commands/import-usda-products.py (69 changes: 58 additions & 11 deletions)
@@ -35,22 +35,54 @@

logger = logging.getLogger(__name__)

-# Check https://fdc.nal.usda.gov/download-datasets.html for current file names
-# current_file = 'FoodData_Central_foundation_food_json_2024-04-18.zip'
-current_file = 'FoodData_Central_branded_food_json_2024-04-18.zip'
-download_folder = '/Users/roland/Entwicklung/wger/server/extras/usda'


class Command(ImportProductCommand):
"""
Import an Open Food facts Dump
Import an USDA dataset
"""

+    def add_arguments(self, parser):
+        super().add_arguments(parser)
+
+        parser.add_argument(
+            '--use-folder',
+            action='store',
+            default='',
+            dest='tmp_folder',
+            type=str,
+            help='Controls whether to use a temporary folder created by python (the default) or '
+            'the path provided for storing the downloaded dataset. If there are already '
+            'downloaded or extracted files here, they will be used instead of fetching them '
+            'again.'
+        )
+
+        parser.add_argument(
+            '--dataset-name',
+            action='store',
+            default='FoodData_Central_branded_food_json_2024-04-18.zip',
+            dest='dataset_name',
+            type=str,
+            help='What dataset to download, this value will be appended to '
+            '"https://fdc.nal.usda.gov/fdc-datasets/". Consult '
+            'https://fdc.nal.usda.gov/download-datasets.html for current file names',
+        )

    def handle(self, **options):
        if options['mode'] == 'insert':
            self.mode = Mode.INSERT

-        usda_url = f'https://fdc.nal.usda.gov/fdc-datasets/{current_file}'
+        usda_url = f'https://fdc.nal.usda.gov/fdc-datasets/{options["dataset_name"]}'

+        if options['tmp_folder']:
+            download_folder = options['tmp_folder']
+
+            # Check whether the folder exists
+            if not os.path.exists(download_folder):
+                self.stdout.write(self.style.ERROR(f'Folder {download_folder} does not exist!'))
+                return
+        else:
+            tmp_folder = tempfile.TemporaryDirectory()
+            download_folder = tmp_folder.name

        self.stdout.write('Importing entries from USDA')
        self.stdout.write(f' - {self.mode}')
@@ -61,12 +93,17 @@ def handle(self, **options):
        english = Language.objects.get(short_name=ENGLISH_SHORT_NAME)

        # Download the dataset
-        zip_file = os.path.join(download_folder, current_file)
+        zip_file = os.path.join(download_folder, options['dataset_name'])
        if os.path.exists(zip_file):
            self.stdout.write(f'File already downloaded {zip_file}, not downloading it again')
        else:
-            self.stdout.write(f'Downloading {zip_file}... (this may take a while)')
+            self.stdout.write(f'Downloading {usda_url}... (this may take a while)')
            req = requests.get(usda_url, stream=True, headers=wger_headers())

+            if req.status_code == 404:
+                self.stdout.write(self.style.ERROR(f'Could not open {usda_url}!'))
+                return
+
            with open(zip_file, 'wb') as fid:
                for chunk in req.iter_content(chunk_size=50 * 1024):
                    fid.write(chunk)
@@ -80,8 +117,14 @@ def handle(self, **options):
                raise Exception('No files found in the ZIP archive')

            first_file = file_list[0]
-            self.stdout.write(f'Extracting {first_file}...')
-            extracted_file_path = zip_ref.extract(first_file, path=download_folder)
+
+            # If the file was already extracted to the download folder, skip
+            extracted_file_path = os.path.join(download_folder, first_file)
+            if os.path.exists(extracted_file_path):
+                self.stdout.write(f'File {first_file} already extracted, skipping...')
+            else:
+                self.stdout.write(f'Extracting {first_file}...')
+                extracted_file_path = zip_ref.extract(first_file, path=download_folder)

        # Since the file is almost JSONL, just process each line individually
        with open(extracted_file_path, 'r') as extracted_file:
@@ -108,5 +151,9 @@ def handle(self, **options):
                    # pass
                self.handle_data(ingredient_data)

+        if not options['tmp_folder']:
+            self.stdout.write(f'Removing temporary folder {download_folder}')
+            tmp_folder.cleanup()

        self.stdout.write(self.style.SUCCESS('Finished!'))
        self.stdout.write(self.style.SUCCESS(str(self.counter)))
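
With these options in place, a typical invocation of the importer would presumably look like

    python manage.py import-usda-products --use-folder /tmp/usda --dataset-name FoodData_Central_branded_food_json_2024-04-18.zip

where both flags are optional and the folder path /tmp/usda is only an example: without --dataset-name the 2024-04-18 branded-food dump is fetched, and without --use-folder everything is downloaded into a temporary directory that is removed again at the end of the run, while a reused folder keeps already downloaded or extracted files between runs.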
