datacommonsorg · swethammkumari · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024 · Sep 18, 2024
diff --git a/scripts/world_bank/datasets/datasets.py b/scripts/world_bank/datasets/datasets.py
@@ -13,6 +13,12 @@
 # limitations under the License.
 """Processes WB datasets.
 
+Update (September 2024):
+To run all processing steps in sequence, do not pass the --mode flag.
+Run: python3 datasets.py
+
+Or, if you need to investigate an issue in an individual step, follow all the steps below:
+
 Supports the following tasks:
 
 ============================
@@ -66,6 +72,7 @@
 import re
 import urllib3
 from urllib3.util.ssl_ import create_urllib3_context
+from urllib3.exceptions import HTTPError
 from absl import flags
 import zipfile
 import codecs
@@ -84,7 +91,7 @@ class Mode:
 
 
 flags.DEFINE_string(
-    'mode', Mode.WRITE_OBSERVATIONS,
+    'mode', None,
     f"Specify one of the following modes: {Mode.FETCH_DATASETS}, {Mode.DOWNLOAD_DATASETS}, {Mode.WRITE_WB_CODES}, {Mode.LOAD_STAT_VARS}, {Mode.WRITE_OBSERVATIONS}"
 )
 
@@ -131,7 +138,7 @@ class Mode:
 
 def download_datasets():
     '''Downloads dataset files. This is a very expensive operation so run it with care. It assumes that the datasets CSV is already available.'''
-
+    logging.info('start download_datasets')
     with open(DATASETS_CSV_FILE_PATH, 'r') as f:
         csv_rows = list(csv.DictReader(f))
         download_urls = []
@@ -159,9 +166,12 @@ def download(url):
         # Using urllib3 for downloading content to avoid SSL issue.
         # See: https://github.com/urllib3/urllib3/issues/2653#issuecomment-1165418616
         with urllib3.PoolManager(ssl_context=ctx) as http:
-            response = http.request("GET", url)
-        with open(file_path, 'wb') as f:
-            f.write(response.data)
+            try:
+                response = http.request("GET", url)
+                with open(file_path, 'wb') as f:
+                    f.write(response.data)
+            except HTTPError as e:
+                print(f"HTTP error encountered: {e}")
     except Exception as e:
         logging.error("Error downloading %s", url, exc_info=e)
 
@@ -277,11 +287,15 @@ def load_json(url, params, response_file):
             return json.load(f)
 
     logging.info("Fetching url %s, params %s", url, params)
-    response = requests.get(url, params=params).json()
-    with open(response_file, 'w') as f:
-        logging.info('Writing response to file %s', response_file)
-        json.dump(response, f, indent=2)
-    return response
+    try:
+        response = requests.get(url, params=params).json()
+        with open(response_file, 'w') as f:
+            logging.info('Writing response to file %s', response_file)
+            json.dump(response, f, indent=2)
+        return response
+    except Exception as e:
+        print(f"Http error {e}")
+        return None
 
 
 def load_json_file(json_file):
@@ -571,19 +585,27 @@ def get_data_and_series_file_names(zip):
 
 
 def main(_):
-    match FLAGS.mode:
-        case Mode.FETCH_DATASETS:
-            download_datasets()
-        case Mode.DOWNLOAD_DATASETS:
-            fetch_and_write_datasets_csv()
-        case Mode.WRITE_WB_CODES:
-            write_wb_codes()
-        case Mode.LOAD_STAT_VARS:
-            load_stat_vars(FLAGS.stat_vars_file)
-        case Mode.WRITE_OBSERVATIONS:
-            write_all_observations(FLAGS.stat_vars_file)
-        case _:
-            logging.error('No mode specified.')
+    logging.info(FLAGS.mode)
+    if not FLAGS.mode:
+        fetch_and_write_datasets_csv()
+        download_datasets()
+        write_wb_codes()
+        load_stat_vars(FLAGS.stat_vars_file)
+        write_all_observations(FLAGS.stat_vars_file)
+    else:
+        match FLAGS.mode:
+            case Mode.FETCH_DATASETS:
+                download_datasets()
+            case Mode.DOWNLOAD_DATASETS:
+                fetch_and_write_datasets_csv()
+            case Mode.WRITE_WB_CODES:
+                write_wb_codes()
+            case Mode.LOAD_STAT_VARS:
+                load_stat_vars(FLAGS.stat_vars_file)
+            case Mode.WRITE_OBSERVATIONS:
+                write_all_observations(FLAGS.stat_vars_file)
+            case _:
+                logging.error('No mode specified.')
 
 
 if __name__ == '__main__':

diff --git a/scripts/world_bank/datasets/manifest.json b/scripts/world_bank/datasets/manifest.json
@@ -0,0 +1,94 @@
+{
+  "import_specifications": [
+    {
+      "import_name": "WorldBankDatasets",
+      "curator_emails": ["swethakumari@google.com"],
+      "provenance_url": "https://data.worldbank.org",
+      "provenance_description": "World Bank databases are essential tools for supporting critical management decisions and providing key statistical information for Bank operational activities.",
+      "scripts": ["datasets.py"],
+      "import_inputs": [
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/ASPIRE_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/EdStats_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/FINDEX_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/GFDD_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/GPFI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/HCI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/IDA_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Jobs_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/MDG_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/PovStats_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/SDG_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/SE4ALL_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Subnational-Population_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Subnational-Poverty_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/WGI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/BBSC_CSV_obs.csv"
+        },
+         {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/DB_CSV_obs.csv"
+        },
+         {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Economic_Fitness_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/HEFPI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/WWBI_CSV_obs.csv"
+        }
+      ],
+      "cron_schedule": "5 3 15 * *"
+    }
+  ]
+}