datacommonsorg · swethammkumari · Sep 16, 2024 · Sep 16, 2024 · Sep 16, 2024 · Sep 18, 2024
diff --git a/scripts/world_bank/datasets/README.md b/scripts/world_bank/datasets/README.md
@@ -0,0 +1,66 @@
+# World Bank Datasets
+
+- source: https://data.worldbank.org
+
+- how to download data: Data is downloaded automatically by the Python script (`datasets.py`).
+
+- type of place: Country.
+
+- statvars: All types
+
+- years: 1960 to 2050
+
+- copyright year: 2024
+
+### How to run:
+Processes WB datasets.
+
+Update September 2024:
+To run all processing steps in sequence, do not pass the `--mode` flag.
+Run: python3 datasets.py
+
+Or, to investigate an issue in an individual step, run the steps one at a time as described below:
+
+Supports the following tasks:
+
+============================
+
+fetch_datasets: Fetches WB dataset lists and resources and writes them to 'output/wb-datasets.csv'
+
+Run: python3 datasets.py --mode=fetch_datasets
+
+============================
+
+download_datasets: Downloads datasets listed in 'output/wb-datasets.csv' to the 'output/downloads' folder.
+
+Run: python3 datasets.py --mode=download_datasets
+
+============================
+
+write_wb_codes: Extracts World Bank indicator codes (and related information) from files downloaded in the 'output/downloads' folder to 'output/wb-codes.csv'.
+
+It only operates on files that are named '*_CSV.zip'.
+
+Run: python3 datasets.py --mode=write_wb_codes
+
+============================
+
+load_stat_vars: Loads stat vars from a mapping file specified via the `stat_vars_file` flag.
+
+Use this for debugging to ensure that the mappings load correctly and fix any errors logged by this operation.
+
+Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/statvars.csv
+
+See `sample-svs.csv` for a sample mappings file.
+
+============================
+
+write_observations: Extracts observations from files downloaded in the 'output/downloads' folder and saves them to CSVs in the 'output/observations' folder.
+
+The stat vars file to be used for mappings should be specified using the `stat_vars_file` flag.
+
+It only operates on files that are named '*_CSV.zip'.
+
+Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/statvars.csv
+
+
diff --git a/scripts/world_bank/datasets/datasets.py b/scripts/world_bank/datasets/datasets.py
@@ -13,6 +13,12 @@
 # limitations under the License.
 """Processes WB datasets.
 
+Update September 2024:
+To run all processing steps in sequence, do not pass the --mode flag.
+Run: python3 datasets.py
+
+Or, to investigate an issue in an individual step, run the steps one at a time as described below:
+
 Supports the following tasks:
 
 ============================
@@ -41,7 +47,7 @@
 
 Use this for debugging to ensure that the mappings load correctly and fix any errors logged by this operation.
 
-Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/sv_mappings.csv
+Run: python3 datasets.py --mode=load_stat_vars --stat_vars_file=/path/to/statvars.csv
 
 See `sample-svs.csv` for a sample mappings file.
 
@@ -53,7 +59,7 @@
 
 It only operates on files that are named '*_CSV.zip'.
 
-Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/sv_mappings.csv
+Run: python3 datasets.py --mode=write_observations --stat_vars_file=/path/to/statvars.csv
 """
 
 import requests
@@ -66,6 +72,7 @@
 import re
 import urllib3
 from urllib3.util.ssl_ import create_urllib3_context
+from urllib3.exceptions import HTTPError
 from absl import flags
 import zipfile
 import codecs
@@ -84,7 +91,7 @@ class Mode:
 
 
 flags.DEFINE_string(
-    'mode', Mode.WRITE_OBSERVATIONS,
+    'mode', None,
     f"Specify one of the following modes: {Mode.FETCH_DATASETS}, {Mode.DOWNLOAD_DATASETS}, {Mode.WRITE_WB_CODES}, {Mode.LOAD_STAT_VARS}, {Mode.WRITE_OBSERVATIONS}"
 )
 
@@ -131,7 +138,7 @@ class Mode:
 
 def download_datasets():
     '''Downloads dataset files. This is a very expensive operation so run it with care. It assumes that the datasets CSV is already available.'''
-
+    logging.info('start download_datasets')
     with open(DATASETS_CSV_FILE_PATH, 'r') as f:
         csv_rows = list(csv.DictReader(f))
         download_urls = []
@@ -158,10 +165,13 @@ def download(url):
         # response = requests.get(url)
         # Using urllib3 for downloading content to avoid SSL issue.
         # See: https://github.com/urllib3/urllib3/issues/2653#issuecomment-1165418616
-        with urllib3.PoolManager(ssl_context=ctx) as http:
-            response = http.request("GET", url)
-        with open(file_path, 'wb') as f:
-            f.write(response.data)
+        with urllib3.PoolManager(ssl_context=ctx,timeout=90) as http:
+            try:
+                response = http.request("GET", url)
+                with open(file_path, 'wb') as f:
+                    f.write(response.data)
+            except HTTPError as e:
+                print(f"HTTP error encountered: {e}")
     except Exception as e:
         logging.error("Error downloading %s", url, exc_info=e)
 
@@ -277,11 +287,15 @@ def load_json(url, params, response_file):
             return json.load(f)
 
     logging.info("Fetching url %s, params %s", url, params)
-    response = requests.get(url, params=params).json()
-    with open(response_file, 'w') as f:
-        logging.info('Writing response to file %s', response_file)
-        json.dump(response, f, indent=2)
-    return response
+    try:
+        response = requests.get(url, params=params).json()
+        with open(response_file, 'w') as f:
+            logging.info('Writing response to file %s', response_file)
+            json.dump(response, f, indent=2)
+        return response
+    except Exception as e:
+        print(f"Http error {e}")
+        return None
 
 
 def load_json_file(json_file):
@@ -571,19 +585,27 @@ def get_data_and_series_file_names(zip):
 
 
 def main(_):
-    match FLAGS.mode:
-        case Mode.FETCH_DATASETS:
-            download_datasets()
-        case Mode.DOWNLOAD_DATASETS:
-            fetch_and_write_datasets_csv()
-        case Mode.WRITE_WB_CODES:
-            write_wb_codes()
-        case Mode.LOAD_STAT_VARS:
-            load_stat_vars(FLAGS.stat_vars_file)
-        case Mode.WRITE_OBSERVATIONS:
-            write_all_observations(FLAGS.stat_vars_file)
-        case _:
-            logging.error('No mode specified.')
+    logging.info(FLAGS.mode)
+    if not FLAGS.mode:
+        fetch_and_write_datasets_csv()
+        download_datasets()
+        write_wb_codes()
+        load_stat_vars(FLAGS.stat_vars_file)
+        write_all_observations(FLAGS.stat_vars_file)
+    else:
+        match FLAGS.mode:
+            case Mode.FETCH_DATASETS:
+                download_datasets()
+            case Mode.DOWNLOAD_DATASETS:
+                fetch_and_write_datasets_csv()
+            case Mode.WRITE_WB_CODES:
+                write_wb_codes()
+            case Mode.LOAD_STAT_VARS:
+                load_stat_vars(FLAGS.stat_vars_file)
+            case Mode.WRITE_OBSERVATIONS:
+                write_all_observations(FLAGS.stat_vars_file)
+            case _:
+                logging.error('No mode specified.')
 
 
 if __name__ == '__main__':

diff --git a/scripts/world_bank/datasets/manifest.json b/scripts/world_bank/datasets/manifest.json
@@ -0,0 +1,94 @@
+{
+  "import_specifications": [
+    {
+      "import_name": "WorldBankDatasets",
+      "curator_emails": ["swethakumari@google.com"],
+      "provenance_url": "https://data.worldbank.org",
+      "provenance_description": "World Bank databases are essential tools for supporting critical management decisions and providing key statistical information for Bank operational activities.",
+      "scripts": ["datasets.py"],
+      "import_inputs": [
+        {
+          "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/ASPIRE_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/EdStats_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/FINDEX_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/GFDD_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/GPFI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/HCI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/IDA_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Jobs_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/MDG_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/PovStats_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/SDG_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/SE4ALL_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Subnational-Population_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Subnational-Poverty_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/WGI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/BBSC_CSV_obs.csv"
+        },
+         {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/DB_CSV_obs.csv"
+        },
+         {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/Economic_Fitness_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/HEFPI_CSV_obs.csv"
+        },
+        {
+           "template_mcf": "wb.tmcf",
+          "cleaned_csv": "output/observations/WWBI_CSV_obs.csv"
+        }
+      ],
+      "cron_schedule": "5 3 15 * *"
+    }
+  ]
+}