From 32f8e0deae376eefeafd3d856692ee2272d89b26 Mon Sep 17 00:00:00 2001 From: Ajai Tirumali Date: Tue, 5 Nov 2024 18:23:34 +0530 Subject: [PATCH] Copy over wdi legacy import scripts to github --- scripts/world_bank/wdi_legacy/README.md | 20 ++ scripts/world_bank/wdi_legacy/constants.py | 299 +++++++++++++++++++ scripts/world_bank/wdi_legacy/wdi_csv2mcf.py | 126 ++++++++ 3 files changed, 445 insertions(+) create mode 100644 scripts/world_bank/wdi_legacy/README.md create mode 100644 scripts/world_bank/wdi_legacy/constants.py create mode 100644 scripts/world_bank/wdi_legacy/wdi_csv2mcf.py diff --git a/scripts/world_bank/wdi_legacy/README.md b/scripts/world_bank/wdi_legacy/README.md new file mode 100644 index 0000000000..8f858433e0 --- /dev/null +++ b/scripts/world_bank/wdi_legacy/README.md @@ -0,0 +1,20 @@ +# Importing World Bank World Development Indicators (WDI) Data + +The primary World Bank collection of development indicators, compiled from +officially-recognized international sources. It presents the most current and +accurate global development data available, and includes national estimates. +At the moment, we only include a small subset of variables from this. +See wdi_csv2mcf for detailed data we include. 
+ +The data is from: + +https://datacatalog.worldbank.org/dataset/world-development-indicators + +**To generate MCFs from these files, provide the input/output paths and run:** +Download and unzip the CSV file from +https://datacatalog.worldbank.org/search/dataset/0037712 into +input_files/WDICSV.csv + +Then run the command: + +`python wdi_csv2mcf.py` diff --git a/scripts/world_bank/wdi_legacy/constants.py b/scripts/world_bank/wdi_legacy/constants.py new file mode 100644 index 0000000000..3cbd311900 --- /dev/null +++ b/scripts/world_bank/wdi_legacy/constants.py @@ -0,0 +1,299 @@ +"""Constants used across wdi.""" + +from string import Template + +POPS_FILE = 'pops_file' + +START_YEAR_COLUMN = 4 +YEAR_RANGE_START = 1960 +YEAR_RANGE_END = 2023 + +INDICATOR_NAME = 'Indicator Name' +COUNTRY_CODE = 'Country Code' + +CO2_EMISSIONS = 'CO2 emissions (metric tons per capita)' + +POP_CO2_EMISSIONS_MCF_TEMPL = Template(""" +Node: Pop_CO2_Emissions_$location_abbr +typeOf: schema:StatisticalPopulation +populationType: dcs:Emissions +emittedThing: dcs:CarbonDioxide +location: dcid:$location +""") + +OBS_CO2_EMISSIONS_MCF_TEMPL = Template(""" +Node: Obs_CO2_Emissions_Amount_${location_abbr}_$observation_date +typeOf: schema:Observation +observedNode: l:Pop_CO2_Emissions_$location_abbr +observationDate: "$observation_date" +observationPeriod: "P1Y" +measuredProperty: dcs:amount +measuredValue: $measured_value +measurementDenominator: dcs:PerCapita +unit: dcs:MetricTon +""") + +ELEC_CONSUMPTION = 'Electric power consumption (kWh per capita)' + +POP_ELEC_CONSUMPTION_MCF_TEMPL = Template(""" +Node: Pop_Consumption_Electricity_$location_abbr +typeOf: schema:StatisticalPopulation +populationType: dcs:Consumption +consumedThing: dcs:Electricity +location: dcid:$location +""") + +OBS_ELEC_CONSUMPTION_MCF_TEMPL = Template(""" +Node: Obs_Consumption_Electricity_Amount_${location_abbr}_$observation_date +typeOf: schema:Observation +observedNode: l:Pop_Consumption_Electricity_$location_abbr 
+observationDate: "$observation_date" +observationPeriod: "P1Y" +measuredProperty: dcs:amount +measuredValue: $measured_value +measurementDenominator: dcs:PerCapita +unit: dcs:KilowattHour +""") + +ENERGY_USE = 'Energy use (kg of oil equivalent per capita)' + +POP_ENERGY_USE_MCF_TEMPL = Template(""" +Node: Pop_Energy_Use_$location_abbr +typeOf: schema:StatisticalPopulation +populationType: dcs:Consumption +consumedThing: dcs:Energy +location: dcid:$location +""") + +OBS_ENERGY_USE_MCF_TEMPL = Template(""" +Node: Obs_Energy_Use_Amount_${location_abbr}_$observation_date +typeOf: schema:Observation +observedNode: l:Pop_Energy_Use_$location_abbr +observationDate: "$observation_date" +observationPeriod: "P1Y" +measuredProperty: dcs:amount +measuredValue: $measured_value +measurementDenominator: dcs:PerCapita +unit: dcs:KilogramOfOilEquivalent +""") + +GDP_NOMINAL = 'GDP (current US$)' + +POP_GDP_NOMINAL_MCF_TEMPL = Template(""" +Node: Pop_GDP_Nominal_$location_abbr +typeOf: schema:StatisticalPopulation +populationType: dcs:EconomicActivity +activitySource: dcs:GrossDomesticProduction +location: dcid:$location +""") + +OBS_GDP_NOMINAL_MCF_TEMPL = Template(""" +Node: Obs_GDP_Nominal_Amount_${location_abbr}_$observation_date +typeOf: schema:Observation +observedNode: l:Pop_GDP_Nominal_$location_abbr +observationDate: "$observation_date" +observationPeriod: "P1Y" +measuredProperty: dcs:amount +measuredValue: $measured_value +measurementQualifier: dcs:Nominal +unit: dcs:USDollar +""") + +GDP_GROWTH_RATE = 'GDP growth (annual %)' + +POP_GDP_GROWTH_RATE_MCF_TEMPL = Template(""" +Node: Pop_GDP_Growth_Rate_$location_abbr +typeOf: schema:StatisticalPopulation +populationType: dcs:EconomicActivity +activitySource: dcs:GrossDomesticProduction +location: dcid:$location +""") + +OBS_GDP_GROWTH_RATE_TEMPL = Template(""" +Node: Obs_GDP_Growth_Rate_Amount_${location_abbr}_$observation_date +typeOf: schema:Observation +observedNode: l:Pop_GDP_Growth_Rate_$location_abbr +observationDate: 
"$observation_date" +observationPeriod: "P1Y" +measuredProperty: dcs:amount +growthRate: $measured_value +""") + +GDP_NOM_PER_CAPITA = 'GDP per capita (current US$)' + +# Use POP_GDP_NOMINAL_MCF_TEMPL as POP template so that only one pop node +# will be generated. + +OBS_GDP_NOM_PER_CAPITA_TEMPL = Template(""" +Node: Obs_GDP_Nom_Per_Capita_Amount_${location_abbr}_$observation_date +typeOf: schema:Observation +observedNode: l:Pop_GDP_Nominal_$location_abbr +observationDate: "$observation_date" +observationPeriod: "P1Y" +measuredProperty: dcs:amount +measuredValue: $measured_value +measurementQualifier: dcs:Nominal +measurementDenominator: dcs:PerCapita +unit: dcs:USDollar +""") + +GNI_IN_PPP = 'GNI, PPP (current international $)' + +POP_GNI_IN_PPP_MCF_TEMPL = Template(""" +Node: Pop_GNI_In_PPP_$location_abbr +typeOf: schema:StatisticalPopulation +populationType: dcs:EconomicActivity +activitySource: dcs:GrossNationalIncome +location: dcid:$location +""") + +OBS_GNI_IN_PPP_MCF_TEMPL = Template(""" +Node: Obs_GNI_In_PPP_Amount_${location_abbr}_$observation_date +typeOf: schema:Observation +observedNode: l:Pop_GNI_In_PPP_$location_abbr +observationDate: "$observation_date" +observationPeriod: "P1Y" +measuredProperty: dcs:amount +measuredValue: $measured_value +measurementQualifier: dcs:PurchasingPowerParity +unit: dcs:InternationalDollar +""") + +GNI_PPP_PER_CAPITA = 'GNI per capita, PPP (current international $)' + +# Use POP_GNI_IN_PPP_MCF_TEMPL as POP template so that only one pop node will +# be generated. 

# Observation node for the GNI-per-capita (PPP) indicator. Its observedNode
# refers to the Pop_GNI_In_PPP_* node, so no separate population template is
# defined for this indicator.
# Placeholders: $location_abbr, $observation_date, $measured_value.
OBS_GNI_PPP_PER_CAPITA_MCF_TEMPL = Template("""
Node: Obs_GNI_PPP_Per_Capita_Amount_${location_abbr}_$observation_date
typeOf: schema:Observation
observedNode: l:Pop_GNI_In_PPP_$location_abbr
observationDate: "$observation_date"
observationPeriod: "P1Y"
measuredProperty: dcs:amount
measuredValue: $measured_value
measurementQualifier: dcs:PurchasingPowerParity
measurementDenominator: dcs:PerCapita
unit: dcs:InternationalDollar
""")

# Value of the 'Indicator Name' CSV column that selects the internet-user
# templates below.
INTER_USER_PERC = 'Individuals using the Internet (% of population)'

# Population node for internet users.
# Placeholders: $location_abbr, $location.
POP_INTER_USER_PERC_MCF_TEMPL = Template("""
Node: Pop_Inter_User_Perc_$location_abbr
typeOf: schema:StatisticalPopulation
populationType: schema:Person
isInternetUser: schema:True
location: dcid:$location
""")

# Observation node for internet users; the value is emitted together with
# `scalingFactor: 100` and a per-capita denominator.
# Placeholders: $location_abbr, $observation_date, $measured_value.
OBS_INTER_USER_PERC_MCF_TEMPL = Template("""
Node: Obs_Inter_User_Perc_Count_${location_abbr}_$observation_date
typeOf: schema:Observation
observedNode: l:Pop_Inter_User_Perc_$location_abbr
observationDate: "$observation_date"
observationPeriod: "P1Y"
measuredProperty: dcs:count
measuredValue: $measured_value
measurementDenominator: dcs:PerCapita
scalingFactor: 100
""")

# Value of the 'Indicator Name' CSV column that selects the life-expectancy
# templates below.
LIFE_EXPECTANCY = 'Life expectancy at birth, total (years)'

# Population node for life expectancy.
# Placeholders: $location_abbr, $location.
POP_LIFE_EXPECTANCY_MCF_TEMPL = Template("""
Node: Pop_Life_Expectancy_$location_abbr
typeOf: schema:StatisticalPopulation
populationType: schema:Person
location: dcid:$location
""")

# Observation node for life expectancy. Unlike most observation templates in
# this module it carries no observationPeriod line.
# Placeholders: $location_abbr, $observation_date, $measured_value.
OBS_LIFE_EXPECTANCY_MCF_TEMPL = Template("""
Node: Obs_Life_Expectancy_${location_abbr}_$observation_date
typeOf: schema:Observation
observedNode: l:Pop_Life_Expectancy_$location_abbr
observationDate: "$observation_date"
measuredProperty: dcs:lifeExpectancy
measuredValue: $measured_value
unit: dcs:Year
""")

# Value of the 'Indicator Name' CSV column that selects the fertility-rate
# templates.
FERTILITY_RATE = 'Fertility rate, total (births per woman)'

# Population node for fertility rate (constrained with gender: schema:Female).
# Placeholders: $location_abbr, $location.
POP_FERTILITY_RATE_MCF_TEMPL = Template("""
Node: Pop_Fertility_Rate_$location_abbr
typeOf: schema:StatisticalPopulation
populationType: schema:Person
gender: schema:Female
location: dcid:$location
""")
# ---- scripts/world_bank/wdi_legacy/constants.py (continued) ----

# Observation node for fertility rate.
# Placeholders: $location_abbr, $observation_date, $measured_value.
OBS_FERTILITY_RATE_MCF_TEMPL = Template("""
Node: Obs_Fertility_Rate_${location_abbr}_$observation_date
typeOf: schema:Observation
observedNode: l:Pop_Fertility_Rate_$location_abbr
observationDate: "$observation_date"
measuredProperty: dcs:fertilityRate
measuredValue: $measured_value
""")

POPULATION = 'Population, total'

POP_POPULATION_MCF_TEMPL = Template("""
Node: Pop_Population_$location_abbr
typeOf: schema:StatisticalPopulation
populationType: schema:Person
location: dcid:$location
""")

OBS_POPULATION_MCF_TEMPL = Template("""
Node: Obs_Population_Count_${location_abbr}_$observation_date
typeOf: schema:Observation
observedNode: l:Pop_Population_$location_abbr
observationDate: "$observation_date"
observationPeriod: "P1Y"
measuredProperty: dcs:count
measuredValue: $measured_value
""")

POPU_GROWTH_RATE = 'Population growth (annual %)'

POP_POPU_GROWTH_RATE_MCF_TEMPL = Template("""
Node: Pop_Popu_Growth_Rate_$location_abbr
typeOf: schema:StatisticalPopulation
populationType: schema:Person
location: dcid:$location
""")

OBS_POPU_GROWTH_RATE_MCF_TEMPL = Template("""
Node: Obs_Popu_Growth_Rate_Count_${location_abbr}_$observation_date
typeOf: schema:Observation
observedNode: l:Pop_Popu_Growth_Rate_$location_abbr
observationDate: "$observation_date"
observationPeriod: "P1Y"
measuredProperty: dcs:count
growthRate: $measured_value
""")

# Maps the text of the 'Indicator Name' CSV column to a
# (population template, observation template) pair. Rows whose indicator is
# not a key of this map are ignored by wdi_csv2mcf.process_line.
INDICATOR_TEMP_MAP = {
    CO2_EMISSIONS: (POP_CO2_EMISSIONS_MCF_TEMPL, OBS_CO2_EMISSIONS_MCF_TEMPL),
    ELEC_CONSUMPTION:
        (POP_ELEC_CONSUMPTION_MCF_TEMPL, OBS_ELEC_CONSUMPTION_MCF_TEMPL),
    ENERGY_USE: (POP_ENERGY_USE_MCF_TEMPL, OBS_ENERGY_USE_MCF_TEMPL),
    GDP_NOMINAL: (POP_GDP_NOMINAL_MCF_TEMPL, OBS_GDP_NOMINAL_MCF_TEMPL),
    GDP_GROWTH_RATE: (POP_GDP_GROWTH_RATE_MCF_TEMPL, OBS_GDP_GROWTH_RATE_TEMPL),
    GDP_NOM_PER_CAPITA:
        (POP_GDP_NOMINAL_MCF_TEMPL, OBS_GDP_NOM_PER_CAPITA_TEMPL),
    GNI_IN_PPP: (POP_GNI_IN_PPP_MCF_TEMPL, OBS_GNI_IN_PPP_MCF_TEMPL),
    GNI_PPP_PER_CAPITA:
        (POP_GNI_IN_PPP_MCF_TEMPL, OBS_GNI_PPP_PER_CAPITA_MCF_TEMPL),
    INTER_USER_PERC:
        (POP_INTER_USER_PERC_MCF_TEMPL, OBS_INTER_USER_PERC_MCF_TEMPL),
    LIFE_EXPECTANCY:
        (POP_LIFE_EXPECTANCY_MCF_TEMPL, OBS_LIFE_EXPECTANCY_MCF_TEMPL),
    FERTILITY_RATE:
        (POP_FERTILITY_RATE_MCF_TEMPL, OBS_FERTILITY_RATE_MCF_TEMPL),
    POPULATION: (POP_POPULATION_MCF_TEMPL, OBS_POPULATION_MCF_TEMPL),
    POPU_GROWTH_RATE:
        (POP_POPU_GROWTH_RATE_MCF_TEMPL, OBS_POPU_GROWTH_RATE_MCF_TEMPL),
}

# ---- scripts/world_bank/wdi_legacy/wdi_csv2mcf.py ----
"""Script to convert WDI CSV to MCF."""

import contextlib
import csv
import os
import sys

from absl import app
from absl import flags
from absl import logging

import constants as cons

# Allows the local imports from repo root dir
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
_DATA_REPO_DIR = _SCRIPT_DIR.split('/scripts/', 1)[0]
sys.path.append(os.path.join(_DATA_REPO_DIR, 'util'))

import download_util
import file_util
from counters import Counters

flags.DEFINE_string(
    'input_file', 'input_files/WDICSV.csv',
    'Path of input file downloaded from https://datacatalog.worldbank.org/search/dataset/0037712.'
)
flags.DEFINE_string('output_dir', 'output', 'Directory to write MCF output.')
flags.DEFINE_string('pops_file_name', 'wdi_pops.mcf',
                    'CNS file name to write population statistics nodes.')

_FLAGS = flags.FLAGS

# Open output handles, keyed by cons.POPS_FILE for the population file and by
# the year string ('1960', ...) for each per-year observation file.
# Populated and emptied again by process_file().
_out_file = {}


def process_line(l, start_year: int, end_year: int, country_codes: dict,
                 counters: Counters):
    """Write the population node and yearly observation nodes for one CSV row.

    Rows whose indicator is not in cons.INDICATOR_TEMP_MAP are ignored. Rows
    whose country code is not exactly three characters long, or is not a key
    of `country_codes`, are dropped and counted.

    Args:
        l: One row of the WDI CSV as a dict keyed by column header.
        start_year: First year column to emit observations for.
        end_year: Last year column (inclusive) to emit observations for.
        country_codes: Container of accepted three-letter country codes; only
            membership (`in`) is used.
        counters: Counters object updated through `add_counter`.

    Raises:
        app.UsageError: If a node cannot be rendered or written.
    """
    indicator = l[cons.INDICATOR_NAME]
    if indicator not in cons.INDICATOR_TEMP_MAP:
        return

    # Skip country code not following ISO 3166 standard.
    country_code = l[cons.COUNTRY_CODE]
    if len(country_code) != 3 or country_code not in country_codes:
        counters.add_counter('input-rows-dropped-invalid-country', 1)
        return

    # The 'WLD' aggregate is mapped to the place 'Earth'; every other code is
    # mapped to 'country/<code>'.
    place_dcid = 'Earth' if country_code == 'WLD' else 'country/' + country_code
    process_pop(l, indicator, place_dcid, counters)
    for year in range(start_year, end_year + 1):
        process_obs(l, indicator, str(year), place_dcid, counters)


def process_pop(l, indicator, place_dcid, counters):
    """Store the pop node into population statistics MCF file.

    Args:
        l: CSV row dict; only the country-code column is read.
        indicator: Key into cons.INDICATOR_TEMP_MAP.
        place_dcid: Value substituted for `$location` in the template.
        counters: Counters object updated through `add_counter`.

    Raises:
        app.UsageError: If rendering or writing the node fails; the original
            exception is chained as the cause.
    """
    try:
        node = cons.INDICATOR_TEMP_MAP[indicator][0].substitute(
            location=place_dcid, location_abbr=l[cons.COUNTRY_CODE])
        _out_file[cons.POPS_FILE].write(node + '\n')
    except Exception as e:
        counters.add_counter('error-pop', 1)
        # app.UsageError(message, exitcode) does not interpolate printf-style
        # arguments, so the message has to be fully formatted here.
        raise app.UsageError(f'Unable to write Pop for line={l}') from e
    else:
        counters.add_counter('output-pop', 1)


def process_obs(l, indicator, year_str, place_dcid, counters):
    """Store the obs node into the MCF file of the year if it has measure.

    Nothing is written when the row has no (or an empty) value for
    `year_str`.

    Args:
        l: CSV row dict keyed by column header.
        indicator: Key into cons.INDICATOR_TEMP_MAP.
        year_str: Year column name; also the key of the output file in
            `_out_file` and the observation date.
        place_dcid: Passed to the template as `location`.
        counters: Counters object updated through `add_counter`.

    Raises:
        app.UsageError: If rendering or writing the node fails; the original
            exception is chained as the cause.
    """
    # .get() so that a year missing from the CSV header is treated like an
    # empty cell instead of raising KeyError.
    value = l.get(year_str)
    if not value:
        return
    try:
        node = cons.INDICATOR_TEMP_MAP[indicator][1].substitute(
            location=place_dcid,
            location_abbr=l[cons.COUNTRY_CODE],
            observation_date=year_str,
            measured_value=value)
        _out_file[year_str].write(node + '\n')
    except Exception as e:
        counters.add_counter('error-obs', 1)
        raise app.UsageError(
            f'Unable to write Obs for {indicator} country '
            f'{l.get(cons.COUNTRY_CODE)} year {year_str} : mVal={value}'
        ) from e
    else:
        counters.add_counter('output-obs', 1)
        counters.add_counter(f'output-obs-{year_str}', 1)


def _open_output(stack, path):
    """Open `path` for writing and register it on `stack` for cleanup."""
    handle = file_util.FileIO(path, 'w')
    # FileIO is used as a context manager for the input file, so entering it
    # here makes the ExitStack run its __exit__ when processing ends. The
    # FileIO object itself (not the __enter__ result) is returned because the
    # writers call .write() directly on it.
    # NOTE(review): assumes FileIO.__exit__ flushes/closes the file - confirm
    # against util/file_util.py.
    stack.enter_context(handle)
    return handle


def process_file(input_file: str, output_dir: str, pops_file_name: str):
    """Process the input CSV and writes the MCF.

    Writes one population MCF file (`pops_file_name`) and one
    `wdi<year>.mcf` file per year column into `output_dir`. The year range is
    taken from the CSV header: the column at cons.START_YEAR_COLUMN is the
    first year and the last column is the final year.

    Args:
        input_file: Path of the WDI CSV file.
        output_dir: Directory for the generated MCF files.
        pops_file_name: File name of the population MCF file.

    Raises:
        app.UsageError: If a node cannot be rendered or written.
    """
    counters = Counters()

    # Load list of existing countries
    country_codes = file_util.file_load_csv_dict(os.path.join(
        os.path.dirname(_SCRIPT_DIR), 'wdi', 'WorldBankCountries.csv'),
                                                 key_column='ISO3166Alpha3',
                                                 value_column='CountryName')

    with contextlib.ExitStack() as stack:
        # Registered first, so it runs last: forget the handles once they are
        # closed so a later call cannot reuse a closed file.
        stack.callback(_out_file.clear)

        # Create output file
        _out_file[cons.POPS_FILE] = _open_output(
            stack, os.path.join(output_dir, pops_file_name))

        counters.add_counter('total',
                             file_util.file_estimate_num_rows(input_file))
        with file_util.FileIO(input_file, 'r') as f_in:
            dict_reader = csv.DictReader(f_in)
            columns = dict_reader.fieldnames
            start_year = int(columns[cons.START_YEAR_COLUMN])
            end_year = int(columns[-1])

            # Create per-year output files
            for yr in range(start_year, end_year + 1):
                year_str = str(yr)
                if year_str not in _out_file:
                    _out_file[year_str] = _open_output(
                        stack,
                        os.path.join(output_dir, 'wdi' + year_str + '.mcf'))

            for row in dict_reader:
                process_line(row, start_year, end_year, country_codes,
                             counters)
                counters.add_counter('processed', 1)


def main(argv):
    """absl entry point: convert the CSV named by the flags into MCF files."""
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    process_file(_FLAGS.input_file, _FLAGS.output_dir, _FLAGS.pops_file_name)


if __name__ == '__main__':
    app.run(main)