Skip to content

Commit

Permalink
Generate a CSV for debugging resolved dcids. (#216)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Sep 20, 2023
1 parent fd5877e commit 850ca65
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 15 deletions.
2 changes: 2 additions & 0 deletions simple/stats/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ The first 2 columns of input CSVs should be place names (or more generically _en

The output `observations.csv` can be imported directly into sqlite. A sample output CSV can be found [here](sample/countries/observations.csv).

The program also outputs a `debug_resolve.csv` file. This is for checking whether names were resolved to the correct DCIDs and for addressing any unresolved ones. A sample CSV can be found [here](sample/countries/debug_resolve.csv).

## Other options

To see all parameters and overrides supported by the script:
Expand Down
13 changes: 12 additions & 1 deletion simple/stats/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,24 @@

# Defaults.
DEFAULT_DATA_DIR = ".data"
DEFAULT_INPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "input")
DEFAULT_INPUT_PATH = os.path.join(DEFAULT_DATA_DIR, "input")
DEFAULT_OUTPUT_DIR = os.path.join(DEFAULT_DATA_DIR, "output")

OBSERVATIONS_FILE_NAME = "observations.csv"
DEBUG_RESOLVE_FILE_NAME = "debug_resolve.csv"

# Observations CSV columns.
COLUMN_DCID = "dcid"
COLUMN_VARIABLE = "variable"
COLUMN_DATE = "date"
COLUMN_VALUE = "value"

# Debug CSV columns and values
DEBUG_COLUMN_NAME = "name"
DEBUG_COLUMN_DCID = "dcid"
DEBUG_COLUMN_LINK = "link"
DEBUG_UNRESOLVED_DCID = "*UNRESOLVED*"

# DC links
DC_HOME = "https://datacommons.org"
DC_BROWSER = "https://datacommons.org/browser"
64 changes: 53 additions & 11 deletions simple/stats/importer.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,17 @@
# TODO: Add support for units.
class SimpleStatsImporter:

def __init__(self, input_dir: str, output_dir: str,
def __init__(self, input_path: str, output_dir: str,
entity_type: str) -> None:
self.input_dir = input_dir
self.input_path = input_path
self.output_dir = output_dir
self.observations_file = os.path.join(output_dir,
constants.OBSERVATIONS_FILE_NAME)
self.debug_resolve_file = os.path.join(
output_dir, constants.DEBUG_RESOLVE_FILE_NAME)
self.entity_type = entity_type
self.df = pd.DataFrame()
self.debug_resolve_df = None

def do_import(self) -> None:
self._init()
Expand All @@ -44,21 +47,29 @@ def do_import(self) -> None:
self._resolve_entities()
self._unpivot_variables()
self._reorder_columns()
self._write_csv()
self._write_csvs()

def _init(self):
    """Create the output directory, including missing parents, if absent."""
    target_dir = self.output_dir
    os.makedirs(target_dir, exist_ok=True)

def _read_csvs(self) -> None:
    """Load the input data into ``self.df``.

    When ``self.input_path`` is a directory it is read through
    ``_read_csvs_from_dir``; any other path is handed directly to
    ``pd.read_csv``. The number of rows read is logged afterwards.
    """
    source = self.input_path
    if not os.path.isdir(source):
        frame = pd.read_csv(source)
    else:
        frame = SimpleStatsImporter._read_csvs_from_dir(source)
    self.df = frame

    logging.info("Read %s rows.", self.df.index.size)

@staticmethod
def _read_csvs_from_dir(input_dir: str) -> pd.DataFrame:
files = [
os.path.join(self.input_dir, filename)
for filename in os.listdir(self.input_dir)
os.path.join(input_dir, filename)
for filename in os.listdir(input_dir)
]
df = pd.DataFrame()
for file in files:
df = pd.concat([df, pd.read_csv(file)])
logging.info("Read %s rows.", df.index.size)
self.df = df
return df

def _rename_columns(self) -> None:
df = self.df
Expand All @@ -77,13 +88,40 @@ def _resolve_entities(self) -> None:
logging.info("Resolved %s of %s entities.", len(dcids), len(entities))
column.replace(dcids, inplace=True)
unresolved = set(entities).difference(set(dcids.keys()))
if unresolved:
unresolved_list = list(unresolved)
unresolved_list = list(unresolved)
if unresolved_list:
logging.warning("# unresolved entities which will be dropped: %s",
len(unresolved_list))
logging.warning("Dropped entities: %s", unresolved_list)
df.drop(df[df.iloc[:, 0].isin(values=unresolved)].index,
df.drop(df[df.iloc[:, 0].isin(values=unresolved_list)].index,
inplace=True)
self._create_debug_resolve_dataframe(resolved=dcids,
unresolved=unresolved_list)

def _create_debug_resolve_dataframe(self, resolved: dict[str, str],
                                    unresolved: list[str]):
    """Build the debug dataframe and store it in ``self.debug_resolve_df``.

    The frame has one row per name with three columns: the name, its dcid
    and a browser link. Unresolved names are listed first, carry the
    unresolved placeholder as their dcid and have an empty link; resolved
    names follow in the iteration order of ``resolved``.

    Args:
        resolved: Mapping of names to the dcids they resolved to.
        unresolved: Names for which no dcid was found.
    """
    placeholder = constants.DEBUG_UNRESOLVED_DCID

    # Unresolved entries come first so they are easy to spot in the CSV.
    names = [*unresolved, *resolved.keys()]
    dcids = [*(placeholder for _ in unresolved), *resolved.values()]

    # Only real dcids get a browser link; placeholders get an empty cell.
    links = [
        "" if dcid == placeholder else f"{constants.DC_BROWSER}/{dcid}"
        for dcid in dcids
    ]

    columns = {
        constants.DEBUG_COLUMN_NAME: names,
        constants.DEBUG_COLUMN_DCID: dcids,
        constants.DEBUG_COLUMN_LINK: links,
    }
    self.debug_resolve_df = pd.DataFrame(columns)

def _unpivot_variables(self) -> None:
self.df = self.df.melt(
Expand All @@ -100,7 +138,11 @@ def _reorder_columns(self) -> None:
constants.COLUMN_VALUE,
])

def _write_csv(self) -> None:
def _write_csvs(self) -> None:
logging.info("Writing %s observations to: %s", self.df.index.size,
self.observations_file)
self.df.to_csv(self.observations_file, index=False)
if self.debug_resolve_df is not None:
logging.info("Writing resolutions (for debugging) to: %s",
self.debug_resolve_file)
self.debug_resolve_df.to_csv(self.debug_resolve_file, index=False)
6 changes: 3 additions & 3 deletions simple/stats/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@
None,
"The type of entities in the CSV (e.g. 'City', 'Country', 'Company', etc.).",
)
flags.DEFINE_string("input_dir", constants.DEFAULT_INPUT_DIR,
"The input directory.")
flags.DEFINE_string("input_path", constants.DEFAULT_INPUT_PATH,
"The input directory or file.")
flags.DEFINE_string("output_dir", constants.DEFAULT_OUTPUT_DIR,
"The output directory.")


def main(_):
importer = SimpleStatsImporter(
input_dir=FLAGS.input_dir,
input_path=FLAGS.input_path,
output_dir=FLAGS.output_dir,
entity_type=FLAGS.entity_type,
)
Expand Down
15 changes: 15 additions & 0 deletions simple/stats/sample/countries/debug_resolve.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name,dcid,link
West Bank and Gaza,*UNRESOLVED*,
Cabo Verde,*UNRESOLVED*,
Afghanistan,country/AFG,https://datacommons.org/browser/country/AFG
Albania,country/ALB,https://datacommons.org/browser/country/ALB
Algeria,country/DZA,https://datacommons.org/browser/country/DZA
American Samoa,country/ASM,https://datacommons.org/browser/country/ASM
Andorra,country/AND,https://datacommons.org/browser/country/AND
Angola,country/AGO,https://datacommons.org/browser/country/AGO
Anguilla,country/AIA,https://datacommons.org/browser/country/AIA
Wallis and Futuna Islands,country/WLF,https://datacommons.org/browser/country/WLF
Western Sahara,country/ESH,https://datacommons.org/browser/country/ESH
Yemen,country/YEM,https://datacommons.org/browser/country/YEM
Zambia,country/ZMB,https://datacommons.org/browser/country/ZMB
Zimbabwe,country/ZWE,https://datacommons.org/browser/country/ZWE
15 changes: 15 additions & 0 deletions simple/stats/sample/powerplants/debug_resolve.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name,dcid,link
FOO BAR,*UNRESOLVED*,
BAZ BAR,*UNRESOLVED*,
Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93
Crete Energy Venture,dc/009cxnrd9h8x6,https://datacommons.org/browser/dc/009cxnrd9h8x6
Watchtower Educational Center,dc/00d76gnyx8p7b,https://datacommons.org/browser/dc/00d76gnyx8p7b
Union Power,dc/00jy62n5m9bt9,https://datacommons.org/browser/dc/00jy62n5m9bt9
Pearl Station,dc/00w9rbw8yn7x7,https://datacommons.org/browser/dc/00w9rbw8yn7x7
Austin Gas Recovery,dc/00zjgb4rjchx3,https://datacommons.org/browser/dc/00zjgb4rjchx3
Gordon,dc/011s19rm0mzh1,https://datacommons.org/browser/dc/011s19rm0mzh1
White River Lock and Dam 2,dc/017y3py1dzkmg,https://datacommons.org/browser/dc/017y3py1dzkmg
Bristol Plant,dc/01blq25mdxzs5,https://datacommons.org/browser/dc/01blq25mdxzs5
Edison Sault,dc/01xe39q7j5x45,https://datacommons.org/browser/dc/01xe39q7j5x45
Navajo Dam,dc/02b53twnh3fx,https://datacommons.org/browser/dc/02b53twnh3fx
CNN Center,dc/0lh5h07dsvl23,https://datacommons.org/browser/dc/0lh5h07dsvl23

0 comments on commit 850ca65

Please sign in to comment.