Skip to content

Commit

Permalink
Support paginated node results. (#217)
Browse files Browse the repository at this point in the history
  • Loading branch information
keyurva authored Sep 20, 2023
1 parent 850ca65 commit ec383aa
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 20 deletions.
12 changes: 6 additions & 6 deletions simple/stats/sample/powerplants/debug_resolve.csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@ name,dcid,link
FOO BAR,*UNRESOLVED*,
BAZ BAR,*UNRESOLVED*,
Suzlon Project,dc/000qxlm93vn93,https://datacommons.org/browser/dc/000qxlm93vn93
Crete Energy Venture,dc/009cxnrd9h8x6,https://datacommons.org/browser/dc/009cxnrd9h8x6
Watchtower Educational Center,dc/00d76gnyx8p7b,https://datacommons.org/browser/dc/00d76gnyx8p7b
Union Power,dc/00jy62n5m9bt9,https://datacommons.org/browser/dc/00jy62n5m9bt9
Crete Energy Venture,dc/5c7tz3lbln3p,https://datacommons.org/browser/dc/5c7tz3lbln3p
Watchtower Educational Center,dc/8zmh7ctlkbsc4,https://datacommons.org/browser/dc/8zmh7ctlkbsc4
Union Power,dc/2ysvc67fk1162,https://datacommons.org/browser/dc/2ysvc67fk1162
Pearl Station,dc/00w9rbw8yn7x7,https://datacommons.org/browser/dc/00w9rbw8yn7x7
Austin Gas Recovery,dc/00zjgb4rjchx3,https://datacommons.org/browser/dc/00zjgb4rjchx3
Gordon,dc/011s19rm0mzh1,https://datacommons.org/browser/dc/011s19rm0mzh1
White River Lock and Dam 2,dc/017y3py1dzkmg,https://datacommons.org/browser/dc/017y3py1dzkmg
Bristol Plant,dc/01blq25mdxzs5,https://datacommons.org/browser/dc/01blq25mdxzs5
Edison Sault,dc/01xe39q7j5x45,https://datacommons.org/browser/dc/01xe39q7j5x45
Bristol Plant,dc/4359q0h458f01,https://datacommons.org/browser/dc/4359q0h458f01
Edison Sault,dc/3kds7zgl4wz26,https://datacommons.org/browser/dc/3kds7zgl4wz26
Navajo Dam,dc/02b53twnh3fx,https://datacommons.org/browser/dc/02b53twnh3fx
CNN Center,dc/0lh5h07dsvl23,https://datacommons.org/browser/dc/0lh5h07dsvl23
CNN Center,dc/dk2p9l3l8x1b6,https://datacommons.org/browser/dc/dk2p9l3l8x1b6
18 changes: 9 additions & 9 deletions simple/stats/sample/powerplants/observations.csv
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
dcid,variable,date,value
dc/000qxlm93vn93,var1,2023,0.19
dc/009cxnrd9h8x6,var1,2023,0.21
dc/00d76gnyx8p7b,var1,2023,0.29
dc/00jy62n5m9bt9,var1,2023,0.31
dc/5c7tz3lbln3p,var1,2023,0.21
dc/8zmh7ctlkbsc4,var1,2023,0.29
dc/2ysvc67fk1162,var1,2023,0.31
dc/00w9rbw8yn7x7,var1,2023,0.37
dc/00zjgb4rjchx3,var1,2023,0.5
dc/011s19rm0mzh1,var1,2023,0.52
dc/017y3py1dzkmg,var1,2023,0.76
dc/000qxlm93vn93,var2,2023,6.0
dc/009cxnrd9h8x6,var2,2023,56.0
dc/00d76gnyx8p7b,var2,2023,6.0
dc/00jy62n5m9bt9,var2,2023,34.0
dc/5c7tz3lbln3p,var2,2023,56.0
dc/8zmh7ctlkbsc4,var2,2023,6.0
dc/2ysvc67fk1162,var2,2023,34.0
dc/00w9rbw8yn7x7,var2,2023,76.0
dc/00zjgb4rjchx3,var2,2023,34.0
dc/011s19rm0mzh1,var2,2023,92.0
dc/017y3py1dzkmg,var2,2023,9.0
dc/01blq25mdxzs5,var2,2023,34.0
dc/01xe39q7j5x45,var2,2023,42.0
dc/4359q0h458f01,var2,2023,34.0
dc/3kds7zgl4wz26,var2,2023,42.0
dc/02b53twnh3fx,var2,2023,75.0
dc/0lh5h07dsvl23,var2,2023,65.0
dc/dk2p9l3l8x1b6,var2,2023,65.0
28 changes: 23 additions & 5 deletions simple/util/dc_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import os
import requests
from absl import logging

from .ngram_matcher import NgramMatcher

Expand All @@ -34,6 +35,8 @@ def get_api_key():
_RESOLVE_PLACE_TYPES = set(
["Place", "Continent", "Country", "State", "Province", "City"])

_MAX_NODES = 10_000


# See: https://docs.datacommons.org/api/rest/v2/resolve
def resolve_entities(entities: list[str],
Expand Down Expand Up @@ -70,9 +73,20 @@ def resolve_place_entities(entities: list[str],
# See: https://docs.datacommons.org/api/rest/v2/node
def resolve_non_place_entities(entities: list[str],
entity_type: str = None) -> dict[str, str]:
all_entities = get_entities_of_type(entity_type=entity_type)
ngrams = NgramMatcher()
ngrams.add_keys_values(all_entities)

all_entities, next_token = get_entities_of_type(entity_type=entity_type)
while True:
ngrams.add_keys_values(all_entities)
if ngrams.get_tuples_count() >= _MAX_NODES:
logging.warning("Nodes fetched truncated to: %s",
ngrams.get_tuples_count())
break
if next_token:
all_entities, next_token = get_entities_of_type(
entity_type=entity_type, next_token=next_token)
else:
break

resolved: dict[str, str] = {}
for entity in entities:
Expand All @@ -84,13 +98,17 @@ def resolve_non_place_entities(entities: list[str],
return resolved


# TODO: Support pagination.
# TODO: Cache results to file and return from cache if present.
def get_entities_of_type(entity_type: str) -> dict[str, str]:
def get_entities_of_type(entity_type: str,
next_token: str = None) -> (dict[str, str], str):
data = {
"nodes": [entity_type],
"property": "<-typeOf",
}
if next_token:
data["nextToken"] = next_token

logging.info("Fetching nodes: %s", data)
response = post(path="/v2/node", data=data)

result: dict[str, str] = {}
Expand All @@ -104,7 +122,7 @@ def get_entities_of_type(entity_type: str) -> dict[str, str]:
if name and dcid:
result[name] = dcid

return result
return result, response.get("nextToken", "")


def post(path: str, data={}) -> dict:
Expand Down
3 changes: 3 additions & 0 deletions simple/util/ngram_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ def __init__(self, config: dict = {}):
# { '<ngram>': { (id1, pos1), (id2, pos2), ...}, ...}
self._ngram_dict = {}

def get_tuples_count(self):
    """Return how many entries are currently held in ``self._key_values``."""
    count = len(self._key_values)
    return count

def add_keys_values(self, kvs: dict[str, any]) -> None:
    """Register every pair in ``kvs`` via ``self.add_key_value``.

    Pairs are forwarded one at a time, in the dict's iteration order.
    """
    # NOTE(review): `any` in the annotation is the builtin function, not
    # `typing.Any` -- confirm the intended type and fix alongside the imports.
    for pair in kvs.items():
        self.add_key_value(*pair)
Expand Down

0 comments on commit ec383aa

Please sign in to comment.