Merge pull request #187 from leaf-ai/fix_standard_predictor
#186 Make standard predictor predict from the day before start_date
ofrancon authored Jan 19, 2021
2 parents e738bc5 + a562e2e commit 4850feb
Showing 1 changed file with 27 additions and 18 deletions.
covid_xprize/standard_predictor/xprize_predictor.py (45 changes: 27 additions & 18 deletions)
@@ -89,8 +89,6 @@ def __init__(self, path_to_model_weights, data_url):
             f"Data file not found at {DATA_FILE_PATH}"
 
         self.df = self._prepare_dataframe(data_url)
-        geos = self.df.GeoID.unique()
-        self.country_samples = self._create_country_samples(self.df, geos)
 
     def predict(self,
                 start_date_str: str,
@@ -109,10 +107,15 @@ def predict(self,
             "Date": [],
             "PredictedDailyNewCases": []}
 
+        # Fix for past predictions
+        geos = npis_df.GeoID.unique()
+        truncated_df = self.df[self.df.Date < start_date_str]
+        country_samples = self._create_country_samples(truncated_df, geos, False)
+
         # For each requested geo
-        geos = npis_df.GeoID.unique()
         for g in geos:
-            cdf = self.df[self.df.GeoID == g]
+            cdf = truncated_df[truncated_df.GeoID == g]
             if len(cdf) == 0:
                 # we don't have historical data for this geo: return zeroes
                 pred_new_cases = [0] * nb_days
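The heart of the fix is `truncated_df`: prediction inputs are now built only from rows strictly before `start_date_str`, so a request for a past window no longer leaks ground truth from on or after the start date into the seed context. A minimal sketch of the truncation, using hypothetical toy data (column names follow the surrounding code):

    import pandas as pd

    # Hypothetical toy history for one geo; the real data is the Oxford dataset.
    df = pd.DataFrame({
        "GeoID": ["France"] * 4,
        "Date": pd.to_datetime(["2020-12-29", "2020-12-30",
                                "2020-12-31", "2021-01-01"]),
        "ConfirmedCases": [100.0, 110.0, 125.0, 140.0],
    })

    start_date_str = "2021-01-01"
    truncated_df = df[df.Date < start_date_str]  # drops 2021-01-01 and later

    # The last remaining row (2020-12-31) is the "day before start_date"
    # from which the predictor now seeds its rollout.
    assert truncated_df.Date.max() == pd.Timestamp("2020-12-31")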
@@ -123,7 +126,7 @@ def predict(self,
                 geo_start_date = min(last_known_date + np.timedelta64(1, 'D'), start_date)
                 npis_gdf = npis_df[(npis_df.Date >= geo_start_date) & (npis_df.Date <= end_date)]
 
-                pred_new_cases = self._get_new_cases_preds(cdf, g, npis_gdf)
+                pred_new_cases = self._get_new_cases_preds(cdf, g, npis_gdf, country_samples)
 
             # Append forecast data to results to return
             country = npis_df[npis_df.GeoID == g].iloc[0].CountryName
@@ -139,10 +142,10 @@ def predict(self,
         # Return only the requested predictions
         return forecast_df[(forecast_df.Date >= start_date) & (forecast_df.Date <= end_date)]
 
-    def _get_new_cases_preds(self, c_df, g, npis_df):
+    def _get_new_cases_preds(self, c_df, g, npis_df, country_samples):
         cdf = c_df[c_df.ConfirmedCases.notnull()]
-        initial_context_input = self.country_samples[g]['X_test_context'][-1]
-        initial_action_input = self.country_samples[g]['X_test_action'][-1]
+        initial_context_input = country_samples[g]['X_test_context'][-1]
+        initial_action_input = country_samples[g]['X_test_action'][-1]
         # Predictions with passed npis
         cnpis_df = npis_df[npis_df.GeoID == g]
         npis_sequence = np.array(cnpis_df[NPI_COLUMNS])
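With the cache gone from `__init__`, `country_samples` travels as an explicit argument, and the `[-1]` indexing picks the most recent lookback window as the seed for the autoregressive rollout. A shape sketch under assumed constants (`NB_LOOKBACK_DAYS = 21`, 12 NPI columns, and `NB_TEST_DAYS = 14` are illustrative assumptions, not values confirmed by this diff):

    import numpy as np

    NB_LOOKBACK_DAYS = 21  # assumed value
    N_NPIS = 12            # assumed NPI column count
    NB_TEST_DAYS = 14      # assumed value

    # country_samples[g]['X_test_context'] stacks one lookback window per
    # test day; X_test_action does the same for the NPI values.
    X_test_context = np.zeros((NB_TEST_DAYS, NB_LOOKBACK_DAYS, 1))
    X_test_action = np.zeros((NB_TEST_DAYS, NB_LOOKBACK_DAYS, N_NPIS))

    initial_context_input = X_test_context[-1]  # (21, 1): latest case-ratio window
    initial_action_input = X_test_action[-1]    # (21, 12): latest NPI window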
@@ -284,11 +287,12 @@ def _load_additional_context_df():
         return additional_context_df
 
     @staticmethod
-    def _create_country_samples(df: pd.DataFrame, geos: list) -> dict:
+    def _create_country_samples(df: pd.DataFrame, geos: list, is_training: bool) -> dict:
         """
         For each country, creates numpy arrays for Keras
         :param df: a Pandas DataFrame with historical data for countries (the "Oxford" dataset)
         :param geos: a list of geo names
+        :param is_training: True if the data will be used for training, False if it's used for predicting
         :return: a dictionary of train and test sets, for each specified country
         """
         context_column = 'PredictionRatio'
@@ -300,30 +304,35 @@ def _create_country_samples(df: pd.DataFrame, geos: list) -> dict:
             cdf = cdf[cdf.ConfirmedCases.notnull()]
             context_data = np.array(cdf[context_column])
             action_data = np.array(cdf[action_columns])
-            outcome_data = np.array(cdf[outcome_column])
             context_samples = []
             action_samples = []
-            outcome_samples = []
-            nb_total_days = outcome_data.shape[0]
+            if is_training:
+                outcome_data = np.array(cdf[outcome_column])
+                outcome_samples = []
+                nb_total_days = context_data.shape[0]
+            else:
+                nb_total_days = context_data.shape[0] + 1
             for d in range(NB_LOOKBACK_DAYS, nb_total_days):
                 context_samples.append(context_data[d - NB_LOOKBACK_DAYS:d])
                 action_samples.append(action_data[d - NB_LOOKBACK_DAYS:d])
-                outcome_samples.append(outcome_data[d])
-            if len(outcome_samples) > 0:
+                if is_training:
+                    outcome_samples.append(outcome_data[d])
+            if len(context_samples) > 0:
                 X_context = np.expand_dims(np.stack(context_samples, axis=0), axis=2)
                 X_action = np.stack(action_samples, axis=0)
-                y = np.stack(outcome_samples, axis=0)
                 country_samples[g] = {
                     'X_context': X_context,
                     'X_action': X_action,
-                    'y': y,
                     'X_train_context': X_context[:-NB_TEST_DAYS],
                     'X_train_action': X_action[:-NB_TEST_DAYS],
-                    'y_train': y[:-NB_TEST_DAYS],
                     'X_test_context': X_context[-NB_TEST_DAYS:],
                     'X_test_action': X_action[-NB_TEST_DAYS:],
-                    'y_test': y[-NB_TEST_DAYS:],
                 }
+                if is_training:
+                    y = np.stack(outcome_samples, axis=0)
+                    country_samples[g]['y'] = y
+                    country_samples[g]['y_train'] = y[:-NB_TEST_DAYS]
+                    country_samples[g]['y_test'] = y[-NB_TEST_DAYS:]
         return country_samples
 
     # Function for performing roll outs into the future
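The `+ 1` in the prediction branch is what makes the predictor start from the day before `start_date` (issue #186). In training mode each window must be paired with a next-day label, so the loop stops at the last day that has a known outcome; in prediction mode no labels are collected, so the loop runs one step further and the final window ends on the last observed day, which becomes the rollout seed. A worked toy example of the indexing, with small assumed constants:

    import numpy as np

    NB_LOOKBACK_DAYS = 3                                  # assumed toy value
    context_data = np.array([1.0, 1.1, 0.9, 1.05, 0.95])  # 5 observed days

    # Training: nb_total_days = 5, so windows end at d = 3 and d = 4 and
    # each one is paired with outcome_data[d], a known next-day label.
    train_windows = [context_data[d - NB_LOOKBACK_DAYS:d]
                     for d in range(NB_LOOKBACK_DAYS, len(context_data))]

    # Prediction: nb_total_days = 5 + 1. The extra window for d = 5 spans
    # indices 2..4, the last three observed days: the rollout seed.
    pred_windows = [context_data[d - NB_LOOKBACK_DAYS:d]
                    for d in range(NB_LOOKBACK_DAYS, len(context_data) + 1)]

    assert len(pred_windows) == len(train_windows) + 1
    assert (pred_windows[-1] == context_data[-NB_LOOKBACK_DAYS:]).all()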
@@ -384,7 +393,7 @@ def _smooth_case_list(case_list, window):
     def train(self):
         print("Creating numpy arrays for Keras for each country...")
         geos = self._most_affected_geos(self.df, MAX_NB_COUNTRIES, NB_LOOKBACK_DAYS)
-        country_samples = self._create_country_samples(self.df, geos)
+        country_samples = self._create_country_samples(self.df, geos, True)
         print("Numpy arrays created")
 
         # Aggregate data for training
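End to end, `train` keeps building labeled samples (`is_training=True`) while `predict` builds unlabeled windows from the truncated history. A hedged usage sketch: the constructor signature matches the first hunk above, but the remaining `predict` arguments (end date and intervention-plan file) and all file paths are assumptions for illustration:

    from covid_xprize.standard_predictor.xprize_predictor import XPrizePredictor

    # Placeholder paths: substitute real model weights, Oxford data, and an IP file.
    predictor = XPrizePredictor("models/trained_model_weights.h5",
                                "data/OxCGRT_latest.csv")

    # start_date lies in the past: the rollout is now seeded from the day
    # before 2020-12-01 instead of from the end of the full dataset.
    preds_df = predictor.predict("2020-12-01", "2020-12-31", "data/future_ip.csv")
    print(preds_df.head())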