diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py
index 593cafbfb..9ee1c1a70 100644
--- a/neuralprophet/data/process.py
+++ b/neuralprophet/data/process.py
@@ -272,10 +272,29 @@ def _prepare_dataframe_to_predict(model, df: pd.DataFrame, max_lags: int, freq:
         if len(df_i.columns) == 1 and "ds" in df_i:
             if max_lags != 0:
                 raise ValueError("only datestamps provided but y values needed for auto-regression.")
-            df_i = _check_dataframe(model, df_i, check_y=False, exogenous=False)
+            df_i, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+                df=df_i,
+                n_forecasts=model.n_forecasts,
+                n_lags=model.n_lags,
+                check_y=False,
+                covariates=None,
+                regressors=None,
+                events=None,
+                seasonalities=None,
+            )
+
         else:
-            df_i = _check_dataframe(model, df_i, check_y=model.max_lags > 0, exogenous=False)
-            # fill in missing nans except for nans at end
+            df_i, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+                df=df_i,
+                n_forecasts=model.n_forecasts,
+                n_lags=model.n_lags,
+                check_y=model.max_lags > 0,
+                covariates=None,
+                regressors=None,
+                events=None,
+                seasonalities=None,
+            )
+
         df_i = _handle_missing_data(
             df=df_i,
             freq=freq,
@@ -368,67 +387,6 @@ def _validate_column_name(
         raise ValueError(f"Name {name!r} already used for an added regressor.")
 
 
-def _check_dataframe(
-    model,
-    df: pd.DataFrame,
-    check_y: bool = True,
-    exogenous: bool = True,
-    future: Optional[bool] = None,
-) -> pd.DataFrame:
-    """Performs basic data sanity checks and ordering
-
-    Prepare dataframe for fitting or predicting.
-
-    Parameters
-    ----------
-        df : pd.DataFrame
-            dataframe containing column ``ds``, ``y``, and optionally``ID`` with all data
-        check_y : bool
-            if df must have series values
-
-            Note
-            ----
-            set to True if training or predicting with autoregression
-        exogenous : bool
-            whether to check covariates, regressors and events column names
-        future : bool
-            whether this function is called by make_future_dataframe()
-
-    Returns
-    -------
-        pd.DataFrame
-            checked dataframe
-    """
-    if len(df) < (model.n_forecasts + model.n_lags) and not future:
-        raise ValueError(
-            "Dataframe has less than n_forecasts + n_lags rows. "
-            "Forecasting not possible. Please either use a larger dataset, or adjust the model parameters."
-        )
-    df, _, _, _ = df_utils.prep_or_copy_df(df)
-    df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
-        df=df,
-        check_y=check_y,
-        covariates=model.config_lagged_regressors if exogenous else None,
-        regressors=model.config_regressors if exogenous else None,
-        events=model.config_events if exogenous else None,
-        seasonalities=model.config_seasonality if exogenous else None,
-        future=True if future else None,
-    )
-    if model.config_regressors is not None:
-        for reg in regressors_to_remove:
-            log.warning(f"Removing regressor {reg} because it is not present in the data.")
-            model.config_regressors.pop(reg)
-        if len(model.config_regressors) == 0:
-            model.config_regressors = None
-    if model.config_lagged_regressors is not None:
-        for reg in lag_regressors_to_remove:
-            log.warning(f"Removing lagged regressor {reg} because it is not present in the data.")
-            model.config_lagged_regressors.pop(reg)
-        if len(model.config_lagged_regressors) == 0:
-            model.config_lagged_regressors = None
-    return df
-
-
 def _handle_missing_data(
     df: pd.DataFrame,
     freq: Optional[str],
diff --git a/neuralprophet/data/split.py b/neuralprophet/data/split.py
index 164c75ef6..19a031e2a 100644
--- a/neuralprophet/data/split.py
+++ b/neuralprophet/data/split.py
@@ -7,7 +7,6 @@
 
 from neuralprophet import df_utils
 from neuralprophet.configure import ConfigEvents, Regressor
-from neuralprophet.data.process import _check_dataframe
 
 log = logging.getLogger("NP.data.splitting")
 
@@ -47,6 +46,9 @@ def _maybe_extend_df(
     # Receives df with ID column
     periods_add = {}
     extended_df = pd.DataFrame()
+    if "ds" not in df and "y" in df:
+        dummy_ds = df_utils.create_dummy_datestamps(len(df))
+        df.insert(loc=0, column="ds", value=dummy_ds)
     for df_name, df_i in df.groupby("ID"):
         _ = df_utils.infer_frequency(df_i, n_lags=max_lags, freq=freq)
         # to get all forecasteable values with df given, maybe extend into future:
@@ -170,6 +172,15 @@ def _make_future_dataframe(
         ValueError
             If future values of all user specified regressors not provided.
     """
+    if "ds" not in df and "y" in df:
+        if model.config_seasonality is None:
+            dummy_ds = df_utils.create_dummy_datestamps(len(df))
+            df.insert(loc=0, column="ds", value=dummy_ds)
+        else:
+            raise ValueError(
+                "Provided dataframe has no column 'ds'. "
+                "To continue with dummy equidistant datestamps, disable seasonality."
+            )
     # Receives df with single ID column
     assert len(df["ID"].unique()) == 1
     if periods == 0 and n_historic_predictions is True:
@@ -239,9 +250,30 @@ def _make_future_dataframe(
     if len(df) > 0:
         if len(df.columns) == 1 and "ds" in df:
             assert max_lags == 0
-            df = _check_dataframe(model, df, check_y=False, exogenous=False)
+            df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+                df=df,
+                n_forecasts=model.n_forecasts,
+                n_lags=model.n_lags,
+                check_y=False,
+                covariates=None,
+                regressors=None,
+                events=None,
+                seasonalities=None,
+            )
+
         else:
-            df = _check_dataframe(model, df, check_y=max_lags > 0, exogenous=True, future=True)
+            df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+                df=df,
+                n_forecasts=model.n_forecasts,
+                n_lags=model.n_lags,
+                check_y=max_lags > 0,
+                covariates=model.config_lagged_regressors,
+                regressors=model.config_regressors,
+                events=model.config_events,
+                seasonalities=model.config_seasonality,
+                future=True,
+            )
+
     # future data
     # check for external events known in future
     if model.config_events is not None and periods > 0 and events_df is None:
diff --git a/neuralprophet/df_utils.py b/neuralprophet/df_utils.py
index d879f8408..c03767002 100644
--- a/neuralprophet/df_utils.py
+++ b/neuralprophet/df_utils.py
@@ -12,7 +12,6 @@
 
 if TYPE_CHECKING:
     from neuralprophet.configure import ConfigEvents, ConfigLaggedRegressors, ConfigSeasonality
-
 log = logging.getLogger("NP.df_utils")
 
 
@@ -41,7 +40,6 @@ def prep_or_copy_df(df: pd.DataFrame) -> tuple[pd.DataFrame, bool, bool, list[st
     """
    if not isinstance(df, pd.DataFrame):
         raise ValueError("Provided DataFrame (df) must be of pd.DataFrame type.")
-
     # Create a copy of the dataframe
     df_copy = df.copy(deep=True)
 
@@ -417,6 +415,37 @@ def normalize(df, data_params):
     return df
 
 
+def create_dummy_datestamps(
+    df_length, freq="S", startyear=1970, startmonth=1, startday=1, starthour=0, startminute=0, startsecond=0
+):
+    """
+    Helper function to create a dummy series of datestamps for equidistant data without ds.
+
+    Parameters
+    ----------
+        df_length : int
+            Length of dataframe
+        freq : str
+            Frequency of data recording, any valid frequency for pd.date_range, such as ``D`` or ``M``
+        startyear, startmonth, startday, starthour, startminute, startsecond : int
+            Defines the first datestamp
+    Returns
+    -------
+        pd.Series
+            series of dummy equidistant datestamps
+    """
+    log.info(
+        f"Provided dataframe has no column 'ds' - dummy equidistant datestamps added. Frequency={freq}. "
+        f"Consider calling 'df_utils.create_dummy_datestamps' to adjust ds."
+    )
+
+    startdate = pd.Timestamp(
+        year=startyear, month=startmonth, day=startday, hour=starthour, minute=startminute, second=startsecond
+    )
+    dummy_ds = pd.Series(pd.date_range(startdate, periods=df_length, freq=freq))
+    return dummy_ds.dt.strftime("%Y%m%d%H%M%S")
+
+
 def check_single_dataframe(df, check_y, covariates, regressors, events, seasonalities):
     """Performs basic data sanity checks and ordering as well as prepare dataframe for fitting or predicting.
 
@@ -515,6 +544,8 @@ def check_single_dataframe(df, check_y, covariates, regressors, events, seasonal
 
 def check_dataframe(
     df: pd.DataFrame,
+    n_forecasts: int = 1,
+    n_lags: int = 0,
     check_y: bool = True,
     covariates=None,
     regressors=None,
@@ -529,6 +560,10 @@ def check_dataframe(
     ----------
         df : pd.DataFrame
             containing column ``ds``
+        n_forecasts : int
+            number of forecasts to be made.
+        n_lags : int
+            number of previous time steps of the series to use as input (auto-regression)
         check_y : bool
             if df must have series values
             set to True if training or predicting with autoregression
@@ -548,8 +583,13 @@ def check_dataframe(
         pd.DataFrame or dict
             checked dataframe
     """
-    df, _, _, _ = prep_or_copy_df(df)
     checked_df = pd.DataFrame()
+    if len(df) < (n_forecasts + n_lags) and not future:
+        raise ValueError(
+            "Dataframe has less than n_forecasts + n_lags rows. "
+            "Forecasting not possible. Please either use a larger dataset, or adjust the model parameters."
+        )
+    df, _, _, _ = prep_or_copy_df(df)
     for df_name, df_i in df.groupby("ID"):
         df_aux = check_single_dataframe(df_i, check_y, covariates, regressors, events, seasonalities)
         df_aux = df_aux.copy(deep=True)
diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py
index 89a4d384a..934fead30 100644
--- a/neuralprophet/forecaster.py
+++ b/neuralprophet/forecaster.py
@@ -15,7 +15,6 @@
 
 from neuralprophet import configure, df_utils, np_types, time_dataset, time_net, utils, utils_metrics
 from neuralprophet.data.process import (
-    _check_dataframe,
     _convert_raw_predictions_to_raw_df,
     _create_dataset,
     _handle_missing_data,
@@ -926,7 +925,41 @@ def fit(
         # Pre-processing
         # Copy df and save list of unique time series IDs (the latter for global-local modelling if enabled)
         df, _, _, self.id_list = df_utils.prep_or_copy_df(df)
-        df = _check_dataframe(self, df, check_y=True, exogenous=True)
+        # Checking if ds is in df
+        dummy_ds_act = False
+        if "ds" not in df and "y" in df:
+            dummy_ds_act = True
+            dummy_ds = df_utils.create_dummy_datestamps(len(df))
+            df.insert(loc=0, column="ds", value=dummy_ds)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=True,
+            covariates=self.config_lagged_regressors,
+            regressors=self.config_regressors,
+            events=self.config_events,
+            seasonalities=self.config_seasonality,
+        )
+        # Adjusting model properties
+        if self.config_seasonality is not None and dummy_ds_act is True:
+            for name, period in self.config_seasonality.periods.items():
+                resolution = 0
+                log.warning(f"Disabling {name} seasonality due to missing datestamps.")
+                self.config_seasonality.periods[name].resolution = resolution
+        if self.config_regressors is not None:
+            for reg in regressors_to_remove:
+                log.warning(f"Removing regressor {reg} because it is not present in the data.")
+                self.config_regressors.pop(reg)
+            if len(self.config_regressors) == 0:
+                self.config_regressors = None
+        if self.config_lagged_regressors is not None:
+            for reg in lag_regressors_to_remove:
+                log.warning(f"Removing lagged regressor {reg} because it is not present in the data.")
+                self.config_lagged_regressors.pop(reg)
+            if len(self.config_lagged_regressors) == 0:
+                self.config_lagged_regressors = None
+
         self.data_freq = df_utils.infer_frequency(df, n_lags=self.max_lags, freq=freq)
         df = _handle_missing_data(
             df=df,
@@ -969,7 +1002,20 @@ def fit(
             )
         else:
             df_val, _, _, _ = df_utils.prep_or_copy_df(validation_df)
-            df_val = _check_dataframe(self, df_val, check_y=False, exogenous=False)
+            if "ds" not in df_val and "y" in df_val:
+                dummy_ds = df_utils.create_dummy_datestamps(len(df_val))
+                df_val.insert(loc=0, column="ds", value=dummy_ds)
+            df_val, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+                df=df_val,
+                n_forecasts=self.n_forecasts,
+                n_lags=self.n_lags,
+                check_y=False,
+                covariates=None,
+                regressors=None,
+                events=None,
+                seasonalities=None,
+            )
+
             df_val = _handle_missing_data(
                 df=df_val,
                 freq=self.data_freq,
@@ -1113,7 +1159,20 @@ def test(self, df: pd.DataFrame):
         df, _, _, _ = df_utils.prep_or_copy_df(df)
         if self.fitted is False:
             log.warning("Model has not been fitted. Test results will be random.")
-        df = _check_dataframe(self, df, check_y=True, exogenous=True)
+        if "ds" not in df and "y" in df:
+            dummy_ds = df_utils.create_dummy_datestamps(len(df))
+            df.insert(loc=0, column="ds", value=dummy_ds)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=True,
+            covariates=self.config_lagged_regressors,
+            regressors=self.config_regressors,
+            events=self.config_events,
+            seasonalities=self.config_seasonality,
+        )
+
         freq = df_utils.infer_frequency(df, n_lags=self.max_lags, freq=self.data_freq)
         df = _handle_missing_data(
             df=df,
@@ -1253,7 +1312,20 @@ def split_df(self, df: pd.DataFrame, freq: str = "auto", valid_p: float = 0.2, l
             2  2022-12-13  8.30  data3
         """
         df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df)
-        df = _check_dataframe(self, df, check_y=False, exogenous=False)
+        if "ds" not in df and "y" in df:
+            dummy_ds = df_utils.create_dummy_datestamps(len(df))
+            df.insert(loc=0, column="ds", value=dummy_ds)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=False,
+            covariates=None,
+            regressors=None,
+            events=None,
+            seasonalities=None,
+        )
+
         freq = df_utils.infer_frequency(df, n_lags=self.max_lags, freq=freq)
         df = _handle_missing_data(
             df=df,
@@ -1442,7 +1514,20 @@ def crossvalidation_split_df(
             2  2022-12-10  7.55  data3
         """
         df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df)
-        df = _check_dataframe(self, df, check_y=False, exogenous=False)
+        if "ds" not in df and "y" in df:
+            dummy_ds = df_utils.create_dummy_datestamps(len(df))
+            df.insert(loc=0, column="ds", value=dummy_ds)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=False,
+            covariates=None,
+            regressors=None,
+            events=None,
+            seasonalities=None,
+        )
+
         freq = df_utils.infer_frequency(df, n_lags=self.max_lags, freq=freq)
         df = _handle_missing_data(
             df=df,
@@ -1506,7 +1591,20 @@ def double_crossvalidation_split_df(
                 elements same as :meth:`crossvalidation_split_df` returns
         """
         df, _, _, _ = df_utils.prep_or_copy_df(df)
-        df = _check_dataframe(self, df, check_y=False, exogenous=False)
+        if "ds" not in df and "y" in df:
+            dummy_ds = df_utils.create_dummy_datestamps(len(df))
+            df.insert(loc=0, column="ds", value=dummy_ds)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=False,
+            covariates=None,
+            regressors=None,
+            events=None,
+            seasonalities=None,
+        )
+
         freq = df_utils.infer_frequency(df, n_lags=self.max_lags, freq=freq)
         df = _handle_missing_data(
             df=df,
@@ -1552,7 +1650,17 @@ def create_df_with_events(self, df: pd.DataFrame, events_df: pd.DataFrame):
                 "before creating the data with events features"
             )
         df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df)
-        df = _check_dataframe(self, df, check_y=True, exogenous=False)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=True,
+            covariates=None,
+            regressors=None,
+            events=None,
+            seasonalities=None,
+        )
+
         df_dict_events = df_utils.create_dict_for_events_or_regressors(df, events_df, "events")
         df_created = pd.DataFrame()
         for df_name, df_i in df.groupby("ID"):
@@ -1711,7 +1819,20 @@ def predict_trend(self, df: pd.DataFrame, quantile: float = 0.5):
             raise ValueError("The quantile specified need to be a float in-between (0,1)")
 
         df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df)
-        df = _check_dataframe(self, df, check_y=False, exogenous=False)
+        if "ds" not in df and "y" in df:
+            dummy_ds = df_utils.create_dummy_datestamps(len(df))
+            df.insert(loc=0, column="ds", value=dummy_ds)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=False,
+            covariates=None,
+            regressors=None,
+            events=None,
+            seasonalities=None,
+        )
+
         df = _normalize(df=df, config_normalization=self.config_normalization)
         df_trend = pd.DataFrame()
         for df_name, df_i in df.groupby("ID"):
@@ -1755,7 +1876,17 @@ def predict_seasonal_components(self, df: pd.DataFrame, quantile: float = 0.5):
             raise ValueError("The quantile specified need to be a float in-between (0,1)")
 
         df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df)
-        df = _check_dataframe(self, df, check_y=False, exogenous=False)
+        df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
+            df=df,
+            n_forecasts=self.n_forecasts,
+            n_lags=self.n_lags,
+            check_y=False,
+            covariates=None,
+            regressors=None,
+            events=None,
+            seasonalities=None,
+        )
+
         df = _normalize(df=df, config_normalization=self.config_normalization)
         df_seasonal = pd.DataFrame()
         for df_name, df_i in df.groupby("ID"):
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 5ce37b25b..70d110f0e 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -25,6 +25,19 @@
 PLOT = False
 
 
+def test_create_dummy_datestamps():
+    df = pd.read_csv(PEYTON_FILE, nrows=NROWS)
+    df_drop = df.drop("ds", axis=1)
+
+    m = NeuralProphet(quantiles=[0.02, 0.98], epochs=10, weekly_seasonality=True)
+    _ = m.fit(df=df_drop, freq="S")
+    _ = m.split_df(df=df_drop)
+    _ = m.predict(df=df_drop)
+
+    future = m.make_future_dataframe(df=df_drop, periods=365, n_historic_predictions=True)
+    forecast = m.predict(df=future)
+
+
 def test_save_load():
     df = pd.read_csv(PEYTON_FILE, nrows=NROWS)
     m = NeuralProphet(
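
Usage sketch (not part of the patch): the snippet below shows how the new df_utils.create_dummy_datestamps helper and the relocated length check in df_utils.check_dataframe fit together for a y-only dataframe. The example data (df_no_ds) and the chosen keyword values are illustrative assumptions; the calls mirror what the patched fit() does before validation.

    import pandas as pd

    from neuralprophet import df_utils

    # equidistant observations recorded without timestamps (hypothetical example data)
    df_no_ds = pd.DataFrame({"y": [8.1, 8.3, 8.6, 8.2, 8.0, 7.9, 8.4, 8.5]})

    # create_dummy_datestamps returns a pd.Series of "%Y%m%d%H%M%S" strings, one per row,
    # starting at 1970-01-01 00:00:00 and spaced by the given pandas frequency (default "S")
    dummy_ds = df_utils.create_dummy_datestamps(len(df_no_ds), freq="S")
    df_no_ds.insert(loc=0, column="ds", value=dummy_ds)

    # check_dataframe now takes n_forecasts and n_lags and raises if
    # len(df) < n_forecasts + n_lags (previously enforced in the removed _check_dataframe)
    df_checked, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe(
        df=df_no_ds,
        n_forecasts=1,
        n_lags=0,
        check_y=True,
        covariates=None,
        regressors=None,
        events=None,
        seasonalities=None,
    )

With the default freq="S" the dummy datestamps advance one second per row; any frequency string accepted by pd.date_range (e.g. "D") can be passed to change the spacing.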