diff --git a/docs/source/how-to-guides/feature-guides/global_local_trend.ipynb b/docs/source/how-to-guides/feature-guides/global_local_trend.ipynb index ca7fd3eff..1fada83e4 100644 --- a/docs/source/how-to-guides/feature-guides/global_local_trend.ipynb +++ b/docs/source/how-to-guides/feature-guides/global_local_trend.ipynb @@ -1263,28 +1263,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "Invalid frequency: NaT", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", - "Input \u001b[0;32mIn [27]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m future \u001b[38;5;241m=\u001b[39m m\u001b[38;5;241m.\u001b[39mmake_future_dataframe(df_test)\n\u001b[0;32m----> 2\u001b[0m forecast \u001b[38;5;241m=\u001b[39m \u001b[43mm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfuture\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m metrics \u001b[38;5;241m=\u001b[39m m\u001b[38;5;241m.\u001b[39mtest(df_test)\n\u001b[1;32m 4\u001b[0m forecast_trend \u001b[38;5;241m=\u001b[39m m\u001b[38;5;241m.\u001b[39mpredict_trend(df_test)\n", - "File \u001b[0;32m~/Desktop/code/neural_prophet/neuralprophet/forecaster.py:831\u001b[0m, in \u001b[0;36mNeuralProphet.predict\u001b[0;34m(self, df, decompose, raw)\u001b[0m\n\u001b[1;32m 829\u001b[0m df, received_ID_col, received_single_time_series, _ \u001b[38;5;241m=\u001b[39m df_utils\u001b[38;5;241m.\u001b[39mprep_or_copy_df(df)\n\u001b[1;32m 830\u001b[0m \u001b[38;5;66;03m# to get all forecasteable values with df given, maybe extend into future:\u001b[39;00m\n\u001b[0;32m--> 831\u001b[0m df, periods_added \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_maybe_extend_df\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 832\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_prepare_dataframe_to_predict(df)\n\u001b[1;32m 833\u001b[0m \u001b[38;5;66;03m# normalize\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/code/neural_prophet/neuralprophet/forecaster.py:2773\u001b[0m, in \u001b[0;36mNeuralProphet._maybe_extend_df\u001b[0;34m(self, df)\u001b[0m\n\u001b[1;32m 2771\u001b[0m extended_df \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39mDataFrame()\n\u001b[1;32m 2772\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_name, df_i \u001b[38;5;129;01min\u001b[39;00m df\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mID\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m-> 2773\u001b[0m _ \u001b[38;5;241m=\u001b[39m \u001b[43mdf_utils\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43minfer_frequency\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_i\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mn_lags\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_lags\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreq\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata_freq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2774\u001b[0m \u001b[38;5;66;03m# to get all forecasteable values with df given, maybe extend into future:\u001b[39;00m\n\u001b[1;32m 2775\u001b[0m periods_add[df_name] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_maybe_extend_periods(df_i)\n", - "File \u001b[0;32m~/Desktop/code/neural_prophet/neuralprophet/df_utils.py:1324\u001b[0m, in \u001b[0;36minfer_frequency\u001b[0;34m(df, freq, n_lags, min_freq_percentage)\u001b[0m\n\u001b[1;32m 1322\u001b[0m freq_df \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m()\n\u001b[1;32m 1323\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m df_name, df_i \u001b[38;5;129;01min\u001b[39;00m df\u001b[38;5;241m.\u001b[39mgroupby(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mID\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n\u001b[0;32m-> 1324\u001b[0m freq_df\u001b[38;5;241m.\u001b[39mappend(\u001b[43m_infer_frequency\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdf_i\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmin_freq_percentage\u001b[49m\u001b[43m)\u001b[49m)\n\u001b[1;32m 1325\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(\u001b[38;5;28mset\u001b[39m(freq_df)) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;129;01mand\u001b[39;00m n_lags \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 1327\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOne or more dataframes present different major frequencies, please make sure all dataframes present the same major frequency for auto-regression\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1328\u001b[0m )\n", - "File \u001b[0;32m~/Desktop/code/neural_prophet/neuralprophet/df_utils.py:1252\u001b[0m, in \u001b[0;36m_infer_frequency\u001b[0;34m(df, freq, min_freq_percentage)\u001b[0m\n\u001b[1;32m 1250\u001b[0m dominant_freq_percentage \u001b[38;5;241m=\u001b[39m distribution\u001b[38;5;241m.\u001b[39mmax() \u001b[38;5;241m/\u001b[39m \u001b[38;5;28mlen\u001b[39m(df[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mds\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 1251\u001b[0m num_freq \u001b[38;5;241m=\u001b[39m frequencies[np\u001b[38;5;241m.\u001b[39margmax(distribution)] \u001b[38;5;66;03m# get value of most common diff\u001b[39;00m\n\u001b[0;32m-> 1252\u001b[0m inferred_freq \u001b[38;5;241m=\u001b[39m \u001b[43mconvert_num_to_str_freq\u001b[49m\u001b[43m(\u001b[49m\u001b[43mnum_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdf\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mds\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43miloc\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1254\u001b[0m log\u001b[38;5;241m.\u001b[39minfo(\n\u001b[1;32m 1255\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMajor frequency \u001b[39m\u001b[38;5;132;01m{\u001b[39;00minferred_freq\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m corresponds to \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnp\u001b[38;5;241m.\u001b[39mround(dominant_freq_percentage \u001b[38;5;241m*\u001b[39m \u001b[38;5;241m100\u001b[39m, \u001b[38;5;241m3\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m% of the data.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1256\u001b[0m )\n\u001b[1;32m 1257\u001b[0m ideal_freq_exists \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;28;01mif\u001b[39;00m dominant_freq_percentage \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m min_freq_percentage \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mFalse\u001b[39;00m\n", - "File \u001b[0;32m~/Desktop/code/neural_prophet/neuralprophet/df_utils.py:1159\u001b[0m, in \u001b[0;36mconvert_num_to_str_freq\u001b[0;34m(freq_num, initial_time_stamp)\u001b[0m\n\u001b[1;32m 1144\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mconvert_num_to_str_freq\u001b[39m(freq_num, initial_time_stamp):\n\u001b[1;32m 1145\u001b[0m \u001b[38;5;124;03m\"\"\"Convert numeric frequencies into frequency tags\u001b[39;00m\n\u001b[1;32m 1146\u001b[0m \n\u001b[1;32m 1147\u001b[0m \u001b[38;5;124;03m Parameters\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1157\u001b[0m \u001b[38;5;124;03m frequency tag\u001b[39;00m\n\u001b[1;32m 1158\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m-> 1159\u001b[0m aux_ts \u001b[38;5;241m=\u001b[39m \u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdate_range\u001b[49m\u001b[43m(\u001b[49m\u001b[43minitial_time_stamp\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mperiods\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m100\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfreq\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpd\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_timedelta\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfreq_num\u001b[49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1160\u001b[0m freq_str \u001b[38;5;241m=\u001b[39m pd\u001b[38;5;241m.\u001b[39minfer_freq(aux_ts)\n\u001b[1;32m 1161\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m freq_str\n", - "File \u001b[0;32m~/Desktop/code/neural_prophet/env/lib/python3.8/site-packages/pandas/core/indexes/datetimes.py:1070\u001b[0m, in \u001b[0;36mdate_range\u001b[0;34m(start, end, periods, freq, tz, normalize, name, closed, inclusive, **kwargs)\u001b[0m\n\u001b[1;32m 1067\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m freq \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m com\u001b[38;5;241m.\u001b[39many_none(periods, start, end):\n\u001b[1;32m 1068\u001b[0m freq \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mD\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1070\u001b[0m dtarr \u001b[38;5;241m=\u001b[39m \u001b[43mDatetimeArray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_generate_range\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1071\u001b[0m \u001b[43m \u001b[49m\u001b[43mstart\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstart\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1072\u001b[0m \u001b[43m \u001b[49m\u001b[43mend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1073\u001b[0m \u001b[43m \u001b[49m\u001b[43mperiods\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mperiods\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1074\u001b[0m \u001b[43m \u001b[49m\u001b[43mfreq\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfreq\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1075\u001b[0m \u001b[43m \u001b[49m\u001b[43mtz\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtz\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1076\u001b[0m \u001b[43m \u001b[49m\u001b[43mnormalize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mnormalize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1077\u001b[0m \u001b[43m \u001b[49m\u001b[43minclusive\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclusive\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1078\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1079\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1080\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m DatetimeIndex\u001b[38;5;241m.\u001b[39m_simple_new(dtarr, name\u001b[38;5;241m=\u001b[39mname)\n", - "File \u001b[0;32m~/Desktop/code/neural_prophet/env/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py:409\u001b[0m, in \u001b[0;36mDatetimeArray._generate_range\u001b[0;34m(cls, start, end, periods, freq, tz, normalize, ambiguous, nonexistent, inclusive)\u001b[0m\n\u001b[1;32m 404\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m com\u001b[38;5;241m.\u001b[39mcount_not_none(start, end, periods, freq) \u001b[38;5;241m!=\u001b[39m \u001b[38;5;241m3\u001b[39m:\n\u001b[1;32m 405\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 406\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mOf the four parameters: start, end, periods, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mand freq, exactly three must be specified\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 408\u001b[0m )\n\u001b[0;32m--> 409\u001b[0m freq \u001b[38;5;241m=\u001b[39m \u001b[43mto_offset\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfreq\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m start \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 412\u001b[0m start \u001b[38;5;241m=\u001b[39m Timestamp(start)\n", - "File \u001b[0;32mpandas/_libs/tslibs/offsets.pyx:3580\u001b[0m, in \u001b[0;36mpandas._libs.tslibs.offsets.to_offset\u001b[0;34m()\u001b[0m\n", - "File \u001b[0;32mpandas/_libs/tslibs/offsets.pyx:3682\u001b[0m, in \u001b[0;36mpandas._libs.tslibs.offsets.to_offset\u001b[0;34m()\u001b[0m\n", - "\u001b[0;31mValueError\u001b[0m: Invalid frequency: NaT" - ] - } - ], + "outputs": [], "source": [ "future = m.make_future_dataframe(df_test)\n", "forecast = m.predict(future)\n", diff --git a/neuralprophet/data/process.py b/neuralprophet/data/process.py index 549c747cd..3e099c83c 100644 --- a/neuralprophet/data/process.py +++ b/neuralprophet/data/process.py @@ -399,7 +399,8 @@ def _check_dataframe( "Dataframe has less than n_forecasts + n_lags rows. " "Forecasting not possible. Please either use a larger dataset, or adjust the model parameters." ) - df, _, _, _ = df_utils.prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = df_utils.check_multiple_series_id(df) df, regressors_to_remove, lag_regressors_to_remove = df_utils.check_dataframe( df=df, check_y=check_y, @@ -474,7 +475,8 @@ def _handle_missing_data( The pre-processed DataFrame, including imputed missing data, if applicable. """ - df, _, _, _ = df_utils.prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = df_utils.check_multiple_series_id(df) if n_lags == 0 and not predicting: # drop rows with NaNs in y and count them diff --git a/neuralprophet/data/transform.py b/neuralprophet/data/transform.py index a7a772f37..e79518af3 100644 --- a/neuralprophet/data/transform.py +++ b/neuralprophet/data/transform.py @@ -24,7 +24,8 @@ def _normalize(df: pd.DataFrame, config_normalization: Normalization) -> pd.Data ------- df: pd.DataFrame, normalized """ - df, _, _, _ = df_utils.prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = df_utils.check_multiple_series_id(df) df_norm = pd.DataFrame() for df_name, df_i in df.groupby("ID"): data_params = config_normalization.get_data_params(df_name) diff --git a/neuralprophet/df_utils.py b/neuralprophet/df_utils.py index 4bcb558cc..0252fdf72 100644 --- a/neuralprophet/df_utils.py +++ b/neuralprophet/df_utils.py @@ -22,7 +22,7 @@ class ShiftScale: scale: float = 1.0 -def prep_or_copy_df(df: pd.DataFrame) -> tuple[pd.DataFrame, bool, bool, list[str]]: +def check_multiple_series_id(df: pd.DataFrame) -> tuple[pd.DataFrame, bool, bool, list[str]]: """Copy df if it contains the ID column. Creates ID column with '__df__' if it is a df with a single time series. Parameters ---------- @@ -42,26 +42,23 @@ def prep_or_copy_df(df: pd.DataFrame) -> tuple[pd.DataFrame, bool, bool, list[st if not isinstance(df, pd.DataFrame): raise ValueError("Provided DataFrame (df) must be of pd.DataFrame type.") - # Create a copy of the dataframe - df_copy = df.copy(deep=True) - - df_has_id_column = "ID" in df_copy.columns + df_has_id_column = "ID" in df.columns # If there is no ID column, then add one with a single value if not df_has_id_column: log.debug("Provided DataFrame (df) contains a single time series.") - df_copy["ID"] = "__df__" - return df_copy, df_has_id_column, True, ["__df__"] + df["ID"] = "__df__" + return df, df_has_id_column, True, ["__df__"] # Create a list of unique ID values - unique_id_values = list(df_copy["ID"].unique()) + unique_id_values = list(df["ID"].unique()) # Check if there is only one unique ID value df_has_single_time_series = len(unique_id_values) == 1 + num_time_series_id = len(unique_id_values) - single_or_multiple_message = "a single" if df_has_single_time_series else "multiple" - log.debug(f"Provided DataFrame (df) has an ID column and contains {single_or_multiple_message} time series.") + log.debug(f"Provided DataFrame (df) has an ID column and contains {num_time_series_id} time series.") - return df_copy, df_has_id_column, df_has_single_time_series, unique_id_values + return df, df_has_id_column, df_has_single_time_series, unique_id_values def return_df_in_original_format(df, received_ID_col=False, received_single_time_series=True): @@ -285,7 +282,8 @@ def init_data_params( ShiftScale entries containing ``shift`` and ``scale`` parameters for each column """ # Compute Global data params - df, _, _, _ = prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = check_multiple_series_id(df) df_merged = df.copy(deep=True).drop("ID", axis=1) global_data_params = data_params_definition( df_merged, normalize, config_lagged_regressors, config_regressors, config_events, config_seasonality @@ -382,6 +380,8 @@ def normalize(df, data_params): """ df = df.copy(deep=True) for name in df.columns: + if name == "ID": + continue if name not in data_params.keys(): raise ValueError(f"Unexpected column {name} in data") new_name = name @@ -428,7 +428,8 @@ def check_dataframe( pd.DataFrame or dict checked dataframe """ - df, _, _, _ = prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = check_multiple_series_id(df) if df.groupby("ID").size().min() < 1: raise ValueError("Dataframe has no rows.") if "ds" not in df: @@ -642,7 +643,9 @@ def _crossvalidation_with_time_threshold(df, n_lags, n_forecasts, k, fold_pct, f min_train = total_samples - samples_fold - (k - 1) * (samples_fold - samples_overlap) assert min_train >= samples_fold folds = [] - df_fold, _, _, _ = prep_or_copy_df(df) + df_fold = df + # df_fold = df.copy(deep=True) + # df_fold, _, _, _ = check_multiple_series_id(df_fold) for i in range(k, 0, -1): threshold_time_stamp = find_time_threshold(df_fold, n_lags, n_forecasts, samples_fold, inputs_overbleed=True) df_train, df_val = split_considering_timestamp( @@ -704,7 +707,8 @@ def crossvalidation_split_df( validation data """ - df, _, _, _ = prep_or_copy_df(df) + # df = df.copy(deep=True) + df, _, _, _ = check_multiple_series_id(df) folds = [] if len(df["ID"].unique()) == 1: for df_name, df_i in df.groupby("ID"): @@ -764,7 +768,8 @@ def double_crossvalidation_split_df(df, n_lags, n_forecasts, k, valid_pct, test_ tuple of k tuples [(folds_val, folds_test), …] elements same as :meth:`crossvalidation_split_df` returns """ - df, _, _, _ = prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = check_multiple_series_id(df) if len(df["ID"].unique()) > 1: raise NotImplementedError("double_crossvalidation_split_df not implemented for df with many time series") fold_pct_test = float(test_pct) / k @@ -885,7 +890,8 @@ def split_df( pd.DataFrame, dict validation data """ - df, _, _, _ = prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = check_multiple_series_id(df) df_train = pd.DataFrame() df_val = pd.DataFrame() if local_split: @@ -1367,7 +1373,8 @@ def infer_frequency(df, freq, n_lags, min_freq_percentage=0.7): Valid frequency tag according to major frequency. """ - df, _, _, _ = prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = check_multiple_series_id(df) freq_df = list() for df_name, df_i in df.groupby("ID"): freq_df.append(_infer_frequency(df_i, freq, min_freq_percentage)) @@ -1410,8 +1417,8 @@ def create_dict_for_events_or_regressors( if other_df is None: # if other_df is None, create dictionary with None for each ID return {df_name: None for df_name in df_names} - - other_df, received_ID_col, _, _ = prep_or_copy_df(other_df) + other_df = other_df.copy(deep=True) + other_df, received_ID_col, _, _ = check_multiple_series_id(other_df) # if other_df does not contain ID, create dictionary with original ID with the same other_df for each ID if not received_ID_col: other_df = other_df.drop("ID", axis=1) diff --git a/neuralprophet/forecaster.py b/neuralprophet/forecaster.py index 6f14e2684..e95a4bcaa 100644 --- a/neuralprophet/forecaster.py +++ b/neuralprophet/forecaster.py @@ -615,7 +615,7 @@ def _create_dataset(self, df, predict_mode, components_stacker=None): ------- TimeDataset """ - df, _, _, _ = df_utils.prep_or_copy_df(df) + # df, _, _, _ = df_utils.check_multiple_series_id(df) return time_dataset.GlobalTimeDataset( df, predict_mode=predict_mode, @@ -1071,7 +1071,8 @@ def fit( ) # Copy df and save list of unique time series IDs (the latter for global-local modelling if enabled) - df, _, _, self.id_list = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, _, _, self.id_list = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=True, exogenous=True) # Infer frequency from data @@ -1187,6 +1188,7 @@ def fit( if not self.fitted: if self.config_trend.changepoints is not None: df_aux = pd.DataFrame({"ds": pd.Series(self.config_trend.changepoints)}) + df_aux["ID"] = "__df__" df_aux = _normalize(df=df_aux, config_normalization=self.config_normalization) self.config_trend.changepoints = df_aux["t"].values @@ -1228,8 +1230,8 @@ def fit( # Set up DataLoaders: Validation validation_enabled = validation_df is not None and isinstance(validation_df, pd.DataFrame) if validation_enabled: - df_val = validation_df - df_val, _, _, _ = df_utils.prep_or_copy_df(df_val) + df_val = validation_df.copy(deep=True) + df_val, _, _, _ = df_utils.check_multiple_series_id(df_val) df_val = _check_dataframe(self, df_val, check_y=False, exogenous=False) df_val = _handle_missing_data( df=df_val, @@ -1243,7 +1245,8 @@ def fit( config_seasonality=self.config_seasonality, predicting=False, ) - # df_val, _, _, _ = df_utils.prep_or_copy_df(df_val) + # df_val = df_val.copy(deep=True) + # df_val, _, _, _ = df_utils.check_multiple_series_id(df_val) df_val = _normalize(df=df_val, config_normalization=self.config_normalization) val_components_stacker = utils_time_dataset.ComponentStacker( n_lags=self.config_ar.n_lags, @@ -1399,7 +1402,8 @@ def predict(self, df: pd.DataFrame, decompose: bool = True, raw: bool = False, a log.warning("Raw forecasts are incompatible with plotting utilities") if self.fitted is False: raise ValueError("Model has not been fitted. Predictions will be random.") - df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(df) # to get all forecasteable values with df given, maybe extend into future: df, periods_added = _maybe_extend_df( df=df, @@ -1463,9 +1467,10 @@ def test(self, df: pd.DataFrame, verbose: bool = True): pd.DataFrame evaluation metrics """ - df, _, _, _ = df_utils.prep_or_copy_df(df) if self.fitted is False: log.warning("Model has not been fitted. Test results will be random.") + df = df.copy(deep=True) + df, _, _, _ = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=True, exogenous=True) freq = df_utils.infer_frequency(df, n_lags=self.config_model.max_lags, freq=self.data_freq) df = _handle_missing_data( @@ -1480,7 +1485,7 @@ def test(self, df: pd.DataFrame, verbose: bool = True): config_seasonality=self.config_seasonality, predicting=False, ) - df, _, _, _ = df_utils.prep_or_copy_df(df) + df, _, _, _ = df_utils.check_multiple_series_id(df) df = _normalize(df=df, config_normalization=self.config_normalization) components_stacker = utils_time_dataset.ComponentStacker( n_lags=self.config_ar.n_lags, @@ -1616,7 +1621,8 @@ def split_df(self, df: pd.DataFrame, freq: str = "auto", valid_p: float = 0.2, l 1 2022-12-13 8.02 data2 2 2022-12-13 8.30 data3 """ - df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=False, exogenous=False) freq = df_utils.infer_frequency(df, n_lags=self.config_model.max_lags, freq=freq) df = _handle_missing_data( @@ -1805,7 +1811,8 @@ def crossvalidation_split_df( 1 2022-12-10 8.25 data2 2 2022-12-10 7.55 data3 """ - df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=False, exogenous=False) freq = df_utils.infer_frequency(df, n_lags=self.config_model.max_lags, freq=freq) df = _handle_missing_data( @@ -1869,7 +1876,8 @@ def double_crossvalidation_split_df( tuple of k tuples [(folds_val, folds_test), …] elements same as :meth:`crossvalidation_split_df` returns """ - df, _, _, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, _, _, _ = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=False, exogenous=False) freq = df_utils.infer_frequency(df, n_lags=self.config_model.max_lags, freq=freq) df = _handle_missing_data( @@ -1915,7 +1923,8 @@ def create_df_with_events(self, df: pd.DataFrame, events_df: pd.DataFrame): "The events configs should be added to the NeuralProphet object (add_events fn)" "before creating the data with events features" ) - df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=True, exogenous=False) df_dict_events = df_utils.create_dict_for_events_or_regressors(df, events_df, "events") df_created = pd.DataFrame() @@ -1991,7 +2000,8 @@ def make_future_dataframe( >>> forecast = m.predict(df=future) """ - df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(df) events_dict = df_utils.create_dict_for_events_or_regressors(df, events_df, "events") regressors_dict = df_utils.create_dict_for_events_or_regressors(df, regressors_df, "regressors") @@ -2073,8 +2083,8 @@ def predict_trend(self, df: pd.DataFrame, quantile: float = 0.5): """ if quantile is not None and not (0 < quantile < 1): raise ValueError("The quantile specified need to be a float in-between (0,1)") - - df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=False, exogenous=False) df = _normalize(df=df, config_normalization=self.config_normalization) df_trend = pd.DataFrame() @@ -2130,7 +2140,8 @@ def predict_seasonal_components(self, df: pd.DataFrame, quantile: float = 0.5): self.config_ar.n_lags = 0 self.config_model.n_forecasts = 1 - df, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(df) df = _check_dataframe(self, df, check_y=False, exogenous=False) df = _normalize(df=df, config_normalization=self.config_normalization) for df_name, df_i in df.groupby("ID"): @@ -2298,7 +2309,8 @@ def plot( ---- None (default): plot self.highlight_forecast_step_n by default """ - fcst, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(fcst) + fcst = fcst.copy(deep=True) + fcst, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(fcst) if not received_single_time_series: if df_name not in fcst["ID"].unique(): assert len(fcst["ID"].unique()) > 1 @@ -2413,7 +2425,8 @@ def get_latest_forecast( """ if self.config_model.max_lags == 0: raise ValueError("Use the standard plot function for models without lags.") - fcst, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(fcst) + fcst = fcst.copy(deep=True) + fcst, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(fcst) if not received_single_time_series: if df_name not in fcst["ID"].unique(): assert len(fcst["ID"].unique()) > 1 @@ -2489,7 +2502,8 @@ def plot_latest_forecast( """ if self.config_model.max_lags == 0: raise ValueError("Use the standard plot function for models without lags.") - fcst, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(fcst) + fcst = fcst.copy(deep=True) + fcst, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(fcst) if not received_single_time_series: if df_name not in fcst["ID"].unique(): assert len(fcst["ID"].unique()) > 1 @@ -2625,7 +2639,8 @@ def plot_components( matplotlib.axes.Axes plot of NeuralProphet components """ - fcst, received_ID_col, received_single_time_series, _ = df_utils.prep_or_copy_df(fcst) + fcst = fcst.copy(deep=True) + fcst, received_ID_col, received_single_time_series, _ = df_utils.check_multiple_series_id(fcst) if not received_single_time_series: if df_name not in fcst["ID"].unique(): assert len(fcst["ID"].unique()) > 1 diff --git a/neuralprophet/time_dataset.py b/neuralprophet/time_dataset.py index 1044d63eb..85116040f 100644 --- a/neuralprophet/time_dataset.py +++ b/neuralprophet/time_dataset.py @@ -45,10 +45,10 @@ def __init__( # Context Notes # Currently done to df before it arrives here: - # -> fit calls prep_or_copy_df, _check_dataframe, and _handle_missing_data, passes to _train - # -> _train calls prep_or_copy_df, then passes to init_train_loader, which returns the train_loader - # -> init_train_loader calls prep_or_copy_df, _normalize, _create_dataset (returns TimeDataset), returns dataset wrapped in DataLoader - # ->_create_dataset calls prep_or_copy_df, then returns GlobalTimeDataset + # -> fit calls copy, check_multiple_series_id, _check_dataframe, and _handle_missing_data, passes to _train + # -> _train calls passes to init_train_loader, which returns the train_loader + # -> init_train_loader calls _normalize, _create_dataset (returns TimeDataset), returns dataset wrapped in DataLoader + # ->_create_dataset returns GlobalTimeDataset # Future TODO: integrate some of these preprocessing steps happening outside? self.df = df.reset_index(drop=True) # Needed for index based operations in __getitem__ diff --git a/tests/test_integration.py b/tests/test_integration.py index 8185f5d2e..0abe17bd3 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -55,6 +55,7 @@ def test_train_eval_test(): learning_rate=LR, ) df = pd.read_csv(PEYTON_FILE, nrows=95) + df, _, _, _ = df_utils.check_multiple_series_id(df) df, _, _ = df_utils.check_dataframe(df, check_y=False) _handle_missing_data( df=df, @@ -78,10 +79,11 @@ def test_train_eval_test(): def test_df_utils_func(): log.info("testing: df_utils Test") df = pd.read_csv(PEYTON_FILE, nrows=95) + # df = df.copy(deep=True) + df, _, _, _ = df_utils.check_multiple_series_id(df) df, _, _ = df_utils.check_dataframe(df, check_y=False) # test find_time_threshold - df, _, _, _ = df_utils.prep_or_copy_df(df) time_threshold = df_utils.find_time_threshold(df, n_lags=2, n_forecasts=2, valid_p=0.2, inputs_overbleed=True) df_train, df_val = df_utils.split_considering_timestamp( df, n_lags=2, n_forecasts=2, inputs_overbleed=True, threshold_time_stamp=time_threshold diff --git a/tests/test_regularization.py b/tests/test_regularization.py index 6d75e0b40..d96464c3b 100644 --- a/tests/test_regularization.py +++ b/tests/test_regularization.py @@ -57,6 +57,7 @@ def test_reg_func_abs(): def test_regularization_holidays(): log.info("testing: regularization of holidays") df = generate_holiday_dataset(y_holidays_override=Y_HOLIDAYS_OVERRIDE) + df, _, _, _ = df_utils.check_multiple_series_id(df) df, _, _ = df_utils.check_dataframe(df, check_y=False) m = NeuralProphet( @@ -92,6 +93,7 @@ def test_regularization_holidays(): def test_regularization_events(): log.info("testing: regularization of events") df, events = generate_event_dataset(y_events_override=Y_EVENTS_OVERRIDE) + df, _, _, id_list = df_utils.check_multiple_series_id(df) df, _, _ = df_utils.check_dataframe(df, check_y=False) m = NeuralProphet( @@ -147,6 +149,7 @@ def test_regularization_lagged_regressor(): """ log.info("testing: regularization lagged regressors") df, lagged_regressors = generate_lagged_regressor_dataset(periods=100) + df, _, _, id_list = df_utils.check_multiple_series_id(df) df, _, _ = df_utils.check_dataframe(df, check_y=False) m = NeuralProphet( diff --git a/tests/test_unit.py b/tests/test_unit.py index a273f0acd..7f42b1495 100644 --- a/tests/test_unit.py +++ b/tests/test_unit.py @@ -79,9 +79,11 @@ def test_timedataset_minimal(): config_model.set_max_num_lags(n_lags) config_missing = configure.MissingDataHandling() # config_train = configure.Train() + df_in, _, _, _ = df_utils.check_multiple_series_id(df_in) df, df_val = df_utils.split_df(df_in, n_lags, n_forecasts, valid_p) # create a tabularized dataset from time series - df, _, _, _ = df_utils.prep_or_copy_df(df) + # df = df.copy(deep=True) + # df, _, _, _ = df_utils.check_multiple_series_id(df) df, _, _ = df_utils.check_dataframe(df) df = _handle_missing_data( df, @@ -135,10 +137,7 @@ def test_timedataset_minimal(): def test_normalize(): length = 100 days = pd.date_range(start="2017-01-01", periods=length) - y = np.ones(length) - y[1] = 0 - y[2] = 2 - y[3] = 3.3 + y = np.arange(length) df = pd.DataFrame({"ds": days, "y": y}) m = NeuralProphet( epochs=EPOCHS, @@ -146,7 +145,8 @@ def test_normalize(): learning_rate=LR, normalize="soft", ) - df, _, _, _ = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, _, _, _ = df_utils.check_multiple_series_id(df) # with config m.config_normalization.init_data_params(df, m.config_lagged_regressors, m.config_regressors, m.config_events) @@ -155,11 +155,28 @@ def test_normalize(): m.config_normalization.unknown_data_normalization = True _normalize(df=df, config_normalization=m.config_normalization) m.config_normalization.unknown_data_normalization = False + # using config for utils df = df.drop("ID", axis=1) df_utils.normalize(df, m.config_normalization.global_data_params) df_utils.normalize(df, m.config_normalization.local_data_params["__df__"]) + +def test_normalize_utils(): + length = 100 + days = pd.date_range(start="2017-01-01", periods=length) + y = np.arange(length) + df = pd.DataFrame({"ds": days, "y": y}) + m = NeuralProphet( + epochs=EPOCHS, + batch_size=BATCH_SIZE, + learning_rate=LR, + normalize="soft", + ) + df, _, _, _ = df_utils.check_multiple_series_id(df) + + # m.config_normalization.unknown_data_normalization = True + # with utils local_data_params, global_data_params = df_utils.init_data_params( df=df, @@ -170,8 +187,10 @@ def test_normalize(): global_normalization=m.config_normalization.global_normalization, global_time_normalization=m.config_normalization.global_time_normalization, ) - df_utils.normalize(df, global_data_params) - df_utils.normalize(df, local_data_params["__df__"]) + log.error(local_data_params) + log.error(global_data_params) + df_utils.normalize(df.copy(deep=True), global_data_params) + df_utils.normalize(df.copy(deep=True), local_data_params["__df__"]) def test_add_lagged_regressors(): @@ -250,6 +269,7 @@ def check_split(df_in, df_len_expected, n_lags, n_forecasts, freq, p=0.1): n_lags=n_lags, n_forecasts=n_forecasts, ) + df_in, _, _, _ = df_utils.check_multiple_series_id(df_in) df_in, _, _ = df_utils.check_dataframe(df_in, check_y=False) df_in = _handle_missing_data( df=df_in, @@ -297,6 +317,7 @@ def check_split(df_in, df_len_expected, n_lags, n_forecasts, freq, p=0.1): def test_cv(): def check_folds(df, n_lags, n_forecasts, valid_fold_num, valid_fold_pct, fold_overlap_pct): + df, _, _, _ = df_utils.check_multiple_series_id(df) folds = df_utils.crossvalidation_split_df( df, n_lags, n_forecasts, valid_fold_num, valid_fold_pct, fold_overlap_pct ) @@ -318,8 +339,9 @@ def check_folds(df, n_lags, n_forecasts, valid_fold_num, valid_fold_pct, fold_ov assert all([x == y for (x, y) in zip(train_folds_samples, train_folds_should)]) len_df = 100 + df = pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df)}) check_folds( - df=pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df)}), + df=df, n_lags=0, n_forecasts=1, valid_fold_num=3, @@ -327,8 +349,9 @@ def check_folds(df, n_lags, n_forecasts, valid_fold_num, valid_fold_pct, fold_ov fold_overlap_pct=0.0, ) len_df = 1000 + df = pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df)}) check_folds( - df=pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df)}), + df=df, n_lags=50, n_forecasts=10, valid_fold_num=10, @@ -342,6 +365,7 @@ def check_folds_dict( df, n_lags, n_forecasts, valid_fold_num, valid_fold_pct, fold_overlap_pct, global_model_cv_type="local" ): "Does not work with global_model_cv_type == global-time or global_model_cv_type is None" + df, _, _, _ = df_utils.check_multiple_series_id(df) folds = df_utils.crossvalidation_split_df( df, n_lags, @@ -502,8 +526,9 @@ def test_reg_delay(): def test_double_crossvalidation(): len_df = 100 + df = pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df), "ID": "__df__"}) folds_val, folds_test = df_utils.double_crossvalidation_split_df( - df=pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df)}), + df=df, n_lags=0, n_forecasts=1, k=3, @@ -531,8 +556,10 @@ def test_double_crossvalidation(): learning_rate=LR, n_lags=2, ) + len_df = 100 + df = pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df), "ID": "__df__"}) folds_val, folds_test = m.double_crossvalidation_split_df( - df=pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df)}), + df=df, k=3, valid_pct=0.3, test_pct=0.15, @@ -554,7 +581,10 @@ def test_double_crossvalidation(): # Raise not implemented error as double_crossvalidation is not compatible with many time series with pytest.raises(NotImplementedError): - df = pd.DataFrame({"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df)}) + len_df = 100 + df = pd.DataFrame( + {"ds": pd.date_range(start="2017-01-01", periods=len_df), "y": np.arange(len_df), "ID": "__df__"} + ) df1 = df.copy(deep=True) df1["ID"] = "df1" df2 = df.copy(deep=True) @@ -891,6 +921,7 @@ def test_too_many_NaN(): limit_linear=config_missing.impute_linear, rolling=config_missing.impute_rolling, ) + df, _, _, id_list = df_utils.check_multiple_series_id(df) df, _, _ = df_utils.check_dataframe(df) local_data_params, global_data_params = df_utils.init_data_params(df=df, normalize="minmax") df = df.drop("ID", axis=1) diff --git a/tests/utils/benchmark_time_dataset.py b/tests/utils/benchmark_time_dataset.py index af2dda090..f20f30e7e 100644 --- a/tests/utils/benchmark_time_dataset.py +++ b/tests/utils/benchmark_time_dataset.py @@ -70,8 +70,8 @@ def load(nrows=NROWS, epochs=EPOCHS, batch=BATCH_SIZE, season=True, iterations=1 ) # Mimick m.fit(df) behavior - - df, _, _, m.id_list = df_utils.prep_or_copy_df(df) + df = df.copy(deep=True) + df, _, _, m.id_list = df_utils.check_multiple_series_id(df) df = _check_dataframe(m, df, check_y=True, exogenous=True) m.data_freq = df_utils.infer_frequency(df, n_lags=m.config_model.max_lags, freq=freq) df = _handle_missing_data(