biomarkersParkinson · KarsVeldkamp · Jan 14, 2025 · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024
diff --git a/docs/notebooks/ppg/ppg_analysis.ipynb b/docs/notebooks/ppg/ppg_analysis.ipynb
@@ -13,6 +13,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import importlib.resources\n",
+    "import pickle\n",
     "from pathlib import Path\n",
     "\n",
     "from paradigma.config import PPGConfig, IMUConfig, SignalQualityFeatureExtractionConfig, SignalQualityFeatureExtractionAccConfig, SignalQualityClassificationConfig, HeartRateExtractionConfig\n",
@@ -67,8 +69,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_ppg, metadata_time, metadata_values = load_tsdf_dataframe(path_to_prepared_data / ppg_prefix, prefix = ppg_prefix)\n",
-    "df_imu, metadata_time, metadata_values = load_tsdf_dataframe(path_to_prepared_data / imu_prefix, prefix = imu_prefix)"
+    "df_ppg, _, _ = load_tsdf_dataframe(path_to_prepared_data / ppg_prefix, prefix = ppg_prefix)\n",
+    "df_imu, _, _ = load_tsdf_dataframe(path_to_prepared_data / imu_prefix, prefix = imu_prefix)"
    ]
   },
   {
@@ -121,8 +123,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "full_path_to_classifier_package = path_to_assets / ppg_classifier_package_filename\n",
+    "with importlib.resources.files('paradigma.assets').joinpath(ppg_classifier_package_filename).open('rb') as f:\n",
+    "    clf_package = pickle.load(f)\n",
     "config = SignalQualityClassificationConfig()\n",
-    "df_sqa = signal_quality_classification(df_features, config, path_to_classifier)"
+    "df_sqa = signal_quality_classification(df_features, config, clf_package)"
    ]
   },
   {

diff --git a/docs/tutorials/heart_rate_analysis.ipynb b/docs/tutorials/heart_rate_analysis.ipynb
@@ -11,21 +11,21 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "This tutorial shows how to extract heart rate estimates using photoplethysmography (PPG) data and accelerometer data. The pipeline consists of a stepwise approach ot determine signal quality, assessing both PPG morphology and accounting for periodic artifacts using the accelerometer. Based on the signal quality, we extract high-quality segments and estimate the heart rate for every 2 s using the smoothed pseudo Wigner-Ville Distribution. "
+    "This tutorial shows how to extract heart rate estimates using photoplethysmography (PPG) data and accelerometer data. The pipeline consists of a stepwise approach to determine signal quality, assessing both PPG morphology and accounting for periodic artifacts using the accelerometer. Based on the signal quality, we extract high-quality segments and estimate the heart rate for every 2 s using the smoothed pseudo Wigner-Ville Distribution. "
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Load example data"
+    "## Load data"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Example PPG + accelerometer (from IMU files) data from a participant of the Personalized Parkinson Project is loaded. The data was prepared as explained in `data_preparation.ipynb`. The prepared IMU data contains both accelerometer and gyroscope data, but only accelerometer is used in this pipeline alongside the PPG data. We load the corresponding dataframes using the `load_tsdf_dataframe function`."
+    "This pipeline requires accelerometer and PPG data to run. In this example, we use data from a participant of the Personalized Parkinson Project. We load the corresponding dataframes using the `load_tsdf_dataframe` function. The channel `green` represents the values obtained with PPG using green light."
    ]
   },
   {
@@ -329,8 +329,8 @@
     "ppg_prefix = 'PPG'\n",
     "imu_prefix = 'IMU'\n",
     "\n",
-    "df_ppg, metadata_time, metadata_values = load_tsdf_dataframe(path_to_prepared_data / ppg_prefix, prefix = ppg_prefix)\n",
-    "df_imu, metadata_time, metadata_values = load_tsdf_dataframe(path_to_prepared_data / imu_prefix, prefix = imu_prefix)\n",
+    "df_ppg, _, _ = load_tsdf_dataframe(path_to_prepared_data / ppg_prefix, prefix = ppg_prefix)\n",
+    "df_imu, _, _ = load_tsdf_dataframe(path_to_prepared_data / imu_prefix, prefix = imu_prefix)\n",
     "\n",
     "display(df_ppg, df_imu)"
    ]
@@ -346,7 +346,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The first step after loading the data is to preprocess the data. Preprocessing starts by extracting the data where there is both PPG and IMU data. In this way, we discard the the first or last part of the segment when e.g. the PPG segment is recorded longer than the accelerometer. After this step, the preprocess_ppg_data function  resamples the values of both PPG and accelerometer data using uniformly distributed timestamps, since the sampling rate of both sensors is fixed but not uniform. After this, a bandpass filter (butterworth, 4th-order, cut-off frequencies: [0.4, 3.5]) is applied to the PPG signal and a high-pass (butterworth, 4th-order, cut-off: 0.2 Hz) filter is applied to the accelerometer. "
+    "The first step after loading the data is preprocessing. This begins by isolating segments containing both PPG and IMU data, discarding portions where one modality (e.g., PPG) extends beyond the other, such as when the PPG recording is longer than the accelerometer data. After this step, the `preprocess_ppg_data` function resamples the PPG and accelerometer data to uniformly distributed timestamps, addressing the fixed but non-uniform sampling rates of the sensors. After this, a bandpass Butterworth filter (4th-order, bandpass frequencies: 0.4–3.5 Hz) is applied to the PPG signal, while a high-pass Butterworth filter (4th-order, cut-off frequency: 0.2 Hz) is applied to the accelerometer data.\n",
+    "\n",
+    "Note: the printed shapes are given as (rows, columns), i.e. the number of data points and the number of data columns. The number of rows in the overlapping segments of PPG and accelerometer data is not exactly the same due to sampling differences (different sensors and possibly different sampling frequencies). "
    ]
   },
   {
@@ -358,8 +360,12 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Shape of the original data: (64775, 2) (72947, 4)\n",
-      "Shape of the overlapping segments: (64775, 2) (64361, 4)\n"
+      "Original data shapes:\n",
+      "- PPG data: (64775, 2)\n",
+      "- Accelerometer data: (72947, 4)\n",
+      "Overlapping data shapes:\n",
+      "- PPG data: (64775, 2)\n",
+      "- Accelerometer data: (64361, 4)\n"
      ]
     },
     {
@@ -678,6 +684,14 @@
    "execution_count": null,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The default window length for the signal quality feature extraction is set to 6 seconds.\n",
+      "The default step size for the signal quality feature extraction is set to 1 seconds.\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -927,17 +941,18 @@
        "[639 rows x 12 columns]"
       ]
      },
-     "execution_count": 27,
+     "execution_count": 3,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "from paradigma.config import SignalQualityFeatureExtractionConfig, SignalQualityFeatureExtractionAccConfig\n",
     "from paradigma.heart_rate.heart_rate_analysis import extract_signal_quality_features\n",
-    "\n",
     "ppg_config = SignalQualityFeatureExtractionConfig()\n",
     "acc_config = SignalQualityFeatureExtractionAccConfig()\n",
+    "print(\"The default window length for the signal quality feature extraction is set to\", ppg_config.window_length_s, \"seconds.\")\n",
+    "print(\"The default step size for the signal quality feature extraction is set to\", ppg_config.window_step_length_s, \"seconds.\")\n",
     "df_features = extract_signal_quality_features(ppg_config, df_ppg_proc, acc_config, df_acc_proc)\n",
     "df_features\n"
    ]
@@ -953,7 +968,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "A trained logistic classifier is used to classify PPG signal quality and returns the `pred_sqa_proba`, which is the posterior probability of a PPG window to look like the typical PPG morphology (higher probability indicates toward the typical PPG morphology). The relative power feature from the accelerometer is compared to a threshold for periodic artifacts and therefore `pred_sqa_acc_label` returns a label indicating probably periodic motion artifacts (label 0) or no periodic motion artifacts (label 1). "
+    "A trained logistic classifier is used to predict PPG signal quality and returns `pred_sqa_proba`, the posterior probability that a PPG window resembles the typical PPG morphology (a higher probability indicates a closer resemblance to the typical PPG morphology). The relative power feature from the accelerometer is compared to a threshold for periodic artifacts; the resulting `pred_sqa_acc_label` is a label indicating predicted periodic motion artifacts (label 0) or no periodic motion artifacts (label 1). "
    ]
   },
   {
@@ -1064,7 +1079,7 @@
        "[639 rows x 2 columns]"
       ]
      },
-     "execution_count": 28,
+     "execution_count": 4,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1090,14 +1105,21 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "For heart rate estimation, we extract segments of `config.tfd_length`. Using a 2-step approach, we calculate the smoothed-pseudo Wigner-Ville Distribution to obtain the frequency content of the PPG signal over time. For every 2-second window, we identified the frequency with the highest power for each data point and assigned the average of these frequency as the heart rate in that 2-second window. "
+    "For heart rate estimation, we extract segments of `config.tfd_length`. We calculate the smoothed pseudo Wigner-Ville Distribution (SPWVD) to obtain the frequency content of the PPG signal over time. For every timestamp in the SPWVD, we extract the frequency with the highest power. For every non-overlapping 2-second window, we average the corresponding frequencies to obtain a heart rate per window. "
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The default minimal window length for the heart rate extraction is set to 10 seconds.\n"
+     ]
+    },
     {
      "data": {
       "text/html": [
@@ -1119,7 +1141,7 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>rel_time</th>\n",
+       "      <th>time</th>\n",
        "      <th>heart_rate</th>\n",
        "    </tr>\n",
        "  </thead>\n",
@@ -1159,16 +1181,16 @@
        "</div>"
       ],
       "text/plain": [
-       "   rel_time  heart_rate\n",
-       "0      56.0   86.404715\n",
-       "1      58.0   86.640472\n",
-       "2      60.0   86.345776\n",
-       "3      62.0   84.872299\n",
-       "4      64.0   84.872299\n",
-       "5      66.0   84.194499"
+       "   time  heart_rate\n",
+       "0  56.0   86.404715\n",
+       "1  58.0   86.640472\n",
+       "2  60.0   86.345776\n",
+       "3  62.0   84.872299\n",
+       "4  64.0   84.872299\n",
+       "5  66.0   84.194499"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1178,6 +1200,7 @@
     "from paradigma.heart_rate.heart_rate_analysis import estimate_heart_rate\n",
     "\n",
     "config = HeartRateExtractionConfig()\n",
+    "print(\"The default minimal window length for the heart rate extraction is set to\", config.tfd_length, \"seconds.\")\n",
     "df_hr = estimate_heart_rate(df_sqa, df_ppg_proc, config)\n",
     "df_hr"
    ]
@@ -1193,7 +1216,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "The final step is to aggregate all 2-second heart rate estimates. In the current example, the mode and 99th percentile are calculated. The mode represent the resting heart rate while the 99th percentile indicates the maximum heart rate. In Parkinson's disease, we expect that these two measures could reflect autonomic (dys)functioning."
+    "The final step is to aggregate all 2-second heart rate estimates. In the current example, the mode and 99th percentile are calculated. We hypothesize that the mode gives a representation of the resting heart rate while the 99th percentile indicates the maximum heart rate. In Parkinson's disease, we expect that these two measures could reflect autonomic (dys)functioning. The `nr_hr_est` field in the metadata indicates the number of 2-second windows on which these aggregates are based."
    ]
   },
   {

diff --git a/src/paradigma/heart_rate/heart_rate_analysis.py b/src/paradigma/heart_rate/heart_rate_analysis.py
@@ -249,7 +249,7 @@ def estimate_heart_rate(df_sqa: pd.DataFrame, df_ppg_preprocessed: pd.DataFrame,
         t_hr_rel[hr_pos:hr_pos + n_hr] = hr_time
         hr_pos += n_hr
 
-    df_hr = pd.DataFrame({"rel_time": t_hr_rel, "heart_rate": v_hr_rel})
+    df_hr = pd.DataFrame({"time": t_hr_rel, "heart_rate": v_hr_rel})
 
     return df_hr
 

diff --git a/src/paradigma/preprocessing.py b/src/paradigma/preprocessing.py
@@ -244,7 +244,7 @@ def preprocess_imu_data_io(path_to_input: str | Path, path_to_output: str | Path
             write_df_data(metadata_time, metadata_values, path_to_output, f'{sensor}_meta.json', df_sensor)
 
 
-def scan_and_sync_segments(input_path_ppg, input_path_imu):
+def scan_and_sync_segments(input_path_ppg: str | Path, input_path_imu: str | Path) -> Tuple[List[tsdf.TSDFMetadata], List[tsdf.TSDFMetadata]]:
     """
     Scan for available TSDF metadata files in the specified directories and synchronize the data segments based on the metadata start and end times.
 
@@ -309,9 +309,9 @@ def preprocess_ppg_data(df_ppg: pd.DataFrame, df_imu: pd.DataFrame, ppg_config:
     df_acc = df_imu.drop(cols_to_drop, axis=1)
 
     # Extract overlapping segments
-    print("Shape of the original data:", df_ppg.shape, df_acc.shape)
+    print(f"Original data shapes:\n- PPG data: {df_ppg.shape}\n- Accelerometer data: {df_acc.shape}")
     df_ppg_overlapping, df_acc_overlapping = extract_overlapping_segments(df_ppg, df_acc)
-    print("Shape of the overlapping segments:", df_ppg_overlapping.shape, df_acc_overlapping.shape)
+    print(f"Overlapping data shapes:\n- PPG data: {df_ppg_overlapping.shape}\n- Accelerometer data: {df_acc_overlapping.shape}")
 
     # Resample accelerometer data
     df_acc_proc = resample_data(
@@ -372,12 +372,10 @@ def preprocess_ppg_data_io(tsdf_meta_ppg: tsdf.TSDFMetadata, tsdf_meta_imu: tsdf
         Metadata for the IMU data.
     output_path : Union[str, Path]
         Path to store the preprocessed data.
-    ppg_config : PPGPreprocessingConfig
+    ppg_config : PPGConfig
         Configuration object for PPG preprocessing.
-    imu_config : IMUPreprocessingConfig
+    imu_config : IMUConfig
         Configuration object for IMU preprocessing.
-    sensor: str
-        Name of the sensor data to be preprocessed. 
 
     Returns
     -------