Merge remote-tracking branch 'origin/developer' into calculate_go_enrichment-improvement
jpquast · Oct 29, 2024 · fd9985e · fd9985e
2 parents f04a19d + ab3bda4
commit fd9985e
Show file tree

Hide file tree

Showing 2 changed files with 41 additions and 1 deletion.
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # protti 0.9.1.9000
 
+## Bug fixes
+
+* Fixed issue #193. Information in retained columns is now propagated to newly created combinations that were not present in the original data.
+
 ## Additional Changes
 
 * `assign_peptide_type` now takes the `start` argument, containing the start position of a peptide. If a protein does not have any peptide starting at position `1` and there is a peptide starting at position `2`, this peptide will be considered "tryptic" at the N-terminus. This is because the initial Methionine is likely missing due to processing for every copy of the protein and therefore position `2` is the true N-terminus.

diff --git a/R/assign_missingness.R b/R/assign_missingness.R
@@ -217,7 +217,43 @@ from the conditions and assigned their missingness. The created comparisons are:
       dplyr::distinct() %>%
       dplyr::right_join(result, by = colnames(result)[!colnames(result) %in% c("comparison", "missingness")]) %>%
       # Arrange by grouping but in a numeric order of the character vector.
-      dplyr::arrange(factor({{ grouping }}, levels = unique(stringr::str_sort({{ grouping }}, numeric = TRUE))))
+      dplyr::arrange(factor({{ grouping }}, levels = unique(stringr::str_sort({{ grouping }}, numeric = TRUE)))) %>%
+      # propagation of consistent values to NA places
+      dplyr::group_by({{ grouping }}) %>%
+      dplyr::mutate(dplyr::across(!!enquo(retain_columns), ~ {
+        # Check if all non-NA values are the same
+        if (any(is.na(.x)) & dplyr::n_distinct(na.omit(.x)) == 1 & !any(is.na(.x) & !is.na({{ intensity }}))) {
+          # Replace NA with the consistent value
+          tidyr::replace_na(.x, unique(na.omit(.x)))
+        } else {
+          # Leave as is
+          .x
+        }
+      })) %>%
+      dplyr::ungroup()
+
+    # Annotate sample-related retained columns. These have a unique value for every sample.
+    # Above, we annotated the columns that have a consistent value within every group; here, the remaining (inconsistent) ones are annotated.
+
+    sample_annotations <- join_result %>%
+      dplyr::select(!!enquo(retain_columns), {{ intensity }}, {{ sample }}) %>%
+      dplyr::select(
+        dplyr::where(~ !any(is.na(.x) & !is.na(dplyr::pull(join_result, {{ intensity }}))) & any(is.na(.x))),
+        {{ sample }},
+        -{{ intensity }}
+      ) %>%
+      tidyr::drop_na() %>%
+      dplyr::distinct() %>%
+      dplyr::group_by({{ sample }}) %>%
+      # drop the columns that contain multiple values per group
+      # grouping doesn't work with selection so first we need to find the columns with the non-distinct values with the summary below
+      dplyr::summarise(dplyr::across(dplyr::everything(), ~ if (dplyr::n_distinct(.x) == 1) dplyr::first(.x) else NA), .groups = "drop") %>%
+      dplyr::select(-dplyr::where(~ any(is.na(.x)))) %>%
+      dplyr::ungroup() %>%
+      dplyr::distinct()
+
+    join_result <- join_result %>%
+      dplyr::rows_update(sample_annotations, by = rlang::as_name(rlang::enquo(sample)))
 
     return(join_result)
   }