From 3a466e46160ac85baed4298f7e2cb5f872e7c4da Mon Sep 17 00:00:00 2001
From: Corey Broeckling <cbroeckl@users.noreply.github.com>
Date: Thu, 9 Nov 2023 08:47:55 -0700
Subject: [PATCH] add function to merge split clusters

---
 NAMESPACE                      |   1 +
 R/rc.merge.clusters.R          | 102 +++++++++++++++++++++++++++++++++
 man/rc.merge.split.clusters.Rd |  40 +++++++++++++
 3 files changed, 143 insertions(+)
 create mode 100644 R/rc.merge.clusters.R
 create mode 100644 man/rc.merge.split.clusters.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 47fae67..493a56f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -38,6 +38,7 @@ export(rc.feature.replace.na)
 export(rc.get.csv.data)
 export(rc.get.df.data)
 export(rc.get.xcms.data)
+export(rc.merge.split.clusters)
 export(rc.qc)
 export(rc.ramclustr)
 export(rc.remove.qc)
diff --git a/R/rc.merge.clusters.R b/R/rc.merge.clusters.R
new file mode 100644
index 0000000..1a16f6a
--- /dev/null
+++ b/R/rc.merge.clusters.R
@@ -0,0 +1,102 @@
+#' rc.merge.split.clusters
+#'
+#' Cluster refinement - scanning instruments (quadrupole, as in GC-MS) can display cluster splitting, possibily due to slight differences in measured peak retentiont time as a function of mass due to scan dynamics.  this function enables a second pass clustering designed to merge two clusters if the second cluster is within a small retention time window and shows a sufficiently strong correlation.  
+#'
+#' @param ramclustObj ramclustR object to annotate. 
+#' @param merge.threshold numeric. value between -1 and 1 indicating the correlational r threshold above which two clusters will be merged
+#' @param cor.method character.  default = 'spearman'.  correlational method to use for calculating r.  see documentation on R base cor() function for available options
+#' @param rt.sd.factor numeric.  default = 3.  clusters within rt.sd.factor * ramclustObj$rtsd (cluster retention time standard deviation) are considered for merging.
+#' @details exports files to a directory called 'spectra'.  If one.file = FALSE, a new directory 'spectra/msp' is created to hold the individual msp files. if do.findman has been run, spectra are written as ms2 spectra, else as ms1. 
+#' @return new ramclustR object, with (generally) fewer clusters than the input ramclustR object.
+#' @concept ramclustR
+#' @concept RAMClustR
+#' @concept metabolomics
+#' @concept mass spectrometry
+#' @concept clustering
+#' @concept xcms
+#' @author Corey Broeckling
+#' @importFrom methods is
+#' @export 
+#' 
+#' 
+rc.merge.split.clusters <- function(
+    ramclustObj = NULL,
+    merge.threshold = 0.7,
+    cor.method = 'spearman',
+    rt.sd.factor = 3
+    
+) {
+  
+  if(is.null(ramclustObj)) stop('please provide a ramclustObj', '\n')
+  if(!is.numeric(merge.threshold)) merge.threshold <- as.numeric(merge.threshold)
+  if(merge.threshold < 0 | merge.threshold > 1) stop("'merge.threshold' must be between zero and one", '\n')
+  
+  
+  ## for all clusters, see if there are clusters at nearby retentiontimes which highly correlate
+  ## if there are, join them, renumber featclus, and regenerate SpecAbund file.  
+  orig.cl.n <- max(ramclustObj$featclus)
+  cls <- max(ramclustObj$featclus):2
+  # ramclustObj <- RC
+  for(i in cls) {
+    if(max(diff(sort(unique(ramclustObj$featclus)))) > 1) stop("error 1")
+    potential.merges <- which(
+      (abs(ramclustObj$clrt[1:i] - ramclustObj$clrt[i])) < (rt.sd.factor*ramclustObj$clrtsd[i])
+    )
+    potential.merges <- potential.merges[!potential.merges == i]
+    if(length(potential.merges) == 0) next
+    
+    rval <- cor(ramclustObj$SpecAbund[,i], ramclustObj$SpecAbund[,potential.merges], method = cor.method)
+    merges <- potential.merges[which(rval >= merge.threshold)]
+    if(length(merges) == 0) next
+    
+    merges <- max(merges)
+    
+    ramclustObj$featclus[which(ramclustObj$featclus == i)] <- merges
+    # if(max(diff(sort(unique(ramclustObj$featclus)))) > 1) cat(i, "max diff = ", max(diff(sort(unique(ramclustObj$featclus)))), '\n')
+    old.featclus <- ramclustObj$featclus
+    new.featclus <- old.featclus
+    decend.by.one <- which(old.featclus > i)
+    new.featclus[decend.by.one] <- (old.featclus[decend.by.one])-1
+    if(max(diff(sort(unique(new.featclus)))) > 1) stop("error 2")
+    ramclustObj$featclus <- new.featclus
+    
+  }
+  # sort(unique(ramclustObj$featclus))
+  # sort(unique(RC$featclus))
+  
+  ## store SpecAbund sample names
+  sa.rn <- dimnames(ramclustObj$SpecAbund)[[1]]
+  
+  # 0.99888
+  
+  # collapse feature dataset into spectrum dataset
+  wts<-colSums(ramclustObj$MSdata)
+  ramclustObj$SpecAbund<-matrix(nrow=nrow(ramclustObj$SpecAbund), ncol=max(ramclustObj$featclus))
+  for (ro in 1:nrow(ramclustObj$SpecAbund)) { 
+    for (co in 1:ncol(ramclustObj$SpecAbund)) {
+      ramclustObj$SpecAbund[ro,co]<- weighted.mean(ramclustObj$MSdata[ro,which(ramclustObj$featclus==co)], wts[which(ramclustObj$featclus==co)])
+    }
+  }
+  dimnames(ramclustObj$SpecAbund)[[1]] <- sa.rn
+  
+  strl <- nchar(max(ramclustObj$featclus)) - 1
+  ramclustObj$cmpd <- paste("C", formatC(1:max(ramclustObj$featclus), digits = strl, flag = 0 ) , sep="")
+  ramclustObj$ann <- ramclustObj$cmpd
+  
+  clrt<-aggregate(ramclustObj$frt, by=list(ramclustObj$featclus), FUN="mean")
+  ramclustObj$clrt<-clrt[which(clrt[,1]!=0),2]
+  clrtsd<-aggregate(ramclustObj$frt, by=list(ramclustObj$featclus), FUN="sd")
+  ramclustObj$clrtsd<-clrtsd[which(clrtsd[,1]!=0),2]
+  ramclustObj$nfeat<-as.vector(table(ramclustObj$featclus)[2:max(ramclustObj$featclus)])
+  ramclustObj$nsing<-length(which(ramclustObj$featclus==0))
+  ramclustObj$annconf<-rep(4, length(ramclustObj$clrt))
+  ramclustObj$annnotes<-rep("", length(ramclustObj$clrt))
+  dimnames(ramclustObj$SpecAbund)[[1]] <- ramclustObj$sample_names
+  
+  new.cl.n <- max(ramclustObj$featclus)
+  cat(paste("Original cluster number =", orig.cl.n, '\n', "New cluster number =", new.cl.n, '\n'))
+  
+  return(ramclustObj)
+  
+}
+
diff --git a/man/rc.merge.split.clusters.Rd b/man/rc.merge.split.clusters.Rd
new file mode 100644
index 0000000..f97739d
--- /dev/null
+++ b/man/rc.merge.split.clusters.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/rc.merge.clusters.R
+\name{rc.merge.split.clusters}
+\alias{rc.merge.split.clusters}
+\title{rc.merge.split.clusters}
+\usage{
+rc.merge.split.clusters(
+  ramclustObj = NULL,
+  merge.threshold = 0.7,
+  cor.method = "spearman",
+  rt.sd.factor = 3
+)
+}
+\arguments{
+\item{ramclustObj}{ramclustR object to annotate.}
+
+\item{merge.threshold}{numeric. value between -1 and 1 indicating the correlational r threshold above which two clusters will be merged}
+
+\item{cor.method}{character.  default = 'spearman'.  correlational method to use for calculating r.  see documentation on R base cor() function for available options}
+
+\item{rt.sd.factor}{numeric.  default = 3.  clusters within rt.sd.factor * ramclustObj$rtsd (cluster retention time standard deviation) are considered for merging.}
+}
+\value{
+new ramclustR object, with (generally) fewer clusters than the input ramclustR object.
+}
+\description{
+Cluster refinement - scanning instruments (quadrupole, as in GC-MS) can display cluster splitting, possibily due to slight differences in measured peak retentiont time as a function of mass due to scan dynamics.  this function enables a second pass clustering designed to merge two clusters if the second cluster is within a small retention time window and shows a sufficiently strong correlation.
+}
+\details{
+exports files to a directory called 'spectra'.  If one.file = FALSE, a new directory 'spectra/msp' is created to hold the individual msp files. if do.findman has been run, spectra are written as ms2 spectra, else as ms1.
+}
+\author{
+Corey Broeckling
+}
+\concept{RAMClustR}
+\concept{clustering}
+\concept{mass spectrometry}
+\concept{metabolomics}
+\concept{ramclustR}
+\concept{xcms}