Skip to content

Commit

Permalink
add function to merge split clusters
Browse files Browse the repository at this point in the history
  • Loading branch information
cbroeckl committed Nov 9, 2023
1 parent 49434aa commit 3a466e4
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 0 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ export(rc.feature.replace.na)
export(rc.get.csv.data)
export(rc.get.df.data)
export(rc.get.xcms.data)
export(rc.merge.split.clusters)
export(rc.qc)
export(rc.ramclustr)
export(rc.remove.qc)
Expand Down
102 changes: 102 additions & 0 deletions R/rc.merge.clusters.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
#' rc.merge.split.clusters
#'
#' Cluster refinement - scanning instruments (quadrupole, as in GC-MS) can display cluster splitting, possibily due to slight differences in measured peak retentiont time as a function of mass due to scan dynamics. this function enables a second pass clustering designed to merge two clusters if the second cluster is within a small retention time window and shows a sufficiently strong correlation.
#'
#' @param ramclustObj ramclustR object to annotate.
#' @param merge.threshold numeric. value between -1 and 1 indicating the correlational r threshold above which two clusters will be merged
#' @param cor.method character. default = 'spearman'. correlational method to use for calculating r. see documentation on R base cor() function for available options
#' @param rt.sd.factor numeric. default = 3. clusters within rt.sd.factor * ramclustObj$rtsd (cluster retention time standard deviation) are considered for merging.
#' @details exports files to a directory called 'spectra'. If one.file = FALSE, a new directory 'spectra/msp' is created to hold the individual msp files. if do.findman has been run, spectra are written as ms2 spectra, else as ms1.
#' @return new ramclustR object, with (generally) fewer clusters than the input ramclustR object.
#' @concept ramclustR
#' @concept RAMClustR
#' @concept metabolomics
#' @concept mass spectrometry
#' @concept clustering
#' @concept xcms
#' @author Corey Broeckling
#' @importFrom methods is
#' @export
#'
#'
rc.merge.split.clusters <- function(
ramclustObj = NULL,
merge.threshold = 0.7,
cor.method = 'spearman',
rt.sd.factor = 3

) {

if(is.null(ramclustObj)) stop('please provide a ramclustObj', '\n')
if(!is.numeric(merge.threshold)) merge.threshold <- as.numeric(merge.threshold)
if(merge.threshold < 0 | merge.threshold > 1) stop("'merge.threshold' must be between zero and one", '\n')


## for all clusters, see if there are clusters at nearby retentiontimes which highly correlate
## if there are, join them, renumber featclus, and regenerate SpecAbund file.
orig.cl.n <- max(ramclustObj$featclus)
cls <- max(ramclustObj$featclus):2
# ramclustObj <- RC
for(i in cls) {
if(max(diff(sort(unique(ramclustObj$featclus)))) > 1) stop("error 1")
potential.merges <- which(
(abs(ramclustObj$clrt[1:i] - ramclustObj$clrt[i])) < (rt.sd.factor*ramclustObj$clrtsd[i])
)
potential.merges <- potential.merges[!potential.merges == i]
if(length(potential.merges) == 0) next

rval <- cor(ramclustObj$SpecAbund[,i], ramclustObj$SpecAbund[,potential.merges], method = cor.method)
merges <- potential.merges[which(rval >= merge.threshold)]
if(length(merges) == 0) next

merges <- max(merges)

ramclustObj$featclus[which(ramclustObj$featclus == i)] <- merges
# if(max(diff(sort(unique(ramclustObj$featclus)))) > 1) cat(i, "max diff = ", max(diff(sort(unique(ramclustObj$featclus)))), '\n')
old.featclus <- ramclustObj$featclus
new.featclus <- old.featclus
decend.by.one <- which(old.featclus > i)
new.featclus[decend.by.one] <- (old.featclus[decend.by.one])-1
if(max(diff(sort(unique(new.featclus)))) > 1) stop("error 2")
ramclustObj$featclus <- new.featclus

}
# sort(unique(ramclustObj$featclus))
# sort(unique(RC$featclus))

## store SpecAbund sample names
sa.rn <- dimnames(ramclustObj$SpecAbund)[[1]]

# 0.99888

# collapse feature dataset into spectrum dataset
wts<-colSums(ramclustObj$MSdata)
ramclustObj$SpecAbund<-matrix(nrow=nrow(ramclustObj$SpecAbund), ncol=max(ramclustObj$featclus))
for (ro in 1:nrow(ramclustObj$SpecAbund)) {
for (co in 1:ncol(ramclustObj$SpecAbund)) {
ramclustObj$SpecAbund[ro,co]<- weighted.mean(ramclustObj$MSdata[ro,which(ramclustObj$featclus==co)], wts[which(ramclustObj$featclus==co)])
}
}
dimnames(ramclustObj$SpecAbund)[[1]] <- sa.rn

strl <- nchar(max(ramclustObj$featclus)) - 1
ramclustObj$cmpd <- paste("C", formatC(1:max(ramclustObj$featclus), digits = strl, flag = 0 ) , sep="")
ramclustObj$ann <- ramclustObj$cmpd

clrt<-aggregate(ramclustObj$frt, by=list(ramclustObj$featclus), FUN="mean")
ramclustObj$clrt<-clrt[which(clrt[,1]!=0),2]
clrtsd<-aggregate(ramclustObj$frt, by=list(ramclustObj$featclus), FUN="sd")
ramclustObj$clrtsd<-clrtsd[which(clrtsd[,1]!=0),2]
ramclustObj$nfeat<-as.vector(table(ramclustObj$featclus)[2:max(ramclustObj$featclus)])
ramclustObj$nsing<-length(which(ramclustObj$featclus==0))
ramclustObj$annconf<-rep(4, length(ramclustObj$clrt))
ramclustObj$annnotes<-rep("", length(ramclustObj$clrt))
dimnames(ramclustObj$SpecAbund)[[1]] <- ramclustObj$sample_names

new.cl.n <- max(ramclustObj$featclus)
cat(paste("Original cluster number =", orig.cl.n, '\n', "New cluster number =", new.cl.n, '\n'))

return(ramclustObj)

}

40 changes: 40 additions & 0 deletions man/rc.merge.split.clusters.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3a466e4

Please sign in to comment.