From 2ee4ccbc02458088bc4704ebc14a497eb67487da Mon Sep 17 00:00:00 2001 From: Bryson Gibbons Date: Thu, 12 Jan 2023 15:48:04 -0800 Subject: [PATCH] Add a parameter to override a test and include spectra for mzML/mzXML centroided spectra Existing code always tests the spectrum peak data for a minimum median peak distance of 50 ppm (to be considered centroid data), which always overrides the value read from an mzML or mzXML file. The parameter overrides the test result in the case that the mzML/mzXML file reports the spectrum as centroided, but the peak data failed the test. --- docs/Changelog.html | 7 ++++ docs/MSGFPlus.html | 4 +++ .../ucsd/msjava/msdbsearch/SearchParams.java | 7 ++++ .../java/edu/ucsd/msjava/msutil/SpecKey.java | 28 ++++++++++++---- .../java/edu/ucsd/msjava/msutil/Spectrum.java | 32 +++++++++++++++++-- .../edu/ucsd/msjava/params/ParamManager.java | 19 +++++++++++ src/main/java/edu/ucsd/msjava/ui/MSGFDB.java | 3 +- .../java/edu/ucsd/msjava/ui/MSGFDBLib.java | 2 +- .../java/edu/ucsd/msjava/ui/MSGFPlus.java | 7 ++-- 9 files changed, 95 insertions(+), 14 deletions(-) diff --git a/docs/Changelog.html b/docs/Changelog.html index a03e3f75..93a81405 100644 --- a/docs/Changelog.html +++ b/docs/Changelog.html @@ -13,6 +13,13 @@

MS-GF+ ChangeLog

MS-GF+ Documentation home

+

+ v2023.01.12 +

+ +

v2022.04.18

diff --git a/docs/MSGFPlus.html b/docs/MSGFPlus.html index 3bfe8717..c526971d 100644 --- a/docs/MSGFPlus.html +++ b/docs/MSGFPlus.html @@ -90,6 +90,10 @@

MS-GF+

[-maxMissedCleavages Count] (Exclude peptides with more than this number of missed cleavages from the search; Default: -1 (no limit)) [-numMods Count] (Maximum number of dynamic (variable) modifications per peptide; Default: 3) + +[-allowDenseCentroidedPeaks 0/1] (Default: 0 (disabled); 1: (for mzML/mzXML input only) allows inclusion of spectra with high-density centroid data in the search) + MS-GF+ checks the distance between consecutive peaks in each spectrum; if the median distance is less than 50 ppm, the spectrum is treated as a profile spectrum regardless of the value provided in the mzML or mzXML file. + This parameter allows overriding this check when the mzML/mzXML file says the spectrum is centroided. diff --git a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java index 01787e2e..55982a06 100644 --- a/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java +++ b/src/main/java/edu/ucsd/msjava/msdbsearch/SearchParams.java @@ -50,6 +50,7 @@ public class SearchParams { private double chargeCarrierMass; private int maxMissedCleavages; private int maxNumMods; + private boolean allowDenseCentroidedPeaks; public SearchParams() { } @@ -214,6 +215,10 @@ public int getMaxMissedCleavages() { return maxMissedCleavages; } + // Used by MS-GF+ + public boolean getAllowDenseCentroidedPeaks() { + return allowDenseCentroidedPeaks; + } /** * Look for # in dataLine @@ -402,6 +407,8 @@ public String parse(ParamManager paramManager) { } else if (maxMissedCleavages > -1 && enzyme.getName().equals("NoCleavage")) { return "Cannot specify a MaxMissedCleavages when using no cleavage enzyme"; } + + allowDenseCentroidedPeaks = paramManager.getAllowDenseCentroidedPeaks() == 1; maxNumMods = paramManager.getMaxNumModsPerPeptide(); int maxNumModsCompare = aaSet.getMaxNumberOfVariableModificationsPerPeptide(); diff --git a/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java b/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java 
index 4862882c..796b3af9 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java +++ b/src/main/java/edu/ucsd/msjava/msutil/SpecKey.java @@ -67,7 +67,8 @@ public static ArrayList getSpecKeyList( int minCharge, int maxCharge, ActivationMethod activationMethod, - int minNumPeaksPerSpectrum) { + int minNumPeaksPerSpectrum, + boolean allowDenseCentroidedData) { Iterator itr = specAcc.getSpecItr(); @@ -78,7 +79,8 @@ public static ArrayList getSpecKeyList( minCharge, maxCharge, activationMethod, - minNumPeaksPerSpectrum); + minNumPeaksPerSpectrum, + allowDenseCentroidedData); SpectrumParser parser = specAcc.getSpectrumParser(); @@ -101,7 +103,8 @@ public static ArrayList getSpecKeyList( int minCharge, int maxCharge, ActivationMethod activationMethod, - int minNumPeaksPerSpectrum) { + int minNumPeaksPerSpectrum, + boolean allowDenseCentroidedData) { if (activationMethod == ActivationMethod.FUSION) return getFusedSpecKeyList(itr, startSpecIndex, endSpecIndex, minCharge, maxCharge); @@ -109,6 +112,7 @@ public static ArrayList getSpecKeyList( ArrayList specKeyList = new ArrayList(); int numProfileSpectra = 0; + int numDenseCentroidedSpectra = 0; int numSpectraWithTooFewPeaks = 0; final int MAX_INFORMATIVE_MESSAGES = 10; int informativeMessageCount = 0; @@ -176,9 +180,18 @@ public static ArrayList getSpecKeyList( } } - if (!spec.isCentroided()) { + if (!spec.isCentroided() && !(spec.isCentroidedWithDensePeaks() && allowDenseCentroidedData)) { + String message = "Skip spectrum " + spec.getID() + " since "; + if (spec.isCentroidedWithDensePeaks()) { + message += "peaks are too dense"; + numDenseCentroidedSpectra++; + } else { + message += "it is not centroided"; + numProfileSpectra++; + } + if (informativeMessageCount < MAX_INFORMATIVE_MESSAGES) { - System.out.println("Skip spectrum " + spec.getID() + " since it is not centroided"); + System.out.println(message); informativeMessageCount++; } else { if (informativeMessageCount == MAX_INFORMATIVE_MESSAGES) { @@ -186,7 
+199,6 @@ public static ArrayList getSpecKeyList( informativeMessageCount++; } } - numProfileSpectra++; continue; } @@ -206,6 +218,10 @@ public static ArrayList getSpecKeyList( System.out.println("Ignoring " + numProfileSpectra + " profile spectra."); System.out.println("Ignoring " + numSpectraWithTooFewPeaks + " spectra having less than " + minNumPeaksPerSpectrum + " peaks."); + if (numDenseCentroidedSpectra > 0) { + System.out.println("Ignoring " + numDenseCentroidedSpectra + " spectra marked as centroid with dense peaks (<50ppm median distance).\n" + + " Re-run search with parameter '-allowDenseCentroidedPeaks 1' to include these spectra in the search"); + } return specKeyList; } diff --git a/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java b/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java index 59240e82..a18bfc1b 100644 --- a/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java +++ b/src/main/java/edu/ucsd/msjava/msutil/Spectrum.java @@ -44,6 +44,8 @@ public enum Polarity { private Polarity scanPolarity = Polarity.POSITIVE; private Boolean isCentroided = true; + private Boolean externalSetIsCentroided = false; + private Boolean isCentroidedWithDensePeaks = false; private boolean isHighPrecision = false; // private Tolerance precursorTolerance = null; @@ -256,6 +258,15 @@ public boolean isCentroided() { return this.isCentroided; } + /** + * Whether this spectrum is centroided according to the reader, but failed determineIsCentroided() because peaks are too dense. + * + * @return true only if the reader called setIsCentroided(true) and determineIsCentroided() then failed its peak-density check; false otherwise + */ + public boolean isCentroidedWithDensePeaks() { + return this.isCentroidedWithDensePeaks; + } + /** * Returns whether this spectrum peaks are measured with high-precision. 
* @@ -437,6 +448,8 @@ public void setScanPolarity(Polarity scanPolarity) { */ public void setIsCentroided(boolean isCentroided) { this.isCentroided = isCentroided; + // function is used for mzML and mzXML files, track that isCentroided was set outside of this class + this.externalSetIsCentroided = true; } /** @@ -489,7 +502,7 @@ public Float getIsolationWindowTargetMz() { * Sets isCentroided by a simple testing. */ public void determineIsCentroided() { - this.isCentroided = true; + boolean centroidedCheckPass = true; // if(this.size() > 100) // { @@ -516,8 +529,21 @@ public void determineIsCentroided() { prevMz = curMz; } Collections.sort(diff); - if (diff.size() > 0 && diff.get(diff.size() / 2) < 50) - isCentroided = false; + if (diff.size() > 0 && diff.get(diff.size() / 2) < 50) { + // Check failed - the median PPM distance between peaks is less than 50 PPM + centroidedCheckPass = false; + } + } + + if (centroidedCheckPass) { + this.isCentroided = true; + } else { + if (this.isCentroided && this.externalSetIsCentroided) { + // set a flag to notify the user + this.isCentroidedWithDensePeaks = true; + } + + this.isCentroided = false; } } diff --git a/src/main/java/edu/ucsd/msjava/params/ParamManager.java b/src/main/java/edu/ucsd/msjava/params/ParamManager.java index c8974ddb..24862bed 100644 --- a/src/main/java/edu/ucsd/msjava/params/ParamManager.java +++ b/src/main/java/edu/ucsd/msjava/params/ParamManager.java @@ -131,6 +131,9 @@ public enum ParamNameEnum { ADD_FEATURES("addFeatures", "AddFeatures", "Include additional features in the output (enable this to post-process results with Percolator)", "0 means Output basic scores only (Default)\n" + "\t 1 means Output additional features"), + + ALLOW_DENSE_CENTROIDED_PEAKS("allowDenseCentroidedPeaks", "AllowDenseCentroidedPeaks", "Allow centroid scans with dense peaks (Default: 0)\n" + + "\t (for mzML or mzXML files, the console output will tell you if you might want to use this)", null), DD_DIRECTORY("dd", 
"DBIndexDir", "Path to the directory containing database index files", null), @@ -652,6 +655,13 @@ private void addMaxNumModsParam() { addParameter(maxNumMods); } + private void addAllowDenseCentroidedPeaksParam() { + EnumParameter allowDenseCentroidedPeaksParam = new EnumParameter(ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS); + allowDenseCentroidedPeaksParam.registerEntry("Skip all spectra that fail a peak density check").setDefault(); + allowDenseCentroidedPeaksParam.registerEntry("Allow mzML/mzXML centroided spectra that fail a peak density check"); + addParameter(allowDenseCentroidedPeaksParam); + } + private void addDbIndexDirParam(boolean isHidden) { FileParameter dbIndexDirParam = new FileParameter(ParamNameEnum.DD_DIRECTORY); dbIndexDirParam.fileMustExist(); @@ -780,6 +790,8 @@ public void addMSGFPlusParams() { addChargeCarrierMassParam(); addMaxMissedCleavagesParam(); addMaxNumModsParam(); + + addAllowDenseCentroidedPeaksParam(); addExample("Example (high-precision): java -Xmx3500M -jar MSGFPlus.jar -s test.mzML -d IPI_human_3.79.fasta -inst 1 -t 20ppm -ti -1,2 -ntt 2 -tda 1 -o testMSGFPlus.mzid -mod Mods.txt"); addExample("Example (low-precision): java -Xmx3500M -jar MSGFPlus.jar -s test.mzML -d IPI_human_3.79.fasta -inst 0 -t 0.5Da,2.5Da -ntt 2 -tda 1 -o testMSGFPlus.mzid -mod Mods.txt"); @@ -907,6 +919,8 @@ public void addMSGFDBParams() { uniformAAProb.registerEntry("Use amino acid probabilities computed from the input database").setDefault(); uniformAAProb.registerEntry("Use probability 0.05 for all amino acids"); addParameter(uniformAAProb); + + addAllowDenseCentroidedPeaksParam(); addExample("Example (high-precision): java -Xmx2000M -jar MSGFDB.jar -s test.mzXML -d IPI_human_3.79.fasta -t 30ppm -c13 1 -nnet 0 -tda 1 -o testMSGFDB.tsv"); addExample("Example (low-precision): java -Xmx2000M -jar MSGFDB.jar -s test.mzXML -d IPI_human_3.79.fasta -t 0.5Da,2.5Da -nnet 0 -tda 1 -o testMSGFDB.tsv"); @@ -1175,6 +1189,11 @@ public FileParameter 
getConfigFileParam() { return ((FileParameter) getParameter(ParamNameEnum.CONFIGURATION_FILE.key)); } + // Used by MS-GF+ + public int getAllowDenseCentroidedPeaks() { + return getIntValue(ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS.key); + } + public int getIntValue(String key) { Parameter param = this.getParameter(key); if (param instanceof IntParameter) diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java b/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java index 763a7715..1fe86467 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFDB.java @@ -208,6 +208,7 @@ private static String runMSGFDB(File specFile, SpecFileFormat specFormat, File o boolean useUniformAAProb = paramManager.getIntValue(ParamManager.ParamNameEnum.UNIFORM_AA_PROBABILITY.getKey()) == 1; boolean replicateMergedResults = paramManager.getIntValue("replicate") == 1; boolean doNotDseEdgeScore = paramManager.getIntValue(ParamManager.ParamNameEnum.EDGE_SCORE.getKey()) == 1; + boolean allowDenseCentroidedPeaks = paramManager.getIntValue(ParamManager.ParamNameEnum.ALLOW_DENSE_CENTROIDED_PEAKS.getKey()) == 1; System.out.println("Loading database files..."); File dbIndexDir = paramManager.getFile(ParamManager.ParamNameEnum.DD_DIRECTORY.getKey()); @@ -277,7 +278,7 @@ private static String runMSGFDB(File specFile, SpecFileFormat specFormat, File o int avgPeptideMass = 2000; int numBytesPerMass = 12; int numSpecScannedTogether = (int) ((float) maxMemory / avgPeptideMass / numBytesPerMass); - ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc.getSpecItr(), startSpecIndex, endSpecIndex, minCharge, maxCharge, activationMethod, Constants.MIN_NUM_PEAKS_PER_SPECTRUM); + ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc.getSpecItr(), startSpecIndex, endSpecIndex, minCharge, maxCharge, activationMethod, Constants.MIN_NUM_PEAKS_PER_SPECTRUM, allowDenseCentroidedPeaks); int specSize = specKeyList.size(); System.out.print("Reading spectra finished "); diff --git 
a/src/main/java/edu/ucsd/msjava/ui/MSGFDBLib.java b/src/main/java/edu/ucsd/msjava/ui/MSGFDBLib.java index 648886ef..b84dcf0f 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFDBLib.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFDBLib.java @@ -104,7 +104,7 @@ public static String runMSGFLib(ParamManager paramManager) { int avgPeptideMass = 2000; int numBytesPerMass = 12; int numSpecScannedTogether = (int) ((float) maxMemory / avgPeptideMass / numBytesPerMass); - ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc.getSpecItr(), 0, Integer.MAX_VALUE, 0, Integer.MAX_VALUE, activationMethod, Constants.MIN_NUM_PEAKS_PER_SPECTRUM); + ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc.getSpecItr(), 0, Integer.MAX_VALUE, 0, Integer.MAX_VALUE, activationMethod, Constants.MIN_NUM_PEAKS_PER_SPECTRUM, false); int specSize = specKeyList.size(); System.out.print("Reading spectra finished "); diff --git a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java index 5b1fdbb0..83d435ee 100644 --- a/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java +++ b/src/main/java/edu/ucsd/msjava/ui/MSGFPlus.java @@ -22,8 +22,8 @@ public class MSGFPlus { - public static final String VERSION = "Release (v2022.04.18)"; - public static final String RELEASE_DATE = "18 April 2022"; + public static final String VERSION = "Release (v2023.01.12)"; + public static final String RELEASE_DATE = "12 January 2023"; public static final String DECOY_DB_EXTENSION = ".revCat.fasta"; public static final String DEFAULT_DECOY_PROTEIN_PREFIX = "XXX"; @@ -185,6 +185,7 @@ private static String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o int numThreads = params.getNumThreads(); boolean doNotUseEdgeScore = params.doNotUseEdgeScore(); + boolean allowDenseCentroidedPeaks = params.getAllowDenseCentroidedPeaks(); int minNumPeaksPerSpectrum = params.getMinNumPeaksPerSpectrum(); if (minNumPeaksPerSpectrum == -1) // not specified @@ -267,7 +268,7 @@ private static 
String runMSGFPlus(int ioIndex, SpecFileFormat specFormat, File o return "Error while parsing spectrum file: " + specFile.getPath(); ArrayList specKeyList = SpecKey.getSpecKeyList(specAcc, - startSpecIndex, endSpecIndex, minCharge, maxCharge, activationMethod, minNumPeaksPerSpectrum); + startSpecIndex, endSpecIndex, minCharge, maxCharge, activationMethod, minNumPeaksPerSpectrum, allowDenseCentroidedPeaks); int specSize = specKeyList.size(); if (specSize == 0)