From da1b6881a82bfd3bdcbcb75e7efea8c9507f4e05 Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Thu, 4 May 2023 15:07:05 +0200 Subject: [PATCH 01/19] =?UTF-8?q?Reupload=20f=C3=BCr=20Codereview?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../OpenMS/ANALYSIS/ID/SearchDatabase.h | 136 +++++++ .../source/ANALYSIS/ID/SearchDatabase.cpp | 300 ++++++++++++++++ .../openms/source/SearchDatabase_test.cpp | 340 ++++++++++++++++++ 3 files changed, 776 insertions(+) create mode 100644 src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h create mode 100644 src/openms/source/ANALYSIS/ID/SearchDatabase.cpp create mode 100644 src/tests/class_tests/openms/source/SearchDatabase_test.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h new file mode 100644 index 00000000000..ea502ddb7ba --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -0,0 +1,136 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace OpenMS +{ + /** @brief Creating a two level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting + * them in the outer tree by fragment-MZ and the inner tree by precursor-MZ. Search function to find candidates to experimental MS2 sepctra + * @ingroup ID + */ +class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler +{ + public: + + /** @brief Storing found candiates from search + */ + struct Candidate + { + const AASequence* sequence; ///< pointer to AASequence of candidate + size_t protein_index; ///< Index to Entry of std::vector + Candidate(const AASequence* seq, size_t pi) : sequence(seq), protein_index(pi){} + }; + SearchDatabase() = delete; + virtual ~SearchDatabase() = default; + /** @brief Builds up the Search Datastructure + + * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on + */ + SearchDatabase(const std::vector& entries); + /** @brief Searches Peaks of every MSSpectrum of the MSExperiment in the Database + + * @param experiment Input MSExperiment containing of MS2 Spectra + * @param candidates Output vector of found candidates with the index of MSSpectrum in experiment + */ + void search(MSExperiment& experiment, std::vector, size_t>>& candidates) const; + /** @brief Searches Peaks of the MSSpectrum in the Database + + * @param spectrum Input MS2 Spectrum + * @param candidates Output vector of found candidatest + */ + void search(MSSpectrum& spectrum, std::vector& candidates) const; + + protected: + + void updateMembers_() override; + + struct Fragment_ + { + size_t peptide_index_; + double fragment_mz_; + Fragment_() = delete; + Fragment_(size_t prec, const Peak1D& frag):peptide_index_(prec), fragment_mz_(frag.getMZ()){} + }; + + struct Peptide_ + { + AASequence sequence_; + size_t protein_index_; + double peptide_mz_; + Peptide_() = default; + Peptide_(const Peptide_&) = default; + Peptide_(const AASequence& pep, size_t index, double mass):sequence_(pep), protein_index_(index), peptide_mz_(mass){} + Peptide_& operator=(Peptide_&&) = default; + Peptide_& operator=(const Peptide_&) = default; + }; + + /// generates theoretical Peptides from all Proteins in fasta-File + std::vector generate_peptides_(const std::vector& entries) const; + /// Merges presorted Chunks of Peptide-Fragments inplace + void fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const; + ///generates sortet vector with all theoretical Fragments for all theoretical Peptides + std::vector generate_fragments_() const; + + std::string digestor_enzyme_; + size_t missed_cleavages_; + double peptide_min_mass_; + double peptide_max_mass_; + size_t peptide_min_length_; + size_t peptide_max_length_; + double fragment_min_mz_; + double fragment_max_mz_; + size_t bucketsize_; ///< number of fragments per outer node + double precursor_mz_tolerance_; + std::string precursor_mz_tolerance_unit_; + double fragment_mz_tolerance_; + std::string fragment_mz_tolerance_unit_; + StringList modifications_fixed_; + StringList modifications_variable_; + size_t max_variable_mods_per_peptide_; + std::vector all_peptides_{}; + std::vector bucket_frags_mz_{}; ///< Minimum fragment-MZ for each other node + std::vector all_fragments_{}; +}; + +} \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp new file mode 100644 index 00000000000..81994124e46 --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -0,0 +1,300 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + +namespace OpenMS +{ +std::vector SearchDatabase::generate_peptides_(const std::vector& entries) const +{ + vector all_peptides; + #pragma omp parallel + { + ProteaseDigestion digestor; + digestor.setEnzyme(digestor_enzyme_); + digestor.setMissedCleavages(missed_cleavages_); + vector all_peptides_pvt; + #pragma omp for nowait + for (size_t i = 0; i < entries.size(); i++) + { + vector peptides; + digestor.digest(AASequence::fromString(entries[i].sequence), peptides, peptide_min_length_, peptide_max_length_); + + for (const auto& pep : peptides) + { + if (pep.toString().find('X') != string::npos) continue; // filtering peptide with unknown AA, can't calculate MonoWeight + + double seq_mz = pep.getMonoWeight(); + + if (!Math::contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; + + all_peptides_pvt.emplace_back(pep, i, seq_mz); + + } + } + #pragma omp critical (add_critical) + all_peptides.insert(all_peptides.end(), all_peptides_pvt.begin(), all_peptides_pvt.end()); + } + return all_peptides; +} + +void SearchDatabase::fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const +{ + if (last - first > 1) + { + int mid = first + (last-first) / 2; + #pragma omp parallel sections + { + #pragma omp section + SearchDatabase::fragment_merge_(first, mid, chunks, input); + #pragma omp section + SearchDatabase::fragment_merge_(mid, last, chunks, input); + } + std::inplace_merge(input.begin() + chunks[first], input.begin() + chunks[mid], input.begin() + chunks[last], + [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r)-> bool + {return (tie(l.fragment_mz_, all_peptides_[l.peptide_index_].peptide_mz_) < tie(r.fragment_mz_, all_peptides_[r.peptide_index_].peptide_mz_));}); + } +} + +std::vector SearchDatabase::generate_fragments_() const +{ + TheoreticalSpectrumGenerator tsg; + PeakSpectrum b_y_ions; + std::vector all_frags; + std::vector chunk_start = {0}; //for fragment_merge need start and end of presorted chunks + + for (size_t i = 0; i < all_peptides_.size(); i++) + { + tsg.getSpectrum(b_y_ions, all_peptides_[i].sequence_, 1, 1); + for (const auto& frag : b_y_ions) + { + if (!Math::contains(frag.getMZ(), fragment_min_mz_, fragment_max_mz_)) continue; + all_frags.emplace_back(i, frag); + } + chunk_start.emplace_back(all_frags.size()); + b_y_ions.clear(true); + } + + SearchDatabase::fragment_merge_(0, chunk_start.size()-1, chunk_start, all_frags); + + return all_frags; +} + +SearchDatabase::SearchDatabase(const std::vector& entries) : DefaultParamHandler("SearchDatabase") +{ + vector all_enzymes; + vector tolerance_units{"Da", "ppm"}; + ProteaseDB::getInstance()->getAllNames(all_enzymes); + defaults_.setValue("digestor_enzyme", "Trypsin", "Enzyme for digestion"); + defaults_.setValidStrings("digestor_enzyme", ListUtils::create(all_enzymes)); + defaults_.setValue("missed_cleavages", 0, "missed cleavages for digestion"); + defaults_.setValue("peptide_min_mass", 500, "minimal peptide mass for database"); + defaults_.setValue("peptide_max_mass", 5000, "maximal peptide mass for database"); + defaults_.setValue("peptide_min_length", 5, "minimal peptide length for database"); + defaults_.setValue("peptide_max_length", 50, "maximal peptide length for database"); + defaults_.setValue("fragment_min_mz", 150, "minimal fragment mz for database"); + defaults_.setValue("fragment_max_mz", 2000, "maximal fragment mz for database"); + defaults_.setValue("precursor_mz_tolerance", 2, "tolerance for precursor-MZ in search"); + defaults_.setValue("fragment_mz_tolerance", 0.05, "tolerance for fragment-MZ in search"); + defaults_.setValue("precursor_mz_tolerance_unit", "Da", "unit of tolerance for precursor-MZ"); + defaults_.setValidStrings("precursor_mz_tolerance_unit", tolerance_units); + defaults_.setValue("fragment_mz_tolerance_unit", "Da", "unit of tolerance for fragment-MZ"); + defaults_.setValidStrings("fragment_mz_tolerance_unit", tolerance_units); + + defaultsToParam_(); + + if (entries.size() == 0) return; + + all_peptides_ = generate_peptides_(entries); + + all_fragments_ = generate_fragments_(); + + bucketsize_ = size_t(sqrt(all_fragments_.size())); //calculating bucketsize for balanced tree (performance) + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + bucket_frags_mz_.emplace_back(all_fragments_[i].fragment_mz_); + } + #pragma omp parallel + { + // sorting the fragments of each bucket by the precursor-MZ + #pragma omp for + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_begin = all_fragments_.begin()+i; + auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); + + sort(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, + [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } + } +} + +void SearchDatabase::updateMembers_() +{ + digestor_enzyme_ = param_.getValue("digestor_enzyme").toString(); + missed_cleavages_ = param_.getValue("missed_cleavages"); + peptide_min_mass_ = param_.getValue("peptide_min_mass"); + peptide_max_mass_ = param_.getValue("peptide_max_mass"); + peptide_min_length_ = param_.getValue("peptide_min_length"); + peptide_max_length_ = param_.getValue("peptide_max_length"); + fragment_min_mz_ = param_.getValue("fragment_min_mz"); + fragment_max_mz_ = param_.getValue("fragment_max_mz"); + precursor_mz_tolerance_ = param_.getValue("precursor_mz_tolerance"); + fragment_mz_tolerance_ = param_.getValue("fragment_mz_tolerance"); + precursor_mz_tolerance_unit_ = param_.getValue("precursor_mz_tolerance_unit").toString(); + fragment_mz_tolerance_unit_ = param_.getValue("fragment_mz_tolerance_unit").toString(); +} + +void SearchDatabase::search(MSSpectrum& spectrum, std::vector& candidates) const +{ + candidates.clear(); + if (spectrum.size() == 0) return; + + unordered_set index_hash; // saving every candidate only once + + std::vector precursor = spectrum.getPrecursors(); + + if (precursor.size() != 1) return; + + double prec_mz = precursor[0].getUnchargedMass(); + + spectrum.sortByIntensity(true); + size_t count_found = 0; + size_t count_rounds = 0; + + auto prec_unit_cond = precursor_mz_tolerance_unit_ == "Da"; + auto frag_unit_cond = fragment_mz_tolerance_unit_ == "Da"; + + for (const auto& peak : spectrum) + { + if (count_found == 0 && count_rounds == 5) return; // if peak with the 5 highest intensity don't match, skip whole spectrum + + double new_frag_mz_tolerance = frag_unit_cond ? fragment_mz_tolerance_ : Math::ppmToMassAbs(fragment_mz_tolerance_, peak.getMZ()); + + auto itr_lower = lower_bound(bucket_frags_mz_.begin(), bucket_frags_mz_.end(), peak.getMZ() - new_frag_mz_tolerance); + + size_t index_lower = distance(bucket_frags_mz_.begin(), itr_lower); + + if (index_lower != 0) index_lower--; // lower bound returns one too high (because searching the minimum of each bucket) + + while (bucket_frags_mz_[index_lower] <= (peak.getMZ() + new_frag_mz_tolerance) && index_lower < bucket_frags_mz_.size()) + { + auto bucket_start = all_fragments_.begin() + (index_lower * bucketsize_); + auto bucket_end = all_fragments_.begin() + (index_lower + 1) * bucketsize_; + + auto condition = distance(all_fragments_.begin(), bucket_end) >= int(all_fragments_.size()); + + double new_prec_mz_tolerance = prec_unit_cond ? precursor_mz_tolerance_ : Math::ppmToMassAbs(precursor_mz_tolerance_, prec_mz); + + auto itr_lower_inner = lower_bound(bucket_start, condition ? all_fragments_.end() : bucket_end, prec_mz - new_prec_mz_tolerance, + [&](const SearchDatabase::Fragment_& r, double l) { + return (all_peptides_[r.peptide_index_].peptide_mz_ < l); + }); + + while (all_peptides_[(*itr_lower_inner).peptide_index_].peptide_mz_ <= (prec_mz + new_prec_mz_tolerance) && itr_lower_inner != bucket_end && itr_lower_inner != all_fragments_.end()) + { + double theoretical_peak_tolerance = frag_unit_cond ? fragment_mz_tolerance_ : Math::ppmToMassAbs(fragment_mz_tolerance_, (*itr_lower_inner).fragment_mz_); + // filter by matching fragment-MZ + if (!Math::contains(peak.getMZ(), (*itr_lower_inner).fragment_mz_ - theoretical_peak_tolerance, (*itr_lower_inner).fragment_mz_ + theoretical_peak_tolerance)) + { + ++itr_lower_inner; + continue; + } + index_hash.insert((*itr_lower_inner).peptide_index_); + ++itr_lower_inner; + count_found++; + } + index_lower++; + } + count_rounds++; + } + + for (size_t j : index_hash) + { + candidates.emplace_back(&all_peptides_[j].sequence_, all_peptides_[j].protein_index_); + } +} + +void SearchDatabase::search(MSExperiment& experiment, std::vector, size_t>>& candidates) const +{ + candidates.clear(); + if (experiment.size() == 0) return; + + #pragma omp parallel + { + #pragma omp for + for (size_t i = 0; i < experiment.size(); i++) + { + if (experiment[i].size() == 0) continue; + + vector temp_cand; + + SearchDatabase::search(experiment[i], temp_cand); + + #pragma omp critical (add_critical) + { + candidates.emplace_back(temp_cand, i); + } + } + } +} + +} // ende namespace OpenMS + + + + + diff --git a/src/tests/class_tests/openms/source/SearchDatabase_test.cpp b/src/tests/class_tests/openms/source/SearchDatabase_test.cpp new file mode 100644 index 00000000000..af0d630ff9d --- /dev/null +++ b/src/tests/class_tests/openms/source/SearchDatabase_test.cpp @@ -0,0 +1,340 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace OpenMS; +using namespace std; + +class SearchDatabase_test : public SearchDatabase +{ + public: + + using SearchDatabase::SearchDatabase; + + void printAllFragments() + { + + int counter = 0; + + for (auto j : all_fragments_) + { + if(counter == bucketsize_) + { + cout << "\n"; + counter = 0; + } + + cout << j.fragment_mz_ << " " << all_peptides_[j.peptide_index_].peptide_mz_ << "\n"; + + counter++; + } + + } + + std::vector getAllFragments(){return all_fragments_;} + + size_t getBucketsize(){return bucketsize_;} + + string getDigestorEnzyme(){return digestor_enzyme_;} + + size_t getMissedCleavages(){return missed_cleavages_;} + + size_t getpeptide_minlen(){return peptide_min_length_;} + size_t getpeptide_maxlen(){return peptide_max_length_;} + double getpeptide_minMass(){return peptide_min_mass_;} + double getpeptide_maxMass(){return peptide_max_mass_;} + double getfrag_minMZ(){return fragment_min_mz_;} + double getfrag_maxMZ(){return fragment_max_mz_;} + size_t getpeptideSize(){return all_peptides_.size();} + vector getpeptide(){return all_peptides_;} + + double getFragMZ(size_t i) + { + return all_fragments_[i].fragment_mz_; + } + + double getPrecMZ(size_t i) + { + return all_peptides_[all_fragments_[i].peptide_index_].peptide_mz_; + } + +}; + +START_TEST(SearchDatabase, "$Id:$") + + +START_SECTION(SearchDatabase(const std::vector& entries)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + SearchDatabase_test sdb(entries); + + vector all_peptides; + + ProteaseDigestion digestor; + + digestor.setEnzyme(sdb.getDigestorEnzyme()); + digestor.setMissedCleavages(sdb.getMissedCleavages()); + + for (auto i : entries) + { + vector peptides; + + digestor.digest(AASequence::fromString(i.sequence), peptides, sdb.getpeptide_minlen(), sdb.getpeptide_maxlen()); + for (const auto& pep : peptides) + { + if (pep.toString().find('X') != string::npos) continue; + double seq_mz = pep.getMonoWeight(); + if (!Math::contains(seq_mz, sdb.getpeptide_minMass(), sdb.getpeptide_maxMass())) continue; + all_peptides.emplace_back(pep); + + } + } + + TEST_EQUAL(all_peptides.size(), sdb.getpeptideSize()) + TheoreticalSpectrumGenerator tsg; + PeakSpectrum b_y_ions; + int count_all_frags=0; + + for (size_t i = 0; i < all_peptides.size(); i++) + { + tsg.getSpectrum(b_y_ions, all_peptides[i], 1, 1); + for (const auto& frag : b_y_ions) + { + if (!Math::contains(frag.getMZ(), sdb.getfrag_minMZ(), sdb.getfrag_maxMZ())) continue; + count_all_frags++; + } + b_y_ions.clear(true); + } + TEST_EQUAL(sdb.getAllFragments().size(), count_all_frags) + +END_SECTION + +START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidates)) + + cout << "\n"; + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + SearchDatabase_test sdb(entries); + + MSSpectrum spec; + + Precursor prec{}; + + std::vector candidates; + + START_SECTION(Searching 3 Fragments it should find) + + prec.setCharge(1); + + prec.setMZ(1280.6); + + spec.setPrecursors({prec}); + + spec.push_back({605.308, 100}); + spec.push_back({676.345, 100}); + spec.push_back({823.413, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Fragment Mass) + + spec.clear(true); + + spec.setPrecursors({prec}); + + spec.push_back({1040, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Precursor Mass) + + spec.clear(true); + + prec.setMZ(1500); + + spec.setPrecursors({prec}); + + spec.push_back({572.304, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its smaller then all Fragments in Database) + + spec.clear(true); + + prec.setMZ(1500); + + spec.setPrecursors({prec}); + + spec.push_back({100, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its bigger then all Fragments in Database) + + spec.clear(true); + + prec.setMZ(1500); + + spec.setPrecursors({prec}); + + spec.push_back({2000, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Testing filtering of Spectrum by best Peaks) + + spec.clear(true); + + prec.setMZ(1280.6); + + spec.setPrecursors({prec}); + + spec.push_back({2000, 80}); + + spec.push_back({1500, 99}); + + spec.push_back({500, 85}); + + spec.push_back({1000, 90}); + + spec.push_back({100, 100}); + + spec.push_back({937.456, 5}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + // sdb.printAllFragments(); + +END_SECTION + +START_SECTION(void search(MSExperiment& experiment, std::vector, size_t>>& candidates)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + SearchDatabase_test sdb(entries); + + MSExperiment exp; + + MSSpectrum spec; + + Precursor prec{}; + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + + spec.push_back({605.318, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(894.529); + + spec.setPrecursors({prec}); + + spec.push_back({175.119, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(1655.89); + + spec.setPrecursors({prec}); + + spec.push_back({1544.83, 100}); + + exp.addSpectrum(spec); + + std::vector, size_t>> candidates; + + sdb.search(exp, candidates); + + TEST_EQUAL(candidates.size(), 3) + +END_SECTION + +END_TEST \ No newline at end of file From 13ea132f58a483d4b7d54b7dc78df13fa1024c23 Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Thu, 4 May 2023 15:35:29 +0200 Subject: [PATCH 02/19] =?UTF-8?q?Modifikationen=20hinzugef=C3=BCgt?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../source/ANALYSIS/ID/SearchDatabase.cpp | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp index 81994124e46..17c0e7937a9 100644 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include #include @@ -59,6 +61,10 @@ namespace OpenMS std::vector SearchDatabase::generate_peptides_(const std::vector& entries) const { vector all_peptides; + + ModifiedPeptideGenerator::MapToResidueType fixed_modifications = ModifiedPeptideGenerator::getModifications(modifications_fixed_); + ModifiedPeptideGenerator::MapToResidueType variable_modifications = ModifiedPeptideGenerator::getModifications(modifications_variable_); + #pragma omp parallel { ProteaseDigestion digestor; @@ -75,6 +81,22 @@ std::vector SearchDatabase::generate_peptides_(const s { if (pep.toString().find('X') != string::npos) continue; // filtering peptide with unknown AA, can't calculate MonoWeight + vector modified_peptides; + + AASequence modified_pep = pep; + + ModifiedPeptideGenerator::applyFixedModifications(fixed_modifications, modified_pep); + ModifiedPeptideGenerator::applyVariableModifications(variable_modifications, modified_pep, max_variable_mods_per_peptide_, modified_peptides); + + for (const auto & mod_pep : modified_peptides) + { + double mod_seq_mz = mod_pep.getMonoWeight(); + + if (!Math::contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; + + all_peptides_pvt.emplace_back(mod_pep, i, mod_seq_mz); + } + double seq_mz = pep.getMonoWeight(); if (!Math::contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; @@ -133,9 +155,11 @@ std::vector SearchDatabase::generate_fragments_() con SearchDatabase::SearchDatabase(const std::vector& entries) : DefaultParamHandler("SearchDatabase") { - vector all_enzymes; - vector tolerance_units{"Da", "ppm"}; + vector all_enzymes; ProteaseDB::getInstance()->getAllNames(all_enzymes); + vector all_mods; + ModificationsDB::getInstance()->getAllSearchModifications(all_mods); + vector tolerance_units{"Da", "ppm"}; defaults_.setValue("digestor_enzyme", "Trypsin", "Enzyme for digestion"); defaults_.setValidStrings("digestor_enzyme", ListUtils::create(all_enzymes)); defaults_.setValue("missed_cleavages", 0, "missed cleavages for digestion"); @@ -151,6 +175,11 @@ SearchDatabase::SearchDatabase(const std::vector& entries defaults_.setValidStrings("precursor_mz_tolerance_unit", tolerance_units); defaults_.setValue("fragment_mz_tolerance_unit", "Da", "unit of tolerance for fragment-MZ"); defaults_.setValidStrings("fragment_mz_tolerance_unit", tolerance_units); + defaults_.setValue("modifications_fixed", std::vector{"Carbamidomethyl (C)"}, "Fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)'"); + defaults_.setValidStrings("modifications_fixed", ListUtils::create(all_mods)); + defaults_.setValue("modifications_variable", std::vector{"Oxidation (M)"}, "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Oxidation (M)'"); + defaults_.setValidStrings("modifications_variable", ListUtils::create(all_mods)); + defaults_.setValue("max_variable_mods_per_peptide", 2, "Maximum number of residues carrying a variable modification per candidate peptide"); defaultsToParam_(); @@ -196,6 +225,9 @@ void SearchDatabase::updateMembers_() fragment_mz_tolerance_ = param_.getValue("fragment_mz_tolerance"); precursor_mz_tolerance_unit_ = param_.getValue("precursor_mz_tolerance_unit").toString(); fragment_mz_tolerance_unit_ = param_.getValue("fragment_mz_tolerance_unit").toString(); + modifications_fixed_ = ListUtils::toStringList(param_.getValue("modifications_fixed")); + modifications_variable_ = ListUtils::toStringList(param_.getValue("modifications_variable")); + max_variable_mods_per_peptide_ = param_.getValue("max_variable_mods_per_peptide"); } void SearchDatabase::search(MSSpectrum& spectrum, std::vector& candidates) const From 31cee373fffcb856839e342e60b966e1a605689d Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Fri, 5 May 2023 14:36:02 +0200 Subject: [PATCH 03/19] made changes based on code rewiev \n added test for tolerance in ppm \n added test to check if Datasructure is sorted \n added UNIT_DA and UNIT_PPM to Constants.h --- .../OpenMS/ANALYSIS/ID/SearchDatabase.h | 44 +++-- .../include/OpenMS/ANALYSIS/ID/sources.cmake | 1 + src/openms/include/OpenMS/CONCEPT/Constants.h | 10 ++ .../source/ANALYSIS/ID/SearchDatabase.cpp | 77 +++++---- src/openms/source/ANALYSIS/ID/sources.cmake | 1 + .../class_tests/openms/executables.cmake | 1 + .../openms/source/SearchDatabase_test.cpp | 155 ++++++------------ 7 files changed, 137 insertions(+), 152 deletions(-) mode change 100644 => 100755 src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h mode change 100644 => 100755 src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake mode change 100644 => 100755 src/openms/include/OpenMS/CONCEPT/Constants.h mode change 100644 => 100755 src/openms/source/ANALYSIS/ID/SearchDatabase.cpp mode change 100644 => 100755 src/openms/source/ANALYSIS/ID/sources.cmake mode change 100644 => 100755 src/tests/class_tests/openms/executables.cmake mode change 100644 => 100755 src/tests/class_tests/openms/source/SearchDatabase_test.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h old mode 100644 new mode 100755 index ea502ddb7ba..4bcb3b4da56 --- a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -28,7 +28,7 @@ // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // -------------------------------------------------------------------------- -// $Maintainer: $ +// $Maintainer: Chris Bielow $ // $Authors: Max Alcer, Heike Einsfeld $ // -------------------------------------------------------------------------- @@ -45,8 +45,11 @@ namespace OpenMS { + //class MSExperiment; + /** @brief Creating a two level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting - * them in the outer tree by fragment-MZ and the inner tree by precursor-MZ. Search function to find candidates to experimental MS2 sepctra + * them in the outer tree by fragment-m/z and the inner tree by precursor-m/z. Search function to find candidates to experimental MS2 sepctra + * (implimented based on https://lazear.github.io/sage/) * @ingroup ID */ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler @@ -57,34 +60,44 @@ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler */ struct Candidate { - const AASequence* sequence; ///< pointer to AASequence of candidate + const AASequence* sequence; ///< Pointer to AASequence of candidate size_t protein_index; ///< Index to Entry of std::vector Candidate(const AASequence* seq, size_t pi) : sequence(seq), protein_index(pi){} }; + + /** @brief Storing vector of found candiates from search with the Index of the MSSpectrum in the MSExperiment + */ + struct CandidatesWithIndex + { + std::vector candidates; ///< Vector of found candidates + size_t spectrum_index; ///< Index to the MSSpectrum in the MSExperiment + CandidatesWithIndex(const std::vector& cand, size_t spectrum_i): candidates(cand), spectrum_index(spectrum_i){} + }; SearchDatabase() = delete; virtual ~SearchDatabase() = default; - /** @brief Builds up the Search Datastructure + /** @brief Builds up the Search Datastructure * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on */ - SearchDatabase(const std::vector& entries); - /** @brief Searches Peaks of every MSSpectrum of the MSExperiment in the Database + SearchDatabase(const std::vector& entries); - * @param experiment Input MSExperiment containing of MS2 Spectra + /** @brief Searches Peaks of every MSSpectrum of the MSExperiment in the Database + * @param experiment Input MSExperiment containing of MS2 Spectra (MS1 Spectra has an empty output) * @param candidates Output vector of found candidates with the index of MSSpectrum in experiment */ - void search(MSExperiment& experiment, std::vector, size_t>>& candidates) const; - /** @brief Searches Peaks of the MSSpectrum in the Database + void search(MSExperiment& experiment, std::vector& candidates) const; - * @param spectrum Input MS2 Spectrum + /** @brief Searches Peaks of the MSSpectrum in the Database + * @param spectrum Input MS2 Spectrum (MS1 Spectra has an empty output) * @param candidates Output vector of found candidatest */ - void search(MSSpectrum& spectrum, std::vector& candidates) const; + void search(MSSpectrum& spectrum, std::vector& candidates) const; protected: void updateMembers_() override; + //Saving b_y_ion after creating theoretical spectrum with index to precursor-peptide struct Fragment_ { size_t peptide_index_; @@ -93,13 +106,14 @@ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler Fragment_(size_t prec, const Peak1D& frag):peptide_index_(prec), fragment_mz_(frag.getMZ()){} }; + //Saving precursors AASequence (, index to protein and the MonoWeight) after digesting struct Peptide_ { AASequence sequence_; size_t protein_index_; double peptide_mz_; - Peptide_() = default; Peptide_(const Peptide_&) = default; + Peptide_(Peptide_&&) = default; Peptide_(const AASequence& pep, size_t index, double mass):sequence_(pep), protein_index_(index), peptide_mz_(mass){} Peptide_& operator=(Peptide_&&) = default; Peptide_& operator=(const Peptide_&) = default; @@ -109,11 +123,11 @@ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler std::vector generate_peptides_(const std::vector& entries) const; /// Merges presorted Chunks of Peptide-Fragments inplace void fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const; - ///generates sortet vector with all theoretical Fragments for all theoretical Peptides + ///generates sorted vector with all theoretical Fragments for all theoretical Peptides std::vector generate_fragments_() const; std::string digestor_enzyme_; - size_t missed_cleavages_; + size_t missed_cleavages_; ///< number of missed cleavages double peptide_min_mass_; double peptide_max_mass_; size_t peptide_min_length_; @@ -129,7 +143,7 @@ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler StringList modifications_variable_; size_t max_variable_mods_per_peptide_; std::vector all_peptides_{}; - std::vector bucket_frags_mz_{}; ///< Minimum fragment-MZ for each other node + std::vector bucket_frags_mz_{}; ///< Minimum fragment-m/z for each other node std::vector all_fragments_{}; }; diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake old mode 100644 new mode 100755 index 5f8165029a2..ecfc0f6796f --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -38,6 +38,7 @@ PrecursorPurity.h ProtonDistributionModel.h PeptideIndexing.h PercolatorFeatureSetHelper.h +SearchDatabase.h SimpleSearchEngineAlgorithm.h SiriusAdapterAlgorithm.h SiriusMSConverter.h diff --git a/src/openms/include/OpenMS/CONCEPT/Constants.h b/src/openms/include/OpenMS/CONCEPT/Constants.h old mode 100644 new mode 100755 index d5b5ca0be30..26d34f86962 --- a/src/openms/include/OpenMS/CONCEPT/Constants.h +++ b/src/openms/include/OpenMS/CONCEPT/Constants.h @@ -575,6 +575,16 @@ namespace OpenMS String */ inline const std::string MSM_SUM_FORMULA = "Sum_Formula"; + + /** User parameter for the unit type of the m/z tolerance. (Required for SearchDatabase) + String + */ + inline const std::string UNIT_DA = "Da"; + + /** User parameter for the unit type of the m/z tolerance. (Required for SearchDatabase) + String + */ + inline const std::string UNIT_PPM = "ppm"; } //@} diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp old mode 100644 new mode 100755 index 17c0e7937a9..76f88246c34 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -28,7 +28,7 @@ // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // -------------------------------------------------------------------------- -// $Maintainer: $ +// $Maintainer: Chris Bielow $ // $Authors: Max Alcer, Heike Einsfeld $ // -------------------------------------------------------------------------- @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include #include #include @@ -56,8 +58,13 @@ using namespace std; + namespace OpenMS { + +using namespace Constants::UserParam; +using namespace Math; + std::vector SearchDatabase::generate_peptides_(const std::vector& entries) const { vector all_peptides; @@ -92,14 +99,14 @@ std::vector SearchDatabase::generate_peptides_(const s { double mod_seq_mz = mod_pep.getMonoWeight(); - if (!Math::contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; + if (!contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; all_peptides_pvt.emplace_back(mod_pep, i, mod_seq_mz); } double seq_mz = pep.getMonoWeight(); - if (!Math::contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; + if (!contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; all_peptides_pvt.emplace_back(pep, i, seq_mz); @@ -141,7 +148,7 @@ std::vector SearchDatabase::generate_fragments_() con tsg.getSpectrum(b_y_ions, all_peptides_[i].sequence_, 1, 1); for (const auto& frag : b_y_ions) { - if (!Math::contains(frag.getMZ(), fragment_min_mz_, fragment_max_mz_)) continue; + if (!contains(frag.getMZ(), fragment_min_mz_, fragment_max_mz_)) continue; all_frags.emplace_back(i, frag); } chunk_start.emplace_back(all_frags.size()); @@ -159,21 +166,21 @@ SearchDatabase::SearchDatabase(const std::vector& entries ProteaseDB::getInstance()->getAllNames(all_enzymes); vector all_mods; ModificationsDB::getInstance()->getAllSearchModifications(all_mods); - vector tolerance_units{"Da", "ppm"}; + vector tolerance_units{UNIT_DA, UNIT_PPM}; defaults_.setValue("digestor_enzyme", "Trypsin", "Enzyme for digestion"); defaults_.setValidStrings("digestor_enzyme", ListUtils::create(all_enzymes)); - defaults_.setValue("missed_cleavages", 0, "missed cleavages for digestion"); - defaults_.setValue("peptide_min_mass", 500, "minimal peptide mass for database"); - defaults_.setValue("peptide_max_mass", 5000, "maximal peptide mass for database"); - defaults_.setValue("peptide_min_length", 5, "minimal peptide length for database"); - defaults_.setValue("peptide_max_length", 50, "maximal peptide length for database"); - defaults_.setValue("fragment_min_mz", 150, "minimal fragment mz for database"); - defaults_.setValue("fragment_max_mz", 2000, "maximal fragment mz for database"); - defaults_.setValue("precursor_mz_tolerance", 2, "tolerance for precursor-MZ in search"); - defaults_.setValue("fragment_mz_tolerance", 0.05, "tolerance for fragment-MZ in search"); - defaults_.setValue("precursor_mz_tolerance_unit", "Da", "unit of tolerance for precursor-MZ"); + defaults_.setValue("missed_cleavages", 0, "Missed cleavages for digestion"); + defaults_.setValue("peptide_min_mass", 500, "Minimal peptide mass for database"); + defaults_.setValue("peptide_max_mass", 5000, "Maximal peptide mass for database"); + defaults_.setValue("peptide_min_length", 5, "Minimal peptide length for database"); + defaults_.setValue("peptide_max_length", 50, "Maximal peptide length for database"); + defaults_.setValue("fragment_min_mz", 150, "Minimal fragment mz for database"); + defaults_.setValue("fragment_max_mz", 2000, "Maximal fragment mz for database"); + defaults_.setValue("precursor_mz_tolerance", 2.0, "Tolerance for precursor-m/z in search"); + defaults_.setValue("fragment_mz_tolerance", 0.05, "Tolerance for fragment-m/z in search"); + defaults_.setValue("precursor_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for precursor-m/z"); defaults_.setValidStrings("precursor_mz_tolerance_unit", tolerance_units); - defaults_.setValue("fragment_mz_tolerance_unit", "Da", "unit of tolerance for fragment-MZ"); + defaults_.setValue("fragment_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for fragment-m/z"); defaults_.setValidStrings("fragment_mz_tolerance_unit", tolerance_units); defaults_.setValue("modifications_fixed", std::vector{"Carbamidomethyl (C)"}, "Fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)'"); defaults_.setValidStrings("modifications_fixed", ListUtils::create(all_mods)); @@ -195,20 +202,18 @@ SearchDatabase::SearchDatabase(const std::vector& entries { bucket_frags_mz_.emplace_back(all_fragments_[i].fragment_mz_); } - #pragma omp parallel - { - // sorting the fragments of each bucket by the precursor-MZ - #pragma omp for - for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) - { - auto bucket_begin = all_fragments_.begin()+i; - auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); + // sorting the fragments of each bucket by the precursor-m/z + #pragma omp parallel for + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_begin = all_fragments_.begin()+i; + auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); - sort(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, - [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r) -> bool - {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); - } + sort(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, + [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); } + } void SearchDatabase::updateMembers_() @@ -247,14 +252,14 @@ void SearchDatabase::search(MSSpectrum& spectrum, std::vector= int(all_fragments_.size()); - double new_prec_mz_tolerance = prec_unit_cond ? precursor_mz_tolerance_ : Math::ppmToMassAbs(precursor_mz_tolerance_, prec_mz); + double new_prec_mz_tolerance = prec_unit_da ? precursor_mz_tolerance_ : ppmToMassAbs(precursor_mz_tolerance_, prec_mz); auto itr_lower_inner = lower_bound(bucket_start, condition ? all_fragments_.end() : bucket_end, prec_mz - new_prec_mz_tolerance, [&](const SearchDatabase::Fragment_& r, double l) { @@ -278,9 +283,9 @@ void SearchDatabase::search(MSSpectrum& spectrum, std::vector, size_t>>& candidates) const +void SearchDatabase::search(MSExperiment& experiment, std::vector& candidates) const { candidates.clear(); if (experiment.size() == 0) return; diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake old mode 100644 new mode 100755 index 70ecff7564e..f422df769be --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -38,6 +38,7 @@ PrecursorPurity.cpp ProtonDistributionModel.cpp PeptideIndexing.cpp PercolatorFeatureSetHelper.cpp +SearchDatabase.cpp SimpleSearchEngineAlgorithm.cpp SiriusAdapterAlgorithm.cpp SiriusMSConverter.cpp diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake old mode 100644 new mode 100755 index 5680cea3537..9b9fa26cf51 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -577,6 +577,7 @@ set(analysis_executables_list ReactionMonitoringTransition_test RNPxlModificationsGenerator_test SVMWrapper_test + SearchDatabase_test SimpleSearchEngineAlgorithm_test SimplePairFinder_test SimpleSVM_test diff --git a/src/tests/class_tests/openms/source/SearchDatabase_test.cpp b/src/tests/class_tests/openms/source/SearchDatabase_test.cpp old mode 100644 new mode 100755 index af0d630ff9d..78769abfeba --- a/src/tests/class_tests/openms/source/SearchDatabase_test.cpp +++ b/src/tests/class_tests/openms/source/SearchDatabase_test.cpp @@ -28,7 +28,7 @@ // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // -------------------------------------------------------------------------- -// $Maintainer: $ +// $Maintainer: Chris Bielow $ // $Authors: Max Alcer, Heike Einsfeld $ // -------------------------------------------------------------------------- @@ -36,17 +36,14 @@ #include #include -#include +#include #include -#include #include #include -#include -#include #include -#include using namespace OpenMS; +using namespace Constants::UserParam; using namespace std; class SearchDatabase_test : public SearchDatabase @@ -54,52 +51,28 @@ class SearchDatabase_test : public SearchDatabase public: using SearchDatabase::SearchDatabase; - - void printAllFragments() - { - - int counter = 0; - - for (auto j : all_fragments_) - { - if(counter == bucketsize_) - { - cout << "\n"; - counter = 0; - } - - cout << j.fragment_mz_ << " " << all_peptides_[j.peptide_index_].peptide_mz_ << "\n"; - - counter++; - } - - } std::vector getAllFragments(){return all_fragments_;} - size_t getBucketsize(){return bucketsize_;} - - string getDigestorEnzyme(){return digestor_enzyme_;} - - size_t getMissedCleavages(){return missed_cleavages_;} - - size_t getpeptide_minlen(){return peptide_min_length_;} - size_t getpeptide_maxlen(){return peptide_max_length_;} - double getpeptide_minMass(){return peptide_min_mass_;} - double getpeptide_maxMass(){return peptide_max_mass_;} - double getfrag_minMZ(){return fragment_min_mz_;} - double getfrag_maxMZ(){return fragment_max_mz_;} - size_t getpeptideSize(){return all_peptides_.size();} - vector getpeptide(){return all_peptides_;} - - double getFragMZ(size_t i) + bool isSortedBucketFragsMZ() { - return all_fragments_[i].fragment_mz_; + return is_sorted(bucket_frags_mz_.begin(), bucket_frags_mz_.end()); } - double getPrecMZ(size_t i) + bool isSortedAllFragments() { - return all_peptides_[all_fragments_[i].peptide_index_].peptide_mz_; + bool test_sorted = true; + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_begin = all_fragments_.begin()+i; + auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); + + test_sorted &= is_sorted(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, + [&](const Fragment_& l, const Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } + return test_sorted; } }; @@ -115,45 +88,15 @@ START_SECTION(SearchDatabase(const std::vector& entries)) {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; SearchDatabase_test sdb(entries); + START_SECTION(test number of fragments) + TEST_EQUAL(187, sdb.getAllFragments().size()) + END_SECTION - vector all_peptides; - - ProteaseDigestion digestor; - - digestor.setEnzyme(sdb.getDigestorEnzyme()); - digestor.setMissedCleavages(sdb.getMissedCleavages()); - - for (auto i : entries) - { - vector peptides; - - digestor.digest(AASequence::fromString(i.sequence), peptides, sdb.getpeptide_minlen(), sdb.getpeptide_maxlen()); - for (const auto& pep : peptides) - { - if (pep.toString().find('X') != string::npos) continue; - double seq_mz = pep.getMonoWeight(); - if (!Math::contains(seq_mz, sdb.getpeptide_minMass(), sdb.getpeptide_maxMass())) continue; - all_peptides.emplace_back(pep); - - } - } + START_SECTION(test sortation) + TEST_TRUE(sdb.isSortedBucketFragsMZ()) - TEST_EQUAL(all_peptides.size(), sdb.getpeptideSize()) - TheoreticalSpectrumGenerator tsg; - PeakSpectrum b_y_ions; - int count_all_frags=0; - - for (size_t i = 0; i < all_peptides.size(); i++) - { - tsg.getSpectrum(b_y_ions, all_peptides[i], 1, 1); - for (const auto& frag : b_y_ions) - { - if (!Math::contains(frag.getMZ(), sdb.getfrag_minMZ(), sdb.getfrag_maxMZ())) continue; - count_all_frags++; - } - b_y_ions.clear(true); - } - TEST_EQUAL(sdb.getAllFragments().size(), count_all_frags) + TEST_TRUE(sdb.isSortedAllFragments()) + END_SECTION END_SECTION @@ -174,11 +117,11 @@ START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidat std::vector candidates; - START_SECTION(Searching 3 Fragments it should find) + START_SECTION(Searching 3 Fragments it should find (with Da and ppm)) prec.setCharge(1); - prec.setMZ(1280.6); + prec.setMZ(1281.6); spec.setPrecursors({prec}); @@ -190,13 +133,33 @@ START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidat TEST_EQUAL(candidates.size(), 1) + auto params = sdb.getParameters(); + + params.setValue("fragment_mz_tolerance_unit", UNIT_PPM); + params.setValue("fragment_mz_tolerance", 5.f); + params.setValue("precursor_mz_tolerance_unit", UNIT_PPM); + params.setValue("precursor_mz_tolerance", 50.f); + + sdb.setParameters(params); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + END_SECTION START_SECTION(Searching Fragment it should not find because of Fragment Mass) - spec.clear(true); + auto params = sdb.getParameters(); - spec.setPrecursors({prec}); + params.setValue("fragment_mz_tolerance_unit", UNIT_DA); + params.setValue("fragment_mz_tolerance", 0.05); + params.setValue("precursor_mz_tolerance_unit", UNIT_DA); + params.setValue("precursor_mz_tolerance", 2.0); + + sdb.setParameters(params); + + spec.clear(false); spec.push_back({1040, 100}); @@ -224,11 +187,7 @@ START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidat START_SECTION(Searching Fragment it should not find because its smaller then all Fragments in Database) - spec.clear(true); - - prec.setMZ(1500); - - spec.setPrecursors({prec}); + spec.clear(false); spec.push_back({100, 100}); @@ -240,11 +199,7 @@ START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidat START_SECTION(Searching Fragment it should not find because its bigger then all Fragments in Database) - spec.clear(true); - - prec.setMZ(1500); - - spec.setPrecursors({prec}); + spec.clear(false); spec.push_back({2000, 100}); @@ -258,7 +213,7 @@ START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidat spec.clear(true); - prec.setMZ(1280.6); + prec.setMZ(1281.6); spec.setPrecursors({prec}); @@ -280,11 +235,9 @@ START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidat END_SECTION - // sdb.printAllFragments(); - END_SECTION -START_SECTION(void search(MSExperiment& experiment, std::vector, size_t>>& candidates)) +START_SECTION(void search(MSExperiment& experiment, std::vector& candidates)) std::vector entries{ {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, @@ -329,7 +282,7 @@ START_SECTION(void search(MSExperiment& experiment, std::vector, size_t>> candidates; + std::vector candidates; sdb.search(exp, candidates); From 6c1077a435b3955dc4e7162b76f9817d593e8ea3 Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Fri, 5 May 2023 14:41:06 +0200 Subject: [PATCH 04/19] Made Destructor not virtual --- src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h index 4bcb3b4da56..85f60d71ab3 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -74,7 +74,7 @@ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler CandidatesWithIndex(const std::vector& cand, size_t spectrum_i): candidates(cand), spectrum_index(spectrum_i){} }; SearchDatabase() = delete; - virtual ~SearchDatabase() = default; + ~SearchDatabase() = default; /** @brief Builds up the Search Datastructure * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on From 2b73ddb295e18d3c479dd36b179f9262363ead86 Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:08:18 +0200 Subject: [PATCH 05/19] small doc change Co-authored-by: Timo Sachsenberg --- src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h index 85f60d71ab3..4e570b39311 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -49,7 +49,7 @@ namespace OpenMS /** @brief Creating a two level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting * them in the outer tree by fragment-m/z and the inner tree by precursor-m/z. Search function to find candidates to experimental MS2 sepctra - * (implimented based on https://lazear.github.io/sage/) + * (implementation inspired by https://lazear.github.io/sage/) * @ingroup ID */ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler From 85265748e0d8ad718d581f2f4c716a3fc117caac Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:09:58 +0200 Subject: [PATCH 06/19] removed destr. and constr. deklaration (not needed) Co-authored-by: Timo Sachsenberg --- src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h index 4e570b39311..bddb3fc7589 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -73,8 +73,6 @@ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler size_t spectrum_index; ///< Index to the MSSpectrum in the MSExperiment CandidatesWithIndex(const std::vector& cand, size_t spectrum_i): candidates(cand), spectrum_index(spectrum_i){} }; - SearchDatabase() = delete; - ~SearchDatabase() = default; /** @brief Builds up the Search Datastructure * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on From c30c78fc1ced9020468af792a095f3f1771f824b Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:10:37 +0200 Subject: [PATCH 07/19] removed frag constr. Co-authored-by: Timo Sachsenberg --- src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h index bddb3fc7589..c3328e532e3 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -100,7 +100,6 @@ class OPENMS_DLLAPI SearchDatabase : public DefaultParamHandler { size_t peptide_index_; double fragment_mz_; - Fragment_() = delete; Fragment_(size_t prec, const Peak1D& frag):peptide_index_(prec), fragment_mz_(frag.getMZ()){} }; From d6e17f3d81e3f7013d188535844d635dfedfbe50 Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:11:45 +0200 Subject: [PATCH 08/19] .empty() instead of == 0 Co-authored-by: Timo Sachsenberg --- src/openms/source/ANALYSIS/ID/SearchDatabase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp index 76f88246c34..e72afe1fc87 100755 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -238,7 +238,7 @@ void SearchDatabase::updateMembers_() void SearchDatabase::search(MSSpectrum& spectrum, std::vector& candidates) const { candidates.clear(); - if (spectrum.size() == 0) return; + if (spectrum.empty()) return; unordered_set index_hash; // saving every candidate only once From d8574dc80a6207def61e231f04c122c600685f93 Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:12:38 +0200 Subject: [PATCH 09/19] .empty() instead of == 0 Co-authored-by: Timo Sachsenberg --- src/openms/source/ANALYSIS/ID/SearchDatabase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp index e72afe1fc87..5b4071d39d3 100755 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -315,7 +315,7 @@ void SearchDatabase::search(MSExperiment& experiment, std::vector temp_cand; From cc0f6268029c488dd8fd26e631ab5fe749736d47 Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:14:21 +0200 Subject: [PATCH 10/19] coding convension change Co-authored-by: Julianus Pfeuffer --- src/openms/source/ANALYSIS/ID/SearchDatabase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp index 5b4071d39d3..ccb2242cc95 100755 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -207,7 +207,7 @@ SearchDatabase::SearchDatabase(const std::vector& entries for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) { auto bucket_begin = all_fragments_.begin()+i; - auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); + auto condition = distance(all_fragments_.begin(), bucket_begin + bucketsize_) >= int(all_fragments_.size()); sort(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r) -> bool From 08538c744a4859e6463cd880597540e720aff9ab Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:14:33 +0200 Subject: [PATCH 11/19] coding convension change Co-authored-by: Julianus Pfeuffer --- src/openms/source/ANALYSIS/ID/SearchDatabase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp index ccb2242cc95..6a6a9d898bc 100755 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -206,7 +206,7 @@ SearchDatabase::SearchDatabase(const std::vector& entries #pragma omp parallel for for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) { - auto bucket_begin = all_fragments_.begin()+i; + auto bucket_begin = all_fragments_.begin() + i; auto condition = distance(all_fragments_.begin(), bucket_begin + bucketsize_) >= int(all_fragments_.size()); sort(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, From 1f618f94ded3c80e96c06e77822afdab11bb0f12 Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:14:43 +0200 Subject: [PATCH 12/19] coding convension change Co-authored-by: Julianus Pfeuffer --- src/openms/source/ANALYSIS/ID/SearchDatabase.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp index 6a6a9d898bc..78c5ed00d35 100755 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -213,7 +213,6 @@ SearchDatabase::SearchDatabase(const std::vector& entries [&](const SearchDatabase::Fragment_& l, const SearchDatabase::Fragment_& r) -> bool {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); } - } void SearchDatabase::updateMembers_() From b0ecb9bd911f32055d3fb242aba9f46f72681fbf Mon Sep 17 00:00:00 2001 From: Maxalcer <128383119+Maxalcer@users.noreply.github.com> Date: Tue, 9 May 2023 10:16:16 +0200 Subject: [PATCH 13/19] ger -> eng Co-authored-by: Julianus Pfeuffer --- src/openms/source/ANALYSIS/ID/SearchDatabase.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp index 78c5ed00d35..e6a05683645 100755 --- a/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp +++ b/src/openms/source/ANALYSIS/ID/SearchDatabase.cpp @@ -328,7 +328,7 @@ void SearchDatabase::search(MSExperiment& experiment, std::vector Date: Tue, 9 May 2023 10:16:32 +0200 Subject: [PATCH 14/19] doc change Co-authored-by: Julianus Pfeuffer --- src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h index c3328e532e3..3a0be9a069f 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/SearchDatabase.h @@ -47,7 +47,7 @@ namespace OpenMS { //class MSExperiment; - /** @brief Creating a two level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting + /** @brief Creating a two-level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting * them in the outer tree by fragment-m/z and the inner tree by precursor-m/z. Search function to find candidates to experimental MS2 sepctra * (implementation inspired by https://lazear.github.io/sage/) * @ingroup ID From 15cf7cfc7564ae1d414b257c1c2911032d208d74 Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Thu, 11 May 2023 11:35:25 +0200 Subject: [PATCH 15/19] Added changes based on code review, added indices to FASTA entry instead of AASequence, changed name to FragmentIndex --- .../OpenMS/ANALYSIS/ID/FragmentIndex.h | 146 +++++++ .../include/OpenMS/ANALYSIS/ID/sources.cmake | 2 +- .../source/ANALYSIS/ID/FragmentIndex.cpp | 360 ++++++++++++++++++ src/openms/source/ANALYSIS/ID/sources.cmake | 2 +- .../class_tests/openms/executables.cmake | 3 +- .../openms/source/FragmentIndex_test.cpp | 299 +++++++++++++++ 6 files changed, 809 insertions(+), 3 deletions(-) create mode 100755 src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h create mode 100755 src/openms/source/ANALYSIS/ID/FragmentIndex.cpp create mode 100755 src/tests/class_tests/openms/source/FragmentIndex_test.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h new file mode 100755 index 00000000000..abfcf2ffb60 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h @@ -0,0 +1,146 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace OpenMS +{ + //class MSExperiment; + + /** @brief Creating a two-level tree from a given FASTAFile by generating the theoretical peptides and their b-y-ions and sorting + * them in the outer tree by fragment-m/z and the inner tree by precursor-m/z. Search function to find candidates to experimental MS2 sepctra + * (implementation inspired by https://lazear.github.io/sage/) + * @ingroup ID + */ +class OPENMS_DLLAPI FragmentIndex : public DefaultParamHandler +{ + public: + + /** @brief Storing found candiates from search + */ + struct Candidate + { + size_t peptide_start; ///< Start position of peptide in protein + size_t peptide_end; ///< End position of peptide in protein + size_t protein_index; ///< Index to Entry of std::vector + }; + + /** @brief Storing vector of found candiates from search with the Index of the MSSpectrum in the MSExperiment + */ + struct CandidatesWithIndex + { + std::vector candidates; ///< Vector of found candidates + size_t spectrum_index; ///< Index to the MSSpectrum in the MSExperiment + CandidatesWithIndex(std::vector&& cand, size_t spectrum_i): candidates(std::move(cand)), spectrum_index(spectrum_i){} + }; + + /** @brief Builds up the Search Datastructure + * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on + */ + FragmentIndex(const std::vector& entries); + + /** @brief Searches Peaks of every MSSpectrum of the MSExperiment in the Database + * @param experiment Input MSExperiment containing of MS2 Spectra (MS1 Spectra has an empty output) + * @param candidates Output vector of found candidates with the index of MSSpectrum in experiment + */ + void search(MSExperiment& experiment, std::vector& candidates) const; + + /** @brief Searches Peaks of the MSSpectrum in the Database + * @param spectrum Input MS2 Spectrum (MS1 Spectra has an empty output) + * @param candidates Output vector of found candidatest + */ + void search(MSSpectrum& spectrum, std::vector& candidates) const; + + protected: + + void updateMembers_() override; + + //Saving b_y_ion after creating theoretical spectrum with index to precursor-peptide + struct Fragment_ + { + size_t peptide_index_; + double fragment_mz_; + Fragment_(size_t prec, const Peak1D& frag):peptide_index_(prec), fragment_mz_(frag.getMZ()){} + }; + + //Saving first and last index of precursor in FASTA-Entry, index to protein and the MonoWeight after digesting + struct Peptide_ + { + size_t peptide_begin_; + size_t peptide_end_; + size_t protein_index_; + double peptide_mz_; + }; + + /// in-silico digest protein database + std::vector generate_peptides_(const std::vector& entries) const; + /// Merges presorted Chunks of Peptide-Fragments inplace + void fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const; + ///generates sorted vector with all theoretical Fragments for all theoretical Peptides + std::vector generate_fragments_(const std::vector& entries) const; + + std::string digestor_enzyme_; + size_t missed_cleavages_; ///< number of missed cleavages + double peptide_min_mass_; + double peptide_max_mass_; + size_t peptide_min_length_; + size_t peptide_max_length_; + double fragment_min_mz_; + double fragment_max_mz_; + size_t fragment_min_charge_; + size_t fragment_max_charge_; + size_t bucketsize_; ///< number of fragments per outer node + double precursor_mz_tolerance_; + std::string precursor_mz_tolerance_unit_; + double fragment_mz_tolerance_; + std::string fragment_mz_tolerance_unit_; + StringList modifications_fixed_; + StringList modifications_variable_; + size_t max_variable_mods_per_peptide_; + size_t max_missed_peaks_; + std::vector all_peptides_; + std::vector bucket_frags_mz_; ///< Minimum fragment-m/z for each other node + std::vector all_fragments_; +}; + +} \ No newline at end of file diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake index ecfc0f6796f..cdb62e152c7 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -21,6 +21,7 @@ ConsensusMapMergerAlgorithm.h FalseDiscoveryRate.h FIAMSDataProcessor.h FIAMSScheduler.h +FragmentIndex.h HiddenMarkovModel.h IDBoostGraph.h IDDecoyProbability.h @@ -38,7 +39,6 @@ PrecursorPurity.h ProtonDistributionModel.h PeptideIndexing.h PercolatorFeatureSetHelper.h -SearchDatabase.h SimpleSearchEngineAlgorithm.h SiriusAdapterAlgorithm.h SiriusMSConverter.h diff --git a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp new file mode 100755 index 00000000000..2f3b2d916a5 --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp @@ -0,0 +1,360 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +using namespace std; + + +namespace OpenMS +{ + +using namespace Constants::UserParam; +using namespace Math; + +std::vector FragmentIndex::generate_peptides_(const std::vector& entries) const +{ + vector all_peptides; + + ModifiedPeptideGenerator::MapToResidueType fixed_modifications = ModifiedPeptideGenerator::getModifications(modifications_fixed_); + ModifiedPeptideGenerator::MapToResidueType variable_modifications = ModifiedPeptideGenerator::getModifications(modifications_variable_); + + size_t skipped_peptides = 0; + size_t skipped_AAs = 0; + + #pragma omp parallel + { + ProteaseDigestion digestor; + digestor.setEnzyme(digestor_enzyme_); + digestor.setMissedCleavages(missed_cleavages_); + vector all_peptides_pvt; + #pragma omp for nowait + for (size_t i = 0; i < entries.size(); ++i) + { + vector> peptides; + digestor.digestUnmodified(StringView(entries[i].sequence), peptides, peptide_min_length_, peptide_max_length_); + + for (const auto& pep : peptides) + { + if (entries[i].sequence.substr(pep.first, pep.second).find('X') != string::npos) // filtering peptide with unknown AA, can't calculate MonoWeight + { + ++skipped_peptides; + skipped_AAs += pep.second - pep.first; + continue; + } + + vector modified_peptides; + + AASequence modified_pep = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)); + + ModifiedPeptideGenerator::applyFixedModifications(fixed_modifications, modified_pep); + ModifiedPeptideGenerator::applyVariableModifications(variable_modifications, modified_pep, max_variable_mods_per_peptide_, modified_peptides); + + for (const auto & mod_pep : modified_peptides) + { + double mod_seq_mz = mod_pep.getMonoWeight(); + + if (!contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range + + all_peptides_pvt.push_back({pep.first, pep.second, i, mod_seq_mz}); + } + + double seq_mz = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)).getMonoWeight(); + + if (!contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range + + all_peptides_pvt.push_back({pep.first, pep.second, i, seq_mz}); + + } + } + #pragma omp critical + all_peptides.insert(all_peptides.end(), all_peptides_pvt.begin(), all_peptides_pvt.end()); + } + if (skipped_peptides > 0) OPENMS_LOG_WARN << skipped_peptides << " peptides with total length of " << skipped_AAs << " skipped due to unkown AA \n"; + return all_peptides; +} + +void FragmentIndex::fragment_merge_(int first, int last, const std::vector& chunks, std::vector& input) const +{ + if (last - first > 1) + { + int mid = first + (last-first) / 2; + #pragma omp parallel sections + { + #pragma omp section + FragmentIndex::fragment_merge_(first, mid, chunks, input); + #pragma omp section + FragmentIndex::fragment_merge_(mid, last, chunks, input); + } + std::inplace_merge(input.begin() + chunks[first], input.begin() + chunks[mid], input.begin() + chunks[last], + [&](const FragmentIndex::Fragment_& l, const FragmentIndex::Fragment_& r)-> bool + { + return (tie(l.fragment_mz_, all_peptides_[l.peptide_index_].peptide_mz_) < + tie(r.fragment_mz_, all_peptides_[r.peptide_index_].peptide_mz_)); + }); + } +} + +std::vector FragmentIndex::generate_fragments_(const std::vector& entries) const +{ + TheoreticalSpectrumGenerator tsg; + PeakSpectrum b_y_ions; + std::vector all_frags; + std::vector chunk_start = {0}; //for fragment_merge need start and end of presorted chunks + + for (size_t i = 0; i < all_peptides_.size(); ++i) + { + tsg.getSpectrum(b_y_ions, AASequence::fromString(entries[all_peptides_[i].protein_index_].sequence.substr(all_peptides_[i].peptide_begin_, all_peptides_[i].peptide_end_)), + fragment_min_charge_, fragment_max_charge_); + for (const auto& frag : b_y_ions) + { + if (!contains(frag.getMZ(), fragment_min_mz_, fragment_max_mz_)) continue; + all_frags.emplace_back(i, frag); + } + chunk_start.emplace_back(all_frags.size()); + b_y_ions.clear(true); + } + + FragmentIndex::fragment_merge_(0, chunk_start.size()-1, chunk_start, all_frags); + + return all_frags; +} + +FragmentIndex::FragmentIndex(const std::vector& entries) : DefaultParamHandler("FragmentIndex") +{ + vector all_enzymes; + ProteaseDB::getInstance()->getAllNames(all_enzymes); + vector all_mods; + ModificationsDB::getInstance()->getAllSearchModifications(all_mods); + vector tolerance_units{UNIT_DA, UNIT_PPM}; + defaults_.setValue("digestor_enzyme", "Trypsin", "Enzyme for digestion"); + defaults_.setValidStrings("digestor_enzyme", ListUtils::create(all_enzymes)); + defaults_.setValue("missed_cleavages", 0, "Missed cleavages for digestion"); + defaults_.setValue("peptide_min_mass", 500, "Minimal peptide mass for database"); + defaults_.setValue("peptide_max_mass", 5000, "Maximal peptide mass for database"); + defaults_.setValue("peptide_min_length", 5, "Minimal peptide length for database"); + defaults_.setValue("peptide_max_length", 50, "Maximal peptide length for database"); + defaults_.setValue("fragment_min_mz", 150, "Minimal fragment mz for database"); + defaults_.setValue("fragment_max_mz", 2000, "Maximal fragment mz for database"); + defaults_.setValue("fragment_min_charge", 1, "Minimal fragment charge for generation of theorethical Spectrum"); + defaults_.setValue("fragment_max_charge", 1, "Maximal fragment charge for generation of theorethical Spectrum"); + defaults_.setValue("precursor_mz_tolerance", 2.0, "Tolerance for precursor-m/z in search"); + defaults_.setValue("fragment_mz_tolerance", 0.05, "Tolerance for fragment-m/z in search"); + defaults_.setValue("precursor_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for precursor-m/z"); + defaults_.setValidStrings("precursor_mz_tolerance_unit", tolerance_units); + defaults_.setValue("fragment_mz_tolerance_unit", UNIT_DA, "Unit of tolerance for fragment-m/z"); + defaults_.setValidStrings("fragment_mz_tolerance_unit", tolerance_units); + defaults_.setValue("max_missed_peaks", 5, "If this number of the highest peaks in a spectrum is not found, the spectrum gets skipped"); + defaults_.setValue("modifications_fixed", std::vector{"Carbamidomethyl (C)"}, "Fixed modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Carbamidomethyl (C)'"); + defaults_.setValidStrings("modifications_fixed", ListUtils::create(all_mods)); + defaults_.setValue("modifications_variable", std::vector{"Oxidation (M)"}, "Variable modifications, specified using UniMod (www.unimod.org) terms, e.g. 'Oxidation (M)'"); + defaults_.setValidStrings("modifications_variable", ListUtils::create(all_mods)); + defaults_.setValue("max_variable_mods_per_peptide", 2, "Maximum number of residues carrying a variable modification per candidate peptide"); + + defaultsToParam_(); + + if (entries.empty()) return; + + all_peptides_ = generate_peptides_(entries); + + all_fragments_ = generate_fragments_(entries); + + bucketsize_ = size_t(sqrt(all_fragments_.size())); //calculating bucketsize for balanced tree (performance) + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + bucket_frags_mz_.emplace_back(all_fragments_[i].fragment_mz_); //store lower m/z border of buckets + } + // sorting the fragments of each bucket by the precursor-m/z + #pragma omp parallel for + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_start = all_fragments_.begin() + i; + auto bucket_end = bucket_start + min(bucketsize_, size_t(all_fragments_.end() - bucket_start)); // Last bucket may be smaller then bucketsize -> end of bucket is all_fragments_.end() + + sort(bucket_start, bucket_end, + [&](const FragmentIndex::Fragment_& l, const FragmentIndex::Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } +} + +void FragmentIndex::updateMembers_() +{ + digestor_enzyme_ = param_.getValue("digestor_enzyme").toString(); + missed_cleavages_ = param_.getValue("missed_cleavages"); + peptide_min_mass_ = param_.getValue("peptide_min_mass"); + peptide_max_mass_ = param_.getValue("peptide_max_mass"); + peptide_min_length_ = param_.getValue("peptide_min_length"); + peptide_max_length_ = param_.getValue("peptide_max_length"); + fragment_min_mz_ = param_.getValue("fragment_min_mz"); + fragment_max_mz_ = param_.getValue("fragment_max_mz"); + fragment_min_charge_ = param_.getValue("fragment_min_charge"); + fragment_max_charge_ = param_.getValue("fragment_max_charge"); + precursor_mz_tolerance_ = param_.getValue("precursor_mz_tolerance"); + fragment_mz_tolerance_ = param_.getValue("fragment_mz_tolerance"); + precursor_mz_tolerance_unit_ = param_.getValue("precursor_mz_tolerance_unit").toString(); + fragment_mz_tolerance_unit_ = param_.getValue("fragment_mz_tolerance_unit").toString(); + max_missed_peaks_ = param_.getValue("max_missed_peaks"); + modifications_fixed_ = ListUtils::toStringList(param_.getValue("modifications_fixed")); + modifications_variable_ = ListUtils::toStringList(param_.getValue("modifications_variable")); + max_variable_mods_per_peptide_ = param_.getValue("max_variable_mods_per_peptide"); +} + +void FragmentIndex::search(MSSpectrum& spectrum, std::vector& candidates) const +{ + candidates.clear(); + if (spectrum.empty() || (spectrum.getMSLevel() != 2)) return; + + unordered_set index_hash; // saving every candidate only once + + const std::vector& precursors = spectrum.getPrecursors(); + + if (precursors.size() != 1) + { + OPENMS_LOG_WARN << "Number of precursors does not match MS level \n"; + return; + } + + double prec_mz = precursors[0].getUnchargedMass(); + + spectrum.sortByIntensity(true); + size_t count_found = 0; + size_t count_rounds = 0; + + auto prec_unit_da = precursor_mz_tolerance_unit_ == UNIT_DA; + auto frag_unit_da = fragment_mz_tolerance_unit_ == UNIT_DA; + + for (const auto& peak : spectrum) + { + if (count_found == 0 && count_rounds == max_missed_peaks_) return; // if top x peaks with the highest intensity don't match, skip whole spectrum + + double new_frag_mz_tolerance = frag_unit_da ? fragment_mz_tolerance_ : ppmToMassAbs(fragment_mz_tolerance_, peak.getMZ()); + + auto itr_lower = lower_bound(bucket_frags_mz_.begin(), bucket_frags_mz_.end(), peak.getMZ() - new_frag_mz_tolerance); + + size_t index_lower = distance(bucket_frags_mz_.begin(), itr_lower); + + if (index_lower != 0) index_lower--; // lower bound returns one too high (because searching the minimum of each bucket) + + while (bucket_frags_mz_[index_lower] <= (peak.getMZ() + new_frag_mz_tolerance) && index_lower < bucket_frags_mz_.size()) + { + auto bucket_start = all_fragments_.begin() + (index_lower * bucketsize_); + auto bucket_end = bucket_start + min(bucketsize_, size_t(all_fragments_.end() - bucket_start)); // Last bucket may be smaller then bucketsize -> end of bucket is all_fragments_.end() + + double new_prec_mz_tolerance = prec_unit_da ? precursor_mz_tolerance_ : ppmToMassAbs(precursor_mz_tolerance_, prec_mz); + + auto itr_lower_inner = lower_bound(bucket_start, bucket_end, prec_mz - new_prec_mz_tolerance, + [&](const FragmentIndex::Fragment_& r, double l) { + return (all_peptides_[r.peptide_index_].peptide_mz_ < l); + }); + + while (all_peptides_[(*itr_lower_inner).peptide_index_].peptide_mz_ <= (prec_mz + new_prec_mz_tolerance) && itr_lower_inner != bucket_end && itr_lower_inner != all_fragments_.end()) + { + double theoretical_peak_tolerance = frag_unit_da ? fragment_mz_tolerance_ : ppmToMassAbs(fragment_mz_tolerance_, (*itr_lower_inner).fragment_mz_); + // filter by matching fragment-m/z + if (!contains(peak.getMZ(), (*itr_lower_inner).fragment_mz_ - theoretical_peak_tolerance, (*itr_lower_inner).fragment_mz_ + theoretical_peak_tolerance)) + { + ++itr_lower_inner; + continue; + } + index_hash.insert((*itr_lower_inner).peptide_index_); + ++itr_lower_inner; + count_found++; + } + index_lower++; + } + count_rounds++; + } + + for (size_t j : index_hash) + { + candidates.push_back({all_peptides_[j].peptide_begin_, all_peptides_[j].peptide_end_, all_peptides_[j].protein_index_}); + } +} + +void FragmentIndex::search(MSExperiment& experiment, std::vector& candidates) const +{ + candidates.clear(); + if (experiment.size() == 0) return; + + #pragma omp parallel + { + #pragma omp for + for (size_t i = 0; i < experiment.size(); ++i) + { + if (experiment[i].empty()) continue; + + vector temp_cand; + + FragmentIndex::search(experiment[i], temp_cand); + + #pragma omp critical + { + candidates.emplace_back(move(temp_cand), i); + } + } + } +} + +} // end namespace OpenMS + + + + + diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index f422df769be..6a71fa12258 100755 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -21,6 +21,7 @@ ConsensusMapMergerAlgorithm.cpp FalseDiscoveryRate.cpp FIAMSDataProcessor.cpp FIAMSScheduler.cpp +FragmentIndex.cpp HiddenMarkovModel.cpp IDBoostGraph.cpp IDConflictResolverAlgorithm.cpp @@ -38,7 +39,6 @@ PrecursorPurity.cpp ProtonDistributionModel.cpp PeptideIndexing.cpp PercolatorFeatureSetHelper.cpp -SearchDatabase.cpp SimpleSearchEngineAlgorithm.cpp SiriusAdapterAlgorithm.cpp SiriusMSConverter.cpp diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index 9b9fa26cf51..6c115908b77 100755 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -501,6 +501,7 @@ set(analysis_executables_list FIAMSScheduler_test FLASHDeconvAlgorithm_test FLASHDeconvHelperStructs_test + FragmentIndex_test HiddenMarkovModel_test IDBoostGraph_test IDMapper_test @@ -577,7 +578,7 @@ set(analysis_executables_list ReactionMonitoringTransition_test RNPxlModificationsGenerator_test SVMWrapper_test - SearchDatabase_test + SearchDatabase_speed_test SimpleSearchEngineAlgorithm_test SimplePairFinder_test SimpleSVM_test diff --git a/src/tests/class_tests/openms/source/FragmentIndex_test.cpp b/src/tests/class_tests/openms/source/FragmentIndex_test.cpp new file mode 100755 index 00000000000..d05170d5855 --- /dev/null +++ b/src/tests/class_tests/openms/source/FragmentIndex_test.cpp @@ -0,0 +1,299 @@ +// -------------------------------------------------------------------------- +// OpenMS -- Open-Source Mass Spectrometry +// -------------------------------------------------------------------------- +// Copyright The OpenMS Team -- Eberhard Karls University Tuebingen, +// ETH Zurich, and Freie Universitaet Berlin 2002-2022. +// +// This software is released under a three-clause BSD license: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of any author or any participating institution +// may be used to endorse or promote products derived from this software +// without specific prior written permission. +// For a full list of authors, refer to the file AUTHORS. +// -------------------------------------------------------------------------- +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING +// INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; +// OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +// WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR +// OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +// ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Max Alcer, Heike Einsfeld $ +// -------------------------------------------------------------------------- + +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace OpenMS; +using namespace Constants::UserParam; +using namespace std; + +class FragmentIndex_test : public FragmentIndex +{ + public: + + using FragmentIndex::FragmentIndex; + + std::vector getAllFragments(){return all_fragments_;} + + bool isSortedBucketFragsMZ() + { + return is_sorted(bucket_frags_mz_.begin(), bucket_frags_mz_.end()); + } + + bool isSortedAllFragments() + { + bool test_sorted = true; + + for (size_t i = 0; i < all_fragments_.size(); i += bucketsize_) + { + auto bucket_begin = all_fragments_.begin()+i; + auto condition = distance(all_fragments_.begin(), bucket_begin+bucketsize_) >= int(all_fragments_.size()); + + test_sorted &= is_sorted(bucket_begin, condition ? all_fragments_.end() : bucket_begin+bucketsize_, + [&](const Fragment_& l, const Fragment_& r) -> bool + {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); + } + return test_sorted; + } + +}; + +START_TEST(FragmentIndex, "$Id:$") + + +START_SECTION(FragmentIndex(const std::vector& entries)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + FragmentIndex_test sdb(entries); + START_SECTION(test number of fragments) + TEST_EQUAL(187, sdb.getAllFragments().size()) + END_SECTION + + START_SECTION(test sortation) + TEST_TRUE(sdb.isSortedBucketFragsMZ()) + + TEST_TRUE(sdb.isSortedAllFragments()) + END_SECTION + +END_SECTION + +START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidates)) + + cout << "\n"; + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + FragmentIndex_test sdb(entries); + + MSSpectrum spec; + + Precursor prec{}; + + std::vector candidates; + + START_SECTION(Searching 3 Fragments it should find (with Da and ppm)) + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({605.308, 100}); + spec.push_back({676.345, 100}); + spec.push_back({823.413, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + + auto params = sdb.getParameters(); + + params.setValue("fragment_mz_tolerance_unit", UNIT_PPM); + params.setValue("fragment_mz_tolerance", 5.f); + params.setValue("precursor_mz_tolerance_unit", UNIT_PPM); + params.setValue("precursor_mz_tolerance", 50.f); + + sdb.setParameters(params); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 1) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Fragment Mass) + + auto params = sdb.getParameters(); + + params.setValue("fragment_mz_tolerance_unit", UNIT_DA); + params.setValue("fragment_mz_tolerance", 0.05); + params.setValue("precursor_mz_tolerance_unit", UNIT_DA); + params.setValue("precursor_mz_tolerance", 2.0); + + sdb.setParameters(params); + + spec.clear(false); + + spec.push_back({1040, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because of Precursor Mass) + + spec.clear(true); + + prec.setMZ(1500); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({572.304, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its smaller then all Fragments in Database) + + spec.clear(false); + + spec.push_back({100, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Searching Fragment it should not find because its bigger then all Fragments in Database) + + spec.clear(false); + + spec.push_back({2000, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + START_SECTION(Testing filtering of Spectrum by best Peaks) + + spec.clear(true); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({2000, 80}); + + spec.push_back({1500, 99}); + + spec.push_back({500, 85}); + + spec.push_back({1000, 90}); + + spec.push_back({100, 100}); + + spec.push_back({937.456, 5}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + +END_SECTION + +START_SECTION(void search(MSExperiment& experiment, std::vector& candidates)) + + std::vector entries{ + {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, + {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, + {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; + + FragmentIndex_test sdb(entries); + + MSExperiment exp; + + MSSpectrum spec; + + Precursor prec{}; + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({605.318, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(894.529); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({175.119, 100}); + + exp.addSpectrum(spec); + + spec.clear(true); + + prec.setMZ(1655.89); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({1544.83, 100}); + + exp.addSpectrum(spec); + + std::vector candidates; + + sdb.search(exp, candidates); + + TEST_EQUAL(candidates.size(), 3) + +END_SECTION + +END_TEST \ No newline at end of file From d03757f7c89a66876795cfa0ee1938af198e93fe Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Thu, 11 May 2023 12:11:33 +0200 Subject: [PATCH 16/19] fixed racing condition --- src/openms/source/ANALYSIS/ID/FragmentIndex.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp index 2f3b2d916a5..e74f07be944 100755 --- a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp +++ b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp @@ -94,8 +94,11 @@ std::vector FragmentIndex::generate_peptides_(const std { if (entries[i].sequence.substr(pep.first, pep.second).find('X') != string::npos) // filtering peptide with unknown AA, can't calculate MonoWeight { - ++skipped_peptides; - skipped_AAs += pep.second - pep.first; + #pragma omp critical + { + ++skipped_peptides; + skipped_AAs += (pep.second - pep.first); + } continue; } From 210723f8d706b954c21843db6ce4fef658f3521d Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Thu, 11 May 2023 12:28:04 +0200 Subject: [PATCH 17/19] skipped modification part if possible modifications are empty --- .../source/ANALYSIS/ID/FragmentIndex.cpp | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp index e74f07be944..e614f8fee04 100755 --- a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp +++ b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp @@ -102,22 +102,24 @@ std::vector FragmentIndex::generate_peptides_(const std continue; } - vector modified_peptides; + if(!(modifications_fixed_.empty() && modifications_variable_.empty())) + { + vector modified_peptides; - AASequence modified_pep = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)); + AASequence modified_pep = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)); - ModifiedPeptideGenerator::applyFixedModifications(fixed_modifications, modified_pep); - ModifiedPeptideGenerator::applyVariableModifications(variable_modifications, modified_pep, max_variable_mods_per_peptide_, modified_peptides); + ModifiedPeptideGenerator::applyFixedModifications(fixed_modifications, modified_pep); + ModifiedPeptideGenerator::applyVariableModifications(variable_modifications, modified_pep, max_variable_mods_per_peptide_, modified_peptides); - for (const auto & mod_pep : modified_peptides) - { - double mod_seq_mz = mod_pep.getMonoWeight(); + for (const auto & mod_pep : modified_peptides) + { + double mod_seq_mz = mod_pep.getMonoWeight(); - if (!contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range + if (!contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range - all_peptides_pvt.push_back({pep.first, pep.second, i, mod_seq_mz}); + all_peptides_pvt.push_back({pep.first, pep.second, i, mod_seq_mz}); + } } - double seq_mz = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)).getMonoWeight(); if (!contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range From 0062d4b773111e0870ac9f67d3e939eb6d9aa421 Mon Sep 17 00:00:00 2001 From: Maxalcer Date: Thu, 11 May 2023 15:26:08 +0200 Subject: [PATCH 18/19] moved building the index into seperat function --- .../OpenMS/ANALYSIS/ID/FragmentIndex.h | 12 ++- .../source/ANALYSIS/ID/FragmentIndex.cpp | 35 ++++++--- .../openms/source/FragmentIndex_test.cpp | 76 +++++++++++++++++-- 3 files changed, 105 insertions(+), 18 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h index abfcf2ffb60..75c019438c1 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h @@ -74,10 +74,15 @@ class OPENMS_DLLAPI FragmentIndex : public DefaultParamHandler CandidatesWithIndex(std::vector&& cand, size_t spectrum_i): candidates(std::move(cand)), spectrum_index(spectrum_i){} }; + + /** @brief Sets default parameters + */ + FragmentIndex(); + /** @brief Builds up the Search Datastructure - * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on - */ - FragmentIndex(const std::vector& entries); + * @param entries Input vector of FASTAFile-Entries to base SearchDatastructure on + */ + void build(const std::vector& entries); /** @brief Searches Peaks of every MSSpectrum of the MSExperiment in the Database * @param experiment Input MSExperiment containing of MS2 Spectra (MS1 Spectra has an empty output) @@ -119,6 +124,7 @@ class OPENMS_DLLAPI FragmentIndex : public DefaultParamHandler ///generates sorted vector with all theoretical Fragments for all theoretical Peptides std::vector generate_fragments_(const std::vector& entries) const; + bool is_build_; std::string digestor_enzyme_; size_t missed_cleavages_; ///< number of missed cleavages double peptide_min_mass_; diff --git a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp index e614f8fee04..d1f23982b7f 100755 --- a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp +++ b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp @@ -76,7 +76,6 @@ std::vector FragmentIndex::generate_peptides_(const std ModifiedPeptideGenerator::MapToResidueType variable_modifications = ModifiedPeptideGenerator::getModifications(modifications_variable_); size_t skipped_peptides = 0; - size_t skipped_AAs = 0; #pragma omp parallel { @@ -94,11 +93,9 @@ std::vector FragmentIndex::generate_peptides_(const std { if (entries[i].sequence.substr(pep.first, pep.second).find('X') != string::npos) // filtering peptide with unknown AA, can't calculate MonoWeight { - #pragma omp critical - { - ++skipped_peptides; - skipped_AAs += (pep.second - pep.first); - } + #pragma omp atomic + ++skipped_peptides; + continue; } @@ -131,7 +128,7 @@ std::vector FragmentIndex::generate_peptides_(const std #pragma omp critical all_peptides.insert(all_peptides.end(), all_peptides_pvt.begin(), all_peptides_pvt.end()); } - if (skipped_peptides > 0) OPENMS_LOG_WARN << skipped_peptides << " peptides with total length of " << skipped_AAs << " skipped due to unkown AA \n"; + if (skipped_peptides > 0) OPENMS_LOG_WARN << skipped_peptides << " peptides skipped due to unkown AA \n"; return all_peptides; } @@ -181,12 +178,13 @@ std::vector FragmentIndex::generate_fragments_(const s return all_frags; } -FragmentIndex::FragmentIndex(const std::vector& entries) : DefaultParamHandler("FragmentIndex") +FragmentIndex::FragmentIndex() : DefaultParamHandler("FragmentIndex") { vector all_enzymes; ProteaseDB::getInstance()->getAllNames(all_enzymes); vector all_mods; ModificationsDB::getInstance()->getAllSearchModifications(all_mods); + all_mods.push_back({}); vector tolerance_units{UNIT_DA, UNIT_PPM}; defaults_.setValue("digestor_enzyme", "Trypsin", "Enzyme for digestion"); defaults_.setValidStrings("digestor_enzyme", ListUtils::create(all_enzymes)); @@ -214,6 +212,11 @@ FragmentIndex::FragmentIndex(const std::vector& entries) defaultsToParam_(); + is_build_ = false; +} + +void FragmentIndex::build(const std::vector& entries) +{ if (entries.empty()) return; all_peptides_ = generate_peptides_(entries); @@ -237,6 +240,8 @@ FragmentIndex::FragmentIndex(const std::vector& entries) [&](const FragmentIndex::Fragment_& l, const FragmentIndex::Fragment_& r) -> bool {return (all_peptides_[l.peptide_index_].peptide_mz_ < all_peptides_[r.peptide_index_].peptide_mz_);}); } + + is_build_ = true; } void FragmentIndex::updateMembers_() @@ -262,7 +267,12 @@ void FragmentIndex::updateMembers_() } void FragmentIndex::search(MSSpectrum& spectrum, std::vector& candidates) const -{ +{ + if(!is_build_) + { + OPENMS_LOG_WARN << "FragmentIndex not yet build \n"; + return; + } candidates.clear(); if (spectrum.empty() || (spectrum.getMSLevel() != 2)) return; @@ -335,8 +345,13 @@ void FragmentIndex::search(MSSpectrum& spectrum, std::vector& candidates) const { + if(!is_build_) + { + OPENMS_LOG_WARN << "FragmentIndex not yet build \n"; + return; + } candidates.clear(); - if (experiment.size() == 0) return; + if (experiment.empty()) return; #pragma omp parallel { diff --git a/src/tests/class_tests/openms/source/FragmentIndex_test.cpp b/src/tests/class_tests/openms/source/FragmentIndex_test.cpp index d05170d5855..f13090af831 100755 --- a/src/tests/class_tests/openms/source/FragmentIndex_test.cpp +++ b/src/tests/class_tests/openms/source/FragmentIndex_test.cpp @@ -80,15 +80,17 @@ class FragmentIndex_test : public FragmentIndex START_TEST(FragmentIndex, "$Id:$") -START_SECTION(FragmentIndex(const std::vector& entries)) +START_SECTION(build(const std::vector& entries)) std::vector entries{ {"test1", "test1", "LRLRACGLNFADLMARQGLY"}, {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; - FragmentIndex_test sdb(entries); + FragmentIndex_test sdb; + START_SECTION(test number of fragments) + sdb.build(entries); TEST_EQUAL(187, sdb.getAllFragments().size()) END_SECTION @@ -97,6 +99,48 @@ START_SECTION(FragmentIndex(const std::vector& entries)) TEST_TRUE(sdb.isSortedAllFragments()) END_SECTION + + START_SECTION(test without modifications) + + auto params = sdb.getParameters(); + + params.setValue("modifications_fixed", std::vector{}); + params.setValue("modifications_variable", std::vector{}); + + sdb.setParameters(params); + + sdb.build(entries); + TEST_NOT_EQUAL(187, sdb.getAllFragments().size()); + + END_SECTION + + START_SECTION(test peptide mz bounds) + + auto params = sdb.getParameters(); + + params.setValue("peptide_min_mass", 2000); + params.setValue("peptide_max_mass", 2000); + + sdb.setParameters(params); + + sdb.build(entries); + TEST_EQUAL(0, sdb.getAllFragments().size()); + + END_SECTION + + START_SECTION(test fragment mz bounds) + + auto params = sdb.getParameters(); + + params.setValue("fragment_min_mz", 500); + params.setValue("fragment_max_mz", 500); + + sdb.setParameters(params); + + sdb.build(entries); + TEST_EQUAL(0, sdb.getAllFragments().size()); + + END_SECTION END_SECTION @@ -109,13 +153,33 @@ START_SECTION(void search(MSSpectrum& spectrum, std::vector& candidat {"test2", "test2", "AAASPPLLRCLVLTGFGGYD"}, {"test3", "test3", "KVKLQSRPAAPPAPGPGQLT"}}; - FragmentIndex_test sdb(entries); + FragmentIndex_test sdb; + std::vector candidates; MSSpectrum spec; Precursor prec{}; - std::vector candidates; + START_SECTION(searching without building) + + prec.setCharge(1); + + prec.setMZ(1281.6); + + spec.setPrecursors({prec}); + spec.setMSLevel(2); + + spec.push_back({605.308, 100}); + spec.push_back({676.345, 100}); + spec.push_back({823.413, 100}); + + sdb.search(spec, candidates); + + TEST_EQUAL(candidates.size(), 0) + + END_SECTION + + sdb.build(entries); START_SECTION(Searching 3 Fragments it should find (with Da and ppm)) @@ -247,7 +311,9 @@ START_SECTION(void search(MSExperiment& experiment, std::vector Date: Fri, 12 May 2023 10:50:43 +0200 Subject: [PATCH 19/19] added modification Index --- .../include/OpenMS/ANALYSIS/ID/FragmentIndex.h | 4 ++++ .../source/ANALYSIS/ID/FragmentIndex.cpp | 18 +++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h index 75c019438c1..c606f9278fe 100755 --- a/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/FragmentIndex.h @@ -63,6 +63,8 @@ class OPENMS_DLLAPI FragmentIndex : public DefaultParamHandler size_t peptide_start; ///< Start position of peptide in protein size_t peptide_end; ///< End position of peptide in protein size_t protein_index; ///< Index to Entry of std::vector + bool is_modified_; ///< true if candidate is modified + size_t modification_index_; ///< index to modified version of this peptide }; /** @brief Storing vector of found candiates from search with the Index of the MSSpectrum in the MSExperiment @@ -115,6 +117,8 @@ class OPENMS_DLLAPI FragmentIndex : public DefaultParamHandler size_t peptide_end_; size_t protein_index_; double peptide_mz_; + bool is_modified_; + size_t modification_index_; }; /// in-silico digest protein database diff --git a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp index d1f23982b7f..8068bb01707 100755 --- a/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp +++ b/src/openms/source/ANALYSIS/ID/FragmentIndex.cpp @@ -55,6 +55,7 @@ #include #include +#include #include #include #include @@ -108,20 +109,20 @@ std::vector FragmentIndex::generate_peptides_(const std ModifiedPeptideGenerator::applyFixedModifications(fixed_modifications, modified_pep); ModifiedPeptideGenerator::applyVariableModifications(variable_modifications, modified_pep, max_variable_mods_per_peptide_, modified_peptides); - for (const auto & mod_pep : modified_peptides) + for (size_t j = 0; j < modified_peptides.size(); ++j) { - double mod_seq_mz = mod_pep.getMonoWeight(); + double mod_seq_mz = modified_peptides[j].getMonoWeight(); if (!contains(mod_seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range - all_peptides_pvt.push_back({pep.first, pep.second, i, mod_seq_mz}); + all_peptides_pvt.push_back({pep.first, pep.second, i, mod_seq_mz, true, j}); } } double seq_mz = AASequence::fromString(entries[i].sequence.substr(pep.first, pep.second)).getMonoWeight(); if (!contains(seq_mz, peptide_min_mass_, peptide_max_mass_)) continue; // Skip peptides out of mass range - all_peptides_pvt.push_back({pep.first, pep.second, i, seq_mz}); + all_peptides_pvt.push_back({pep.first, pep.second, i, seq_mz, false, ULONG_MAX}); } } @@ -339,7 +340,14 @@ void FragmentIndex::search(MSSpectrum& spectrum, std::vector