Skip to content

Commit

Permalink
Recover longest orf after fragment elimination (#832)
Browse files Browse the repository at this point in the history
* recover longest orf after fragment elimination

* delete unused variable
  • Loading branch information
LunaJang authored May 21, 2024
1 parent 998c50a commit 5b4c816
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 1 deletion.
6 changes: 5 additions & 1 deletion data/workflow/taxpercontig.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@ if [ -n "${ORF_FILTER}" ]; then
fi

if notExists "${TMP_PATH}/orfs_aln.list"; then
awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln.list"
# shellcheck disable=SC2086
"$MMSEQS" recoverlongestorf "${ORFS_DB}" "${TMP_PATH}/orfs_aln" "${TMP_PATH}/orfs_aln_recovered.list" ${THREADS_PAR} \
|| fail "recoverlongestorf died"
awk '$3 > 1 { print $1 }' "${TMP_PATH}/orfs_aln.index" > "${TMP_PATH}/orfs_aln_remain.list"
cat "${TMP_PATH}/orfs_aln_recovered.list" "${TMP_PATH}/orfs_aln_remain.list" > "${TMP_PATH}/orfs_aln.list"
fi

if notExists "${TMP_PATH}/orfs_filter.dbtype"; then
Expand Down
1 change: 1 addition & 0 deletions src/CommandDeclarations.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ extern int ungappedprefilter(int argc, const char **argv, const Command& command
extern int gappedprefilter(int argc, const char **argv, const Command& command);
extern int unpackdb(int argc, const char **argv, const Command& command);
extern int rbh(int argc, const char **argv, const Command& command);
extern int recoverlongestorf(int argc, const char **argv, const Command& command);
extern int result2flat(int argc, const char **argv, const Command& command);
extern int result2msa(int argc, const char **argv, const Command& command);
extern int result2dnamsa(int argc, const char **argv, const Command& command);
Expand Down
8 changes: 8 additions & 0 deletions src/MMseqsBase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,14 @@ std::vector<Command> baseCommands = {
"Eli Levy Karin <eli.levy.karin@gmail.com> ",
"<i:contigsSequenceDB> <i:extractedOrfsHeadersDB> <o:orfsAlignedToContigDB>",
CITATION_MMSEQS2, {{"",DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, NULL}}},
{"recoverlongestorf", recoverlongestorf, &par.onlythreads, COMMAND_EXPERT,
"Recover longest ORF for taxonomy annotation after elimination",
NULL,
"Sung-eun Jang",
"<i:orfDB> <i:resultDB> <o:tsvFile>",
CITATION_MMSEQS2, {{"orfDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb},
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb},
{"tsvFile", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile}}},
{"reverseseq", reverseseq, &par.reverseseq, COMMAND_SEQUENCE,
"Reverse (without complement) sequences",
NULL,
Expand Down
1 change: 1 addition & 0 deletions src/util/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ set(util_source_files
util/profile2pssm.cpp
util/profile2neff.cpp
util/profile2seq.cpp
util/recoverlongestorf.cpp
util/result2dnamsa.cpp
util/result2flat.cpp
util/result2msa.cpp
Expand Down
148 changes: 148 additions & 0 deletions src/util/recoverlongestorf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include "Parameters.h"
#include "DBReader.h"
#include "DBWriter.h"
#include "Debug.h"
#include "Util.h"
#include "Orf.h"

#include <unordered_map>
#include <unordered_set>

#ifdef OPENMP
#include <omp.h>
#endif

int recoverlongestorf(int argc, const char **argv, const Command &command) {
Parameters &par = Parameters::getInstance();
par.parseParameters(argc, argv, command, true, 0, 0);

DBReader<unsigned int> headerReader(par.hdr1.c_str(), par.hdr1Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
headerReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

DBReader<unsigned int> resultReader(par.db2.c_str(), par.db2Index.c_str(), par.threads, DBReader<unsigned int>::USE_INDEX|DBReader<unsigned int>::USE_DATA);
resultReader.open(DBReader<unsigned int>::LINEAR_ACCCESS);

DBWriter writer(par.db3.c_str(), par.db3Index.c_str(), 1, false, Parameters::DBTYPE_OMIT_FILE);
writer.open();

Debug::Progress progress(resultReader.getSize());
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>> contigToLongest;
#pragma omp parallel
{
unsigned int thread_idx = 0;
#ifdef OPENMP
thread_idx = static_cast<unsigned int>(omp_get_thread_num());

#endif
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>> localContigToLongest;

#pragma omp for schedule(dynamic, 100)
for (size_t id = 0; id < headerReader.getSize(); ++id) {
progress.updateProgress();
unsigned int orfKey = headerReader.getDbKey(id);
char *orfHeader = headerReader.getData(id, thread_idx);
Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader);
unsigned int contigKey = orf.id;
size_t orfLen = std::max(orf.from, orf.to) - std::min(orf.from, orf.to) + 1;
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>>::iterator it = localContigToLongest.find(contigKey);
if (it != localContigToLongest.end()) {
std::pair<unsigned int, size_t> orfKeyToLength = it->second;
if (orfLen > orfKeyToLength.second) {
it->second = std::make_pair(orfKey, orfLen);
}
} else {
localContigToLongest.emplace(contigKey, std::make_pair(orfKey, orfLen));
}
}

#pragma omp critical
{
for (auto &entry : localContigToLongest) {
auto &contigKey = entry.first;
auto &localData = entry.second;
auto it = contigToLongest.find(contigKey);
if (it != contigToLongest.end()) {
if (localData.second > it->second.second) {
it->second = localData;
}
} else {
contigToLongest[contigKey] = localData;
}
}
}
}

progress.reset(resultReader.getSize());
std::unordered_set<unsigned int> acceptedContigs;
std::unordered_set<unsigned int> eliminatedContigs;
#pragma omp parallel
{
int thread_idx = 0;
#ifdef OPENMP
thread_idx = omp_get_thread_num();
#endif
std::string resultBuffer;
resultBuffer.reserve(1024 * 1024);

std::unordered_set<unsigned int> localAcceptedContigs;
std::unordered_set<unsigned int> localEliminatedContigs;
#pragma omp for schedule(dynamic, 5)
for (size_t i = 0; i < resultReader.getSize(); ++i) {
progress.updateProgress();

unsigned int key = resultReader.getDbKey(i);
size_t entryLength = resultReader.getEntryLen(i);
if (entryLength > 1) {
size_t id = headerReader.getId(key);
char *orfHeader = headerReader.getData(id, thread_idx);
Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader);
unsigned int contigKey = orf.id;
localAcceptedContigs.emplace(contigKey);
}

size_t id = headerReader.getId(key);
char *orfHeader = headerReader.getData(id, thread_idx);
Orf::SequenceLocation orf = Orf::parseOrfHeader(orfHeader);
unsigned int contigKey = orf.id;
localEliminatedContigs.emplace(contigKey);
}

#pragma omp critical
{
acceptedContigs.insert(localAcceptedContigs.begin(), localAcceptedContigs.end());
eliminatedContigs.insert(localEliminatedContigs.begin(), localEliminatedContigs.end());
}
}

for (auto it = eliminatedContigs.begin(); it != eliminatedContigs.end(); ) {
if (acceptedContigs.find(*it) != acceptedContigs.end()) {
it = eliminatedContigs.erase(it);
} else {
++it;
}
}

std::string resultBuffer;
resultBuffer.reserve(1024 * 1024);
for (auto contigIt = eliminatedContigs.begin(); contigIt != eliminatedContigs.end(); ++contigIt) {
unsigned int contigKey = *contigIt;
std::unordered_map<unsigned int, std::pair<unsigned int, size_t>>::iterator it = contigToLongest.find(contigKey);
if (it != contigToLongest.end()) {
unsigned int orfKey = it->second.first;
resultBuffer.append(SSTR(orfKey));
resultBuffer.append(1, '\n');
writer.writeData(resultBuffer.c_str(), resultBuffer.length(), orfKey, 0, false, false);
resultBuffer.clear();
} else {
Debug(Debug::ERROR) << "Missing contig " << contigKey << "\n";
EXIT(EXIT_FAILURE);
}
}

writer.close(true);
FileUtil::remove(writer.getIndexFileName());
headerReader.close();
resultReader.close();

return EXIT_SUCCESS;
}
1 change: 1 addition & 0 deletions src/workflow/Taxonomy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ int taxonomy(int argc, const char **argv, const Command& command) {
cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL);
cmd.addVariable("RUNNER", par.runner.c_str());
cmd.addVariable("THREADS_COMP_PAR", par.createParameterString(par.threadsandcompression).c_str());
cmd.addVariable("THREADS_PAR", par.createParameterString(par.onlythreads).c_str());
cmd.addVariable("VERBOSITY", par.createParameterString(par.onlyverbosity).c_str());

if (searchMode & Parameters::SEARCH_MODE_FLAG_QUERY_TRANSLATED && !(searchMode & Parameters::SEARCH_MODE_FLAG_TARGET_TRANSLATED)) {
Expand Down

0 comments on commit 5b4c816

Please sign in to comment.