diff --git a/data/workflow/blastp.sh b/data/workflow/blastp.sh index 5722cfb95..5d587fd44 100755 --- a/data/workflow/blastp.sh +++ b/data/workflow/blastp.sh @@ -5,6 +5,33 @@ fail() { exit 1 } +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +fake_pref() { + QDB="$1" + TDB="$2" + RES="$3" + # create link to data file which contains a list of all targets that should be aligned + ln -s "$(abspath "${TDB}.index")" "${RES}" + # create new index repeatedly pointing to same entry + INDEX_SIZE="$(wc -c < "${TDB}.index")" + awk -v size="$INDEX_SIZE" '{ print $1"\t0\t"size; }' "${QDB}.index" > "${RES}.index" + # create dbtype (7) + awk 'BEGIN { printf("%c%c%c%c",7,0,0,0); exit; }' > "${RES}.dbtype" +} + notExists() { [ ! -f "$1" ] } @@ -27,14 +54,23 @@ ALN_RES_MERGE="$TMP_PATH/aln_0" while [ "$STEP" -lt "$STEPS" ]; do SENS_PARAM=SENSE_${STEP} eval SENS="\$$SENS_PARAM" - # call prefilter module + + # 1. Prefilter hits if notExists "$TMP_PATH/pref_$STEP.dbtype"; then - # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $PREFILTER_PAR -s "$SENS" \ - || fail "Prefilter died" + if [ "$PREFMODE" = "EXHAUSTIVE" ]; then + fake_pref "${INPUT}" "${TARGET}" "$TMP_PATH/pref_$STEP" + elif [ "$PREFMODE" = "UNGAPPED" ]; then + # shellcheck disable=SC2086 + $RUNNER "$MMSEQS" ungappedprefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $UNGAPPEDPREFILTER_PAR \ + || fail "Ungapped prefilter died" + else + # shellcheck disable=SC2086 + $RUNNER "$MMSEQS" prefilter "$INPUT" "$TARGET" "$TMP_PATH/pref_$STEP" $PREFILTER_PAR -s "$SENS" \ + || fail "Prefilter died" + fi fi - # call alignment module + # 2. alignment module if [ "$STEPS" -eq 1 ]; then if notExists "$3.dbtype"; then # shellcheck disable=SC2086 diff --git a/data/workflow/blastpgp.sh b/data/workflow/blastpgp.sh index 95a1d4441..806f680c9 100755 --- a/data/workflow/blastpgp.sh +++ b/data/workflow/blastpgp.sh @@ -5,6 +5,33 @@ fail() { exit 1 } +abspath() { + if [ -d "$1" ]; then + (cd "$1"; pwd) + elif [ -f "$1" ]; then + if [ -z "${1##*/*}" ]; then + echo "$(cd "${1%/*}"; pwd)/${1##*/}" + else + echo "$(pwd)/$1" + fi + elif [ -d "$(dirname "$1")" ]; then + echo "$(cd "$(dirname "$1")"; pwd)/$(basename "$1")" + fi +} + +fake_pref() { + QDB="$1" + TDB="$2" + RES="$3" + # create link to data file which contains a list of all targets that should be aligned + ln -s "$(abspath "${TDB}.index")" "${RES}" + # create new index repeatedly pointing to same entry + INDEX_SIZE="$(wc -c < "${TDB}.index")" + awk -v size="$INDEX_SIZE" '{ print $1"\t0\t"size; }' "${QDB}.index" > "${RES}.index" + # create dbtype (7) + awk 'BEGIN { printf("%c%c%c%c",7,0,0,0); exit; }' > "${RES}.dbtype" +} + notExists() { [ ! -f "$1" ] } @@ -28,15 +55,26 @@ STEP=0 while [ "$STEP" -lt "$NUM_IT" ]; do # call prefilter module if notExists "$TMP_PATH/pref_tmp_${STEP}.done"; then - PARAM="PREFILTER_PAR_$STEP" - eval TMP="\$$PARAM" + if [ "$PREFMODE" = "EXHAUSTIVE" ]; then + TMP="" + PREF="fake_pref" + elif [ "$PREFMODE" = "UNGAPPED" ]; then + PARAM="UNGAPPEDPREFILTER_PAR_$STEP" + eval TMP="\$$PARAM" + PREF="${MMSEQS} ungappedprefilter" + else + PARAM="PREFILTER_PAR_$STEP" + eval TMP="\$$PARAM" + PREF="${MMSEQS} prefilter" + fi + if [ "$STEP" -eq 0 ]; then # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "$QUERYDB" "$2" "$TMP_PATH/pref_$STEP" ${TMP} \ + $RUNNER $PREF "$QUERYDB" "$2" "$TMP_PATH/pref_$STEP" ${TMP} \ || fail "Prefilter died" else # shellcheck disable=SC2086 - $RUNNER "$MMSEQS" prefilter "$QUERYDB" "$2" "$TMP_PATH/pref_tmp_$STEP" ${TMP} \ + $RUNNER $PREF "$QUERYDB" "$2" "$TMP_PATH/pref_tmp_$STEP" ${TMP} \ || fail "Prefilter died" fi touch "$TMP_PATH/pref_tmp_${STEP}.done" diff --git a/src/commons/Parameters.cpp b/src/commons/Parameters.cpp index bab4330f8..fc289a8e0 100644 --- a/src/commons/Parameters.cpp +++ b/src/commons/Parameters.cpp @@ -163,6 +163,7 @@ Parameters::Parameters(): PARAM_NUM_ITERATIONS(PARAM_NUM_ITERATIONS_ID, "--num-iterations", "Search iterations", "Number of iterative profile search iterations", typeid(int), (void *) &numIterations, "^[1-9]{1}[0-9]*$", MMseqsParameter::COMMAND_PROFILE), PARAM_START_SENS(PARAM_START_SENS_ID, "--start-sens", "Start sensitivity", "Start sensitivity", typeid(float), (void *) &startSens, "^[0-9]*(\\.[0-9]+)?$"), PARAM_SENS_STEPS(PARAM_SENS_STEPS_ID, "--sens-steps", "Search steps", "Number of search steps performed from --start-sens to -s", typeid(int), (void *) &sensSteps, "^[1-9]{1}$"), + PARAM_PREF_MODE(PARAM_PREF_MODE_ID,"--prefilter-mode", "Prefilter mode", "prefilter mode: 0: kmer/ungapped 1: ungapped, 2: nofilter",typeid(int), (void *) &prefMode, "^[0-2]{1}$"), PARAM_EXHAUSTIVE_SEARCH(PARAM_EXHAUSTIVE_SEARCH_ID, "--exhaustive-search", "Exhaustive search mode", "For bigger profile DB, run iteratively the search by greedily swapping the search results", typeid(bool), (void *) &exhaustiveSearch, "", MMseqsParameter::COMMAND_PROFILE | MMseqsParameter::COMMAND_EXPERT), PARAM_EXHAUSTIVE_SEARCH_FILTER(PARAM_EXHAUSTIVE_SEARCH_FILTER_ID, "--exhaustive-search-filter", "Filter results during exhaustive search", "Filter result during search: 0: do not filter, 1: filter", typeid(int), (void *) &exhaustiveFilterMsa, "^[0-1]{1}$", MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_EXPERT), @@ -1253,6 +1254,7 @@ Parameters::Parameters(): searchworkflow.push_back(&PARAM_NUM_ITERATIONS); searchworkflow.push_back(&PARAM_START_SENS); searchworkflow.push_back(&PARAM_SENS_STEPS); + searchworkflow.push_back(&PARAM_PREF_MODE); searchworkflow.push_back(&PARAM_EXHAUSTIVE_SEARCH); searchworkflow.push_back(&PARAM_EXHAUSTIVE_SEARCH_FILTER); searchworkflow.push_back(&PARAM_STRAND); @@ -2260,6 +2262,7 @@ void Parameters::setDefaults() { // search workflow numIterations = 1; + prefMode = PREF_MODE_KMER; startSens = 4; sensSteps = 1; exhaustiveSearch = false; diff --git a/src/commons/Parameters.h b/src/commons/Parameters.h index 4f9e742cd..1112e261c 100644 --- a/src/commons/Parameters.h +++ b/src/commons/Parameters.h @@ -306,6 +306,11 @@ class Parameters { static const int ID_MODE_KEYS = 0; static const int ID_MODE_LOOKUP = 1; + // prefilter mode + static const int PREF_MODE_KMER = 0; + static const int PREF_MODE_UNGAPPED = 1; + static const int PREF_MODE_EXHAUSTIVE = 2; + // unpackdb static const int UNPACK_NAME_KEY = 0; static const int UNPACK_NAME_ACCESSION = 1; @@ -450,6 +455,7 @@ class Parameters { // SEARCH WORKFLOW int numIterations; + int prefMode; float startSens; int sensSteps; bool exhaustiveSearch; @@ -872,6 +878,7 @@ class Parameters { PARAMETER(PARAM_NUM_ITERATIONS) PARAMETER(PARAM_START_SENS) PARAMETER(PARAM_SENS_STEPS) + PARAMETER(PARAM_PREF_MODE) PARAMETER(PARAM_EXHAUSTIVE_SEARCH) PARAMETER(PARAM_EXHAUSTIVE_SEARCH_FILTER) PARAMETER(PARAM_STRAND) diff --git a/src/workflow/Search.cpp b/src/workflow/Search.cpp index 750535d01..62ec132e2 100644 --- a/src/workflow/Search.cpp +++ b/src/workflow/Search.cpp @@ -256,6 +256,8 @@ int search(int argc, const char **argv, const Command& command) { } } + + const bool isUngappedMode = par.alignmentMode == Parameters::ALIGNMENT_MODE_UNGAPPED; if (isUngappedMode && (searchMode & (Parameters::SEARCH_MODE_FLAG_QUERY_PROFILE |Parameters::SEARCH_MODE_FLAG_TARGET_PROFILE ))) { par.printUsageMessage(command, MMseqsParameter::COMMAND_ALIGN | MMseqsParameter::COMMAND_PREFILTER); @@ -316,6 +318,19 @@ int search(int argc, const char **argv, const Command& command) { } else { cmd.addVariable("ALIGN_MODULE", "align"); } + + switch(par.prefMode){ + case Parameters::PREF_MODE_KMER: + cmd.addVariable("PREFMODE", "KMER"); + break; + case Parameters::PREF_MODE_UNGAPPED: + cmd.addVariable("PREFMODE", "UNGAPPED"); + break; + case Parameters::PREF_MODE_EXHAUSTIVE: + cmd.addVariable("PREFMODE", "EXHAUSTIVE"); + break; + } + cmd.addVariable("REMOVE_TMP", par.removeTmpFiles ? "TRUE" : NULL); std::string program; cmd.addVariable("RUNNER", par.runner.c_str()); @@ -334,7 +349,11 @@ int search(int argc, const char **argv, const Command& command) { par.covMode = Util::swapCoverageMode(par.covMode); size_t maxResListLen = par.maxResListLen; par.maxResListLen = std::max((size_t)300, queryDbSize); - cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str()); + if(par.prefMode == Parameters::PREF_MODE_KMER){ + cmd.addVariable("PREFILTER_PAR", par.createParameterString(par.prefilter).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable("UNGAPPEDPREFILTER_PAR", par.createParameterString(par.ungappedprefilter).c_str()); + } par.maxResListLen = maxResListLen; double originalEvalThr = par.evalThr; par.evalThr = std::numeric_limits::max(); @@ -385,8 +404,13 @@ int search(int argc, const char **argv, const Command& command) { if (i == (par.numIterations - 1)) { par.evalThr = originalEval; } - cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), - par.createParameterString(par.prefilter).c_str()); + if (par.prefMode == Parameters::PREF_MODE_KMER) { + cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.prefilter).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable(std::string("UNGAPPEDPREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.ungappedprefilter).c_str()); + } if (isUngappedMode) { par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT; cmd.addVariable(std::string("ALIGNMENT_PAR_" + SSTR(i)).c_str(), @@ -439,8 +463,13 @@ int search(int argc, const char **argv, const Command& command) { par.evalThr = originalEval; } - cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), - par.createParameterString(par.prefilter).c_str()); + if (par.prefMode == Parameters::PREF_MODE_KMER) { + cmd.addVariable(std::string("PREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.prefilter).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable(std::string("UNGAPPEDPREFILTER_PAR_" + SSTR(i)).c_str(), + par.createParameterString(par.ungappedprefilter).c_str()); + } if (isUngappedMode) { par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT; cmd.addVariable(std::string("ALIGNMENT_PAR_" + SSTR(i)).c_str(), @@ -487,7 +516,11 @@ int search(int argc, const char **argv, const Command& command) { prefilterWithoutS.push_back(par.prefilter[i]); } } - cmd.addVariable("PREFILTER_PAR", par.createParameterString(prefilterWithoutS).c_str()); + if (par.prefMode == Parameters::PREF_MODE_KMER) { + cmd.addVariable("PREFILTER_PAR", par.createParameterString(prefilterWithoutS).c_str()); + } else if (par.prefMode == Parameters::PREF_MODE_UNGAPPED) { + cmd.addVariable("UNGAPPEDPREFILTER_PAR", par.createParameterString(par.ungappedprefilter).c_str()); + } if (isUngappedMode) { par.rescoreMode = Parameters::RESCORE_MODE_ALIGNMENT; cmd.addVariable("ALIGNMENT_PAR", par.createParameterString(par.rescorediagonal).c_str());