Skip to content

Commit

Permalink
update to 2.1.10
Browse files Browse the repository at this point in the history
  • Loading branch information
KennthShang committed Dec 25, 2024
1 parent 306c8cc commit a7db148
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 46 deletions.
38 changes: 27 additions & 11 deletions src/phabox2/phabox2.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,7 @@ def main():
# Command-line interface. The built-in help is disabled (add_help=False) and
# -h/--help is registered by hand as a plain flag instead.
parser = argparse.ArgumentParser(add_help=False)
parser.add_argument('-h', '--help', action='store_true', help='Show this help message and exit')
parser.add_argument('--task', help='Select a program to run || (default end_to_end)', default='end_to_end')
# 'N' keeps PhaMer in the end_to_end task; the visible callers treat any other
# value as "skip PhaMer".
parser.add_argument('--skip', help='Skip PhaMer in end_to_end task || (default N)', default='N')
parser.add_argument('-d', '--dbdir', help='Path of database directory || (required)', default='database/')
parser.add_argument('-o', '--outpth', help='Rootpth for the output folder || (required)', default='test_out/')
parser.add_argument('--contigs', help='Path of the input FASTA file || (required)', default='test_contigs.fa')
Expand Down Expand Up @@ -368,23 +369,38 @@ def main():
logger.info(f"PhaBOX2 is running with: {inputs.threads} threads!")
if inputs.task == "end_to_end":
phamer.run(inputs)
logger.info(f"PhaMer finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction')}")
# Report which stage phamer.run() just completed: only skip == 'N' counts as
# a real PhaMer run, every other value is reported as preprocessing.
if inputs.skip != 'N':
    logger.info(f"Preprocessing finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction')}")
else:
    logger.info(f"PhaMer finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction')}")
phagcn.run(inputs)
logger.info(f"PhaGCN finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction')}")
cherry.run(inputs)
logger.info(f"Cherry finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction')}")
phatyp.run(inputs)
logger.info(f"PhaTYP finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction')}")
df1 = pd.read_csv(os.path.join(inputs.outpth, 'final_prediction', 'phamer_prediction.tsv'), sep='\t')
df2 = pd.read_csv(os.path.join(inputs.outpth, 'final_prediction', 'phagcn_prediction.tsv'), sep='\t')
df3 = pd.read_csv(os.path.join(inputs.outpth, 'final_prediction', 'phatyp_prediction.tsv'), sep='\t')
df4 = pd.read_csv(os.path.join(inputs.outpth, 'final_prediction', 'cherry_prediction.tsv'), sep='\t')
df = df1.merge(df2, on=['Accession', 'Length'], how='outer') \
.merge(df3, on=['Accession', 'Length'], how='outer') \
.merge(df4, on=['Accession', 'Length'], how='outer')
df.fillna('NA', inplace=True)
df.to_csv(f'{inputs.outpth}/final_prediction/final_prediction_summary.tsv', index=False, sep='\t')
logger.info(f"Summarized finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction', 'final_prediction_summary.tsv')}\n\n")
# Combine the per-tool prediction tables into a single summary file.
# The PhaMer table only exists when PhaMer was actually run (skip == 'N'),
# so it is prepended to the list in that case only; everything else is
# shared between the two modes.
prediction_dir = os.path.join(inputs.outpth, 'final_prediction')
tools = ['phagcn', 'phatyp', 'cherry']
if inputs.skip == 'N':
    tools.insert(0, 'phamer')

tables = [
    pd.read_csv(os.path.join(prediction_dir, f'{tool}_prediction.tsv'), sep='\t')
    for tool in tools
]

# Chain outer merges left to right, in the same order as the tool list,
# joining on the two columns the merge keys name.
df = tables[0]
for table in tables[1:]:
    df = df.merge(table, on=['Accession', 'Length'], how='outer')

# Normalise both missing cells and '-' placeholders to the string 'NA'.
df.fillna('NA', inplace=True)
df.replace('-', 'NA', inplace=True)

# Use one path for both writing and logging so the message always names
# the file that was written.
summary_path = os.path.join(prediction_dir, 'final_prediction_summary.tsv')
df.to_csv(summary_path, index=False, sep='\t')
logger.info(f"Summarized finished! please check the results in {summary_path}\n\n")
elif inputs.task == "phamer":
phamer.run(inputs)
logger.info(f"PhaMer finished! please check the results in {os.path.join(inputs.outpth, 'final_prediction')}\n\n")
Expand Down
89 changes: 54 additions & 35 deletions src/phabox2/phamer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@

def run(inputs):
logger = get_logger()
logger.info("Running program: PhaMer (virus identification)")
# Skipping PhaMer makes no sense when PhaMer itself is the requested task,
# so reject that combination before doing any work. The message names the
# value the user must pass ('N'), and the process exits non-zero like the
# other error exit in this function.
if inputs.skip == 'Y' and inputs.task == 'phamer':
    logger.info("Error: parameter 'skip' should be 'N' when running the task 'phamer'")
    exit(1)
# Banner: a full PhaMer run for skip == 'N', otherwise only the
# preprocessing part of this module is announced.
if inputs.skip == 'N':
    logger.info("Running program: PhaMer (virus identification)")
else:
    logger.info("Running program: Data preprocessing")
program_start = time.time()

contigs = inputs.contigs
Expand All @@ -34,7 +40,10 @@ def run(inputs):
print(f'Database directory {db_dir} missing or unreadable')
exit(1)

supplementary = 'phamer_supplementary'
# Folder name for the supplementary outputs: PhaMer-specific when PhaMer
# runs (skip == 'N'), otherwise the preprocessing folder.
supplementary = 'phamer_supplementary' if inputs.skip == 'N' else 'preprocessing_supplementary'
check_path(os.path.join(rootpth, out_dir))
check_path(os.path.join(rootpth, out_dir, supplementary))
check_path(os.path.join(rootpth, midfolder))
Expand Down Expand Up @@ -226,43 +235,47 @@ def run(inputs):
contigs_list = list(contigs_list.keys()) + contigs_add

logger.info("[7/7] writing the results...")
pred_csv = pd.DataFrame({"Accession":contigs_list, "Length":length_list, "Pred":all_pred, "Proportion":all_proportion, "PhaMerScore":all_score, "PhaMerConfidence":all_confidence})
pred_csv.to_csv(f'{rootpth}/{out_dir}/phamer_prediction.tsv', index = False, sep='\t')
virus_list = {item:1 for item in pred_csv[pred_csv['Pred'] == 'virus']['Accession'].values}

virus_rec = []
low_confidence = {item:1 for item in pred_csv[pred_csv['PhaMerConfidence'] == 'low-confidence; please run contamination detection task']['Accession'].values}
low_confidence = {**low_confidence, **{item:1 for item in pred_csv[pred_csv['PhaMerConfidence'] == 'lower than viral score threshold; proteinal prophage, please run contamination detection task']['Accession'].values}}
low_virus_rec = []
for record in SeqIO.parse(f'{contigs}', 'fasta'):
try:
_ = low_confidence[record.id]
low_virus_rec.append(record)
except:
pass
try:
_ = virus_list[record.id]
virus_rec.append(record)
except:
pass
if inputs.skip == 'N':
pred_csv = pd.DataFrame({"Accession":contigs_list, "Length":length_list, "Pred":all_pred, "Proportion":all_proportion, "PhaMerScore":all_score, "PhaMerConfidence":all_confidence})
pred_csv.to_csv(f'{rootpth}/{out_dir}/phamer_prediction.tsv', index = False, sep='\t')
virus_list = {item:1 for item in pred_csv[pred_csv['Pred'] == 'virus']['Accession'].values}

virus_rec = []
low_confidence = {item:1 for item in pred_csv[pred_csv['PhaMerConfidence'] == 'low-confidence; please run contamination detection task']['Accession'].values}
low_confidence = {**low_confidence, **{item:1 for item in pred_csv[pred_csv['PhaMerConfidence'] == 'lower than viral score threshold; proteinal prophage, please run contamination detection task']['Accession'].values}}
low_virus_rec = []
for record in SeqIO.parse(f'{contigs}', 'fasta'):
try:
_ = low_confidence[record.id]
low_virus_rec.append(record)
except:
pass
try:
_ = virus_list[record.id]
virus_rec.append(record)
except:
pass



SeqIO.write(virus_rec, f'{rootpth}/{out_dir}/{supplementary}/predicted_virus.fa', 'fasta')
SeqIO.write(low_virus_rec, f'{rootpth}/{out_dir}/{supplementary}/uncertain_sequences_for_contamination_task.fa', 'fasta')
virus_protein_rec = []
check = {item: 1 for item in virus_list}
for record in SeqIO.parse(f'{rootpth}/{midfolder}/query_protein.fa', 'fasta'):
try:
_ = check[record.id.rsplit('_', 1)[0]]
virus_protein_rec.append(record)
except:
pass
SeqIO.write(virus_rec, f'{rootpth}/{out_dir}/{supplementary}/predicted_virus.fa', 'fasta')
SeqIO.write(low_virus_rec, f'{rootpth}/{out_dir}/{supplementary}/uncertain_sequences_for_contamination_task.fa', 'fasta')
virus_protein_rec = []
check = {item: 1 for item in virus_list}
for record in SeqIO.parse(f'{rootpth}/{midfolder}/query_protein.fa', 'fasta'):
try:
_ = check[record.id.rsplit('_', 1)[0]]
virus_protein_rec.append(record)
except:
pass

SeqIO.write(virus_protein_rec, f'{rootpth}/{out_dir}/{supplementary}/predicted_virus_protein.fa', 'fasta')

SeqIO.write(virus_protein_rec, f'{rootpth}/{out_dir}/{supplementary}/predicted_virus_protein.fa', 'fasta')
run_command(f"cp {rootpth}/filtered_contigs.fa {rootpth}/{out_dir}/{supplementary}/all_predicted_contigs.fa")
run_command(f"cp {rootpth}/{out_dir}/{supplementary}/predicted_virus.fa {rootpth}/filtered_contigs.fa")
run_command(f"cp {rootpth}/filtered_contigs.fa {rootpth}/{out_dir}/{supplementary}/all_predicted_contigs.fa")
run_command(f"cp {rootpth}/{out_dir}/{supplementary}/predicted_virus.fa {rootpth}/filtered_contigs.fa")

else:
run_command(f"cp {rootpth}/filtered_contigs.fa {rootpth}/{out_dir}/{supplementary}/all_predicted_contigs.fa")

run_command(f"cp {rootpth}/{midfolder}/query_protein.fa {rootpth}/{out_dir}/{supplementary}/all_predicted_protein.fa")
run_command(f"cp {rootpth}/{midfolder}/db_results.tab {rootpth}/{out_dir}/{supplementary}/alignment_results.tab")
Expand Down Expand Up @@ -290,10 +303,16 @@ def run(inputs):
f.write(f'{genome}\t{gene}\t{genes[gene].start}\t{genes[gene].end}\t{genes[gene].strand}\t{genes[gene].gc}\t{genes[gene].anno}\t{genes[gene].pident:.2f}\t{genes[gene].coverage:.2f}\n')


phavip_dump_result(genomes, rootpth, out_dir, logger, supplementary = 'phamer_supplementary')
phavip_dump_result(genomes, rootpth, out_dir, logger, supplementary = supplementary)

logger.info("Run time: %s seconds\n" % round(time.time() - program_start, 2))

# Stop the whole process here when PhaMer ran (skip == 'N') and collected no
# viral records. The skip test comes first because virus_rec is only
# assigned on the skip == 'N' path.
if inputs.skip == 'N' and not virus_rec:
    logger.info("PhaMer finished! No virus found in the contigs!")
    logger.info(f"Please check the results in {os.path.join(rootpth,out_dir, 'phamer_prediction.tsv')}")
    exit()




Expand Down

0 comments on commit a7db148

Please sign in to comment.