Skip to content

Commit

Permalink
Latest changes before releasing v1.0.1
Browse files Browse the repository at this point in the history
- Added option to `align` step to ``--collect_only` the extracted markers and exit
-
  • Loading branch information
edgardomortiz committed Mar 2, 2024
1 parent c0b66f2 commit da21567
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 33 deletions.
90 changes: 58 additions & 32 deletions captus/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,14 @@ def align(full_command, args):

collect_extracted_markers(markers, formats, args.max_paralogs, args.min_samples,
extracted_sample_dirs, out_dir, settings.ALN_DIRS["UNAL"],
refs_paths, args.overwrite, show_less)
refs_paths, args.collect_only, args.overwrite, show_less)
log.log("")

if args.collect_only:
successful_exit(
"Captus-assembly: ALIGN -> successfully completed, '--collect_only' was enabled"
f" [{elapsed_time(time.time() - captus_start)}]"
)


################################################################################################
Expand Down Expand Up @@ -858,7 +863,7 @@ def make_output_dirtree(markers, formats, out_dir, base_dir, margin):

def collect_extracted_markers(
markers, formats, max_paralogs, min_samples, extracted_sample_dirs, out_dir, base_dir,
refs_paths, overwrite, show_less
refs_paths, collect_only, overwrite, show_less
):
source_files = [Path(settings.MARKER_DIRS[m], f"{m}{settings.FORMAT_SUFFIXES[f]}")
for m in markers.split(",") for f in formats.split(",")
Expand Down Expand Up @@ -894,20 +899,24 @@ def collect_extracted_markers(

# Collect markers per sample's source FASTAs into a dictionary that can be updated in parallel
fastas_per_marker = {}
collect_marker_names_params = []
collect_sample_markers_params = []
for source_fasta_path in fastas_per_sample:
collect_marker_names_params.append([
collect_sample_markers_params.append([
fastas_per_marker,
source_fasta_path,
fastas_per_sample[source_fasta_path]["sample_name"],
fastas_per_sample[source_fasta_path]["destination"],
fastas_per_sample[source_fasta_path]["suffix"],
max_paralogs,
])
tqdm_serial_run(collect_sample_markers, collect_marker_names_params,
tqdm_serial_run(collect_sample_markers, collect_sample_markers_params,
"Collecting extracted markers",
"Collection of extracted markers finished",
"source", show_less)
if max_paralogs == -1:
max_paralog_rank = get_max_paralog_rank(collect_sample_markers_params)
else:
max_paralog_rank = max_paralogs

# Write FASTAs compiled per marker when they have at least four samples
log.log("")
Expand Down Expand Up @@ -937,36 +946,37 @@ def collect_extracted_markers(
f" {skipped} already existed and were skipped"
)

# Add references to all the possible collected FASTAs
if refs_paths is not None:
refs = [
refs_paths["NUC"]["AA_path"], refs_paths["NUC"]["NT_path"],
refs_paths["PTD"]["AA_path"], refs_paths["PTD"]["NT_path"],
refs_paths["MIT"]["AA_path"], refs_paths["MIT"]["NT_path"],
refs_paths["DNA"]["NT_path"], refs_paths["DNA"]["NT_path"],
refs_paths["CLR"]["NT_path"], refs_paths["CLR"]["NT_path"],
]
mrks = ["NUC", "NUC", "PTD", "PTD", "MIT", "MIT", "DNA", "DNA", "CLR", "CLR"]
fmts = [ "AA", "NT", "AA", "NT", "AA", "NT", "MA", "MF", "MA", "MF"]
add_refs_params = []
manager = Manager()
shared_ref_names = manager.list()
for r, m, f in zip(refs, mrks, fmts):
if all([r, m in markers.upper().split(","), f in formats.upper().split(",")]):
add_refs_params.append((
r,
Path(out_dir, base_dir, settings.MARKER_DIRS[m], settings.FORMAT_DIRS[f]),
shared_ref_names,
))
if bool(add_refs_params):
log.log("")
tqdm_serial_run(add_refs, add_refs_params,
"Adding reference markers", "Addition of reference markers finished",
"reference", show_less)
manager = Manager()
shared_ref_names = manager.list()
if not collect_only:
# Add references to all the possible collected FASTAs
if refs_paths is not None:
refs = [
refs_paths["NUC"]["AA_path"], refs_paths["NUC"]["NT_path"],
refs_paths["PTD"]["AA_path"], refs_paths["PTD"]["NT_path"],
refs_paths["MIT"]["AA_path"], refs_paths["MIT"]["NT_path"],
refs_paths["DNA"]["NT_path"], refs_paths["DNA"]["NT_path"],
refs_paths["CLR"]["NT_path"], refs_paths["CLR"]["NT_path"],
]
mrks = ["NUC", "NUC", "PTD", "PTD", "MIT", "MIT", "DNA", "DNA", "CLR", "CLR"]
fmts = [ "AA", "NT", "AA", "NT", "AA", "NT", "MA", "MF", "MA", "MF"]
add_refs_params = []
for r, m, f in zip(refs, mrks, fmts):
if all([r, m in markers.upper().split(","), f in formats.upper().split(",")]):
add_refs_params.append((
r,
Path(out_dir, base_dir, settings.MARKER_DIRS[m], settings.FORMAT_DIRS[f]),
shared_ref_names,
))
if bool(add_refs_params):
log.log("")
tqdm_serial_run(add_refs, add_refs_params,
"Adding reference markers", "Addition of reference markers finished",
"reference", show_less)

# Write ASTRAL-Pro sequence to sample equivalence tsv file
astral_pro_tsv = write_astral_pro_seq_to_sam(out_dir,
max_paralogs,
max_paralog_rank,
shared_ref_names,
sample_names)
if astral_pro_tsv:
Expand Down Expand Up @@ -1013,6 +1023,22 @@ def collect_sample_markers(
return message


def get_max_paralog_rank(collect_sample_markers_params: list):
max_paralog_rank = 0
for params in collect_sample_markers_params:
fasta_in = fasta_to_dict(params[1])
for seq_name_full in fasta_in:
seq_name_parts = seq_name_full.split(settings.SEQ_NAME_SEP)
seq_name = seq_name_parts[0]
if len(seq_name_parts) == 3:
paralog_rank = int(seq_name_parts[2])
else:
paralog_rank = 0
if paralog_rank > max_paralog_rank:
max_paralog_rank = paralog_rank
return max_paralog_rank


def add_refs(ref_path, dest_dir, shared_ref_names):
start = time.time()
fastas_in_dest = list(Path(dest_dir).rglob("*.f[an]a"))
Expand Down
9 changes: 8 additions & 1 deletion captus/captus_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,7 +1142,7 @@ def align(self):
help="Maximum number of secondary hits (copies) per sample to import from the"
" extraction step. Large numbers of marker copies per sample can increase"
" alignment times. Hits (copies) are ranked from best to worst during the"
" 'extract' step. -1 disables the initial removal of paralogs and aligns which"
" 'extract' step. -1 disables the removal of paralogs and aligns them all, which"
" might be useful if you expect very high ploidy levels for example"
)
input_group.add_argument(
Expand Down Expand Up @@ -1306,6 +1306,13 @@ def align(self):
)

other_group = parser.add_argument_group("Other")
other_group.add_argument(
"--collect_only",
action="store_true",
dest="collect_only",
help="Only collect the markers from the extraction folder and exit (skips addition of"
" reference target sequences and subsequent steps)"
)
other_group.add_argument(
"--redo_from",
action="store",
Expand Down
3 changes: 3 additions & 0 deletions docs/content/assembly/align/options.md
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,9 @@ This argument is optional, the default is **0**.
___
## *Other*
___
### **`--collect_only`**
Only collect the markers from the extraction folder and exit, it skips the addition of reference target sequences and subsequent steps
___
### **`--redo_from`**
You can repeat the analysis without undoing all the steps. These are the points from which you can restart the `align` command:
- `alignment` = Delete all subdirectories with alignments and restart.
Expand Down

0 comments on commit da21567

Please sign in to comment.