Skip to content

Commit

Permalink
fix(wip): resolving diffs between v253
Browse files Browse the repository at this point in the history
 - allow sync between variants
 - fix precision/recall GT (no 0|1 -> 1|1 FP vars)
 - work on summary VCF GTs
  • Loading branch information
TimD1 committed Nov 2, 2024
1 parent b5b448c commit 5519868
Show file tree
Hide file tree
Showing 6 changed files with 126 additions and 46 deletions.
4 changes: 2 additions & 2 deletions src/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ void superclusterData::supercluster() {
int total_vars = 0;
int most_vars = 0;
int total_bases = 0;
for (std::string ctg : this->contigs) {
for (const std::string & ctg : this->contigs) {

// skip empty contigs
int nvars = 0;
Expand All @@ -392,7 +392,7 @@ void superclusterData::supercluster() {
if (!nvars) continue;

// for each cluster of variants (merge query and truth haps)
auto & vars = this->superclusters[ctg]->callset_vars;
auto const & vars = this->superclusters[ctg]->callset_vars;
std::vector<int> brks(CALLSETS, 0); // start of current supercluster
while (true) {

Expand Down
68 changes: 58 additions & 10 deletions src/dist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,21 @@ int calc_prec_recall_aln(
}
}

// allow bottom-right corner to move diagonally into next truth and query nodes
// NOTE: separating this case out allows sync points between adjacent variants
if (x.qi == int(graph->qseqs[x.qni].length())-1 && x.ti < int(graph->tseqs[x.tni].length()) &&
x.ti == int(graph->tseqs[x.tni].length())-1 && x.qi < int(graph->qseqs[x.qni].length())) {
for (int qni : graph->qnexts[x.qni]) { // for all next nodes
for (int tni : graph->tnexts[x.tni]) {
idx4 z(qni, tni, 0, 0);
if (!contains(done, z) && !contains(curr_wave, z)) {
/* if (print) printf(" z = node (%d, %d) cell (%d, %d)\n", z.qni, z.tni, z.qi, z.ti); */
queue.push(z); curr_wave.insert(z); ptrs[z] = x;
}
}
}
}

// allow last row to move into first row of all next query nodes
if (x.qi == int(graph->qseqs[x.qni].length())-1 && x.ti < int(graph->tseqs[x.tni].length())) {
for (int qni : graph->qnexts[x.qni]) { // for all next nodes
Expand Down Expand Up @@ -265,11 +280,11 @@ void calc_prec_recall(
graph->qtypes[prev.qni] != TYPE_REF) { // node is variant
if (print) printf("new query variant\n");
int qvar_idx = graph->qidxs[prev.qni];
qvars->set_var_calcgt_on_hap(qvar_idx, truth_hap);
qvars->set_var_calcgt_on_hap(qvar_idx, truth_hap, true);
sync_qvars.push_back(qvar_idx);
}
// if we move into a query variant, include it in sync group and ref dist calc
else if (prev.tni != curr.tni && // new truth node
if (prev.tni != curr.tni && // new truth node
graph->ttypes[prev.tni] != TYPE_REF) { // node is variant
if (print) printf("new truth variant\n");
int tvar_idx = graph->tidxs[prev.tni];
Expand All @@ -292,19 +307,23 @@ void calc_prec_recall(
}
}
// if the alignment is a substitution, insertion, or deletion
else if (prev.qni == curr.qni && prev.tni == curr.tni && // same matrix
if (prev.qni == curr.qni && prev.tni == curr.tni && // same matrix
(prev.qi == curr.qi || prev.ti == curr.ti || // insertion or deletion
graph->tseqs[curr.tni][curr.ti] != graph->qseqs[curr.qni][curr.qi]) // substitution
) {
if (print) printf("non-match step\n");
query_dist += 1;
}
// check if this movement is a sync point
else if (prev.qni == curr.qni && prev.tni == curr.tni && // same submatrix
prev.qi+1 == curr.qi && prev.ti+1 == curr.ti && // diagonal movement
graph->ttypes[curr.tni] == TYPE_REF && // not in truth var
graph->qtypes[curr.qni] == TYPE_REF && // not in query var
graph->tbegs[curr.tni] + curr.ti == graph->qbegs[curr.qni] + curr.qi) { // main diag
// check if this movement is a sync point (ref main diag mvmt or between var main diag mvmt)
if ((prev.qni == curr.qni && prev.tni == curr.tni && // same submatrix
prev.qi+1 == curr.qi && prev.ti+1 == curr.ti && // diagonal movement
graph->ttypes[curr.tni] == TYPE_REF && // not in truth var
graph->qtypes[curr.qni] == TYPE_REF && // not in query var
graph->tbegs[curr.tni] + curr.ti == graph->qbegs[curr.qni] + curr.qi) ||
(prev.qni != curr.qni && prev.tni != curr.tni && // between vars
curr.qi == 0 && curr.ti == 0 &&
graph->tbegs[curr.tni] == graph->qbegs[curr.qni]) // on main diag
) {
if (print) printf("potential sync point\n");

// add sync point
Expand Down Expand Up @@ -817,7 +836,7 @@ void precision_recall_wrapper(
}

//////////////////////////////////////////////////////
// PRECISION-RECALL: allow skipping called variants //
// PRECISION-RECALL: truth to query graph alignment //
//////////////////////////////////////////////////////

// calculate two forward-pass alignments, saving path
Expand All @@ -831,6 +850,35 @@ void precision_recall_wrapper(
calc_prec_recall_aln(graph, ptrs, print);
calc_prec_recall(graph, ptrs, hi, print);
}

// don't allow query 0|1 -> 1|1 if second allele is a FP
fix_prec_recall_genotype(sc, sc_idx);
}
}


/******************************************************************************/

/* The precision-recall calculation allows the calculated GT to be anything (including 0|0 or 1|1).
We need to do some post-processing to fix this and set it to the most reasonable value.
*/
void fix_prec_recall_genotype(std::shared_ptr<ctgSuperclusters> sc, int sc_idx) {
int cluster_beg = sc->superclusters[QUERY][sc_idx];
int cluster_end = sc->superclusters[QUERY][sc_idx+1];
for (int ci = cluster_beg; ci < cluster_end; ci++) {
std::shared_ptr<ctgVariants> vars = sc->callset_vars[QUERY];
for (int vi = vars->clusters[ci]; vi < vars->clusters[ci+1]; vi++) {

// don't allow calculated GT to be 1|1 if either is a FP (convert to 0/1)
if (vars->calc_gts[vi] == GT_ALT1_ALT1
&& (vars->orig_gts[vi] == GT_REF_ALT1 || vars->orig_gts[vi] == GT_ALT1_REF)) {
if (vars->errtypes[HAP1][vi] == ERRTYPE_FP) {
vars->set_var_calcgt_on_hap(vi, HAP1, false);
} else if (vars->errtypes[HAP2][vi] == ERRTYPE_FP) {
vars->set_var_calcgt_on_hap(vi, HAP2, false);
}
}
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/dist.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,8 @@ void calc_prec_recall(
bool print = false
);

void fix_prec_recall_genotype(std::shared_ptr<ctgSuperclusters> sc, int sc_idx);

/******************************************************************************/

int wf_swg_max_reach(
Expand Down
51 changes: 27 additions & 24 deletions src/phase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,21 +124,22 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
if (vars[QUERY]->refs[ptrs[QUERY]] == vars[TRUTH]->refs[ptrs[TRUTH]] &&
vars[QUERY]->alts[ptrs[QUERY]] == vars[TRUTH]->alts[ptrs[TRUTH]]) { // query matches truth
// print data for each haplotype
for (int hi = 0; hi < HAPS; hi++) {
int calc_hi = hi ^ vars[QUERY]->calcgt_is_swapped(ptrs[QUERY]);
if (vars[QUERY]->var_on_hap(ptrs[QUERY], hi) ||
vars[TRUTH]->var_on_hap(ptrs[TRUTH], hi)) {
for (int thi = 0; thi < HAPS; thi++) {
bool swap = vars[QUERY]->calcgt_is_swapped(ptrs[QUERY]);
int qhi = thi ^ swap ^ phase_flip ^ phase_switch;
if (vars[QUERY]->var_on_hap(ptrs[QUERY], qhi, swap) ||
vars[TRUTH]->var_on_hap(ptrs[TRUTH], thi)) {
vars[QUERY]->print_var_info(out_vcf, this->ref, ctg, ptrs[QUERY]);
if (vars[TRUTH]->var_on_hap(ptrs[TRUTH], hi)) { // print truth
vars[TRUTH]->print_var_sample(out_vcf, ptrs[TRUTH], hi,
ploidy == 1 ? "1" : (hi ? "0|1" : "1|0"),
if (vars[TRUTH]->var_on_hap(ptrs[TRUTH], thi)) { // print truth
vars[TRUTH]->print_var_sample(out_vcf, ptrs[TRUTH], thi,
ploidy == 1 ? "1" : (thi ? "0|1" : "1|0"),
sc_idx, phase_block, phase_switch, phase_flip);
} else {
vars[TRUTH]->print_var_empty(out_vcf, sc_idx, phase_block);
}
if (vars[QUERY]->var_on_hap(ptrs[QUERY], hi)) { // print query
vars[QUERY]->print_var_sample(out_vcf, ptrs[QUERY], calc_hi,
ploidy == 1 ? "1" : (hi ? "0|1" : "1|0"),
if (vars[QUERY]->var_on_hap(ptrs[QUERY], qhi, swap)) { // print query
vars[QUERY]->print_var_sample(out_vcf, ptrs[QUERY], qhi,
ploidy == 1 ? "1" : ((qhi ^ swap) ? "0|1" : "1|0"),
sc_idx, phase_block, phase_switch, phase_flip, true);
} else {
vars[QUERY]->print_var_empty(out_vcf, sc_idx, phase_block, true);
Expand All @@ -147,37 +148,39 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
}
ptrs[QUERY]++; ptrs[TRUTH]++;
} else { // positional tie, diff vars, just print query
for (int hi = 0; hi < HAPS; hi++) {
int calc_hi = hi ^ vars[QUERY]->calcgt_is_swapped(ptrs[QUERY]);
if (vars[QUERY]->var_on_hap(ptrs[QUERY], hi)) {
for (int thi = 0; thi < HAPS; thi++) {
bool swap = vars[QUERY]->calcgt_is_swapped(ptrs[QUERY]);
int qhi = thi ^ swap ^ phase_flip ^ phase_switch;
if (vars[QUERY]->var_on_hap(ptrs[QUERY], thi)) {
vars[QUERY]->print_var_info(out_vcf, this->ref, ctg, ptrs[QUERY]);
vars[TRUTH]->print_var_empty(out_vcf, sc_idx, phase_block);
vars[QUERY]->print_var_sample(out_vcf, ptrs[QUERY], calc_hi,
ploidy == 1 ? "1" : (hi ? "0|1" : "1|0"),
vars[QUERY]->print_var_sample(out_vcf, ptrs[QUERY], qhi,
ploidy == 1 ? "1" : ((qhi ^ swap) ? "0|1" : "1|0"),
sc_idx, phase_block, phase_switch, phase_flip, true);
}
}
ptrs[QUERY]++;
}
} else { // query is next
for (int hi = 0; hi < HAPS; hi++) {
int calc_hi = hi ^ vars[QUERY]->calcgt_is_swapped(ptrs[QUERY]);
if (vars[QUERY]->var_on_hap(ptrs[QUERY], hi)) {
for (int thi = 0; thi < HAPS; thi++) {
bool swap = vars[QUERY]->calcgt_is_swapped(ptrs[QUERY]);
int qhi = thi ^ swap ^ phase_flip ^ phase_switch;
if (vars[QUERY]->var_on_hap(ptrs[QUERY], thi)) {
vars[QUERY]->print_var_info(out_vcf, this->ref, ctg, ptrs[QUERY]);
vars[TRUTH]->print_var_empty(out_vcf, sc_idx, phase_block);
vars[QUERY]->print_var_sample(out_vcf, ptrs[QUERY], calc_hi,
ploidy == 1 ? "1" : (hi ? "0|1" : "1|0"),
vars[QUERY]->print_var_sample(out_vcf, ptrs[QUERY], qhi,
ploidy == 1 ? "1" : ((qhi ^ swap) ? "0|1" : "1|0"),
sc_idx, phase_block, phase_switch, phase_flip, true);
}
}
ptrs[QUERY]++;
}
} else if (next[TRUTH]) {
for (int hi = 0; hi < HAPS; hi++) {
if (vars[TRUTH]->var_on_hap(ptrs[TRUTH], hi)) {
for (int thi = 0; thi < HAPS; thi++) {
if (vars[TRUTH]->var_on_hap(ptrs[TRUTH], thi)) {
vars[TRUTH]->print_var_info(out_vcf, this->ref, ctg, ptrs[TRUTH]);
vars[TRUTH]->print_var_sample(out_vcf, ptrs[TRUTH], hi,
ploidy == 1 ? "1" : (hi ? "0|1" : "1|0"),
vars[TRUTH]->print_var_sample(out_vcf, ptrs[TRUTH], thi,
ploidy == 1 ? "1" : (thi ? "0|1" : "1|0"),
sc_idx, phase_block, phase_switch, phase_flip);
vars[QUERY]->print_var_empty(out_vcf, sc_idx, phase_block, true);
}
Expand Down
45 changes: 36 additions & 9 deletions src/variant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -365,25 +365,52 @@ bool ctgVariants::var_on_hap(int var_idx, int hap, bool calc) const {

/*******************************************************************************/

void ctgVariants::set_var_calcgt_on_hap(int var_idx, int hap) {
/* Set (or unset) the presence of an alternate allele in a variant's genotype.
*/
void ctgVariants::set_var_calcgt_on_hap(int var_idx, int hap, bool set) {
if (hap > 1) ERROR("Unexpected hap idx %d in set_var_calcgt_on_hap()", hap);

if (this->calc_gts[var_idx] == GT_REF_REF) {
this->calc_gts[var_idx] = hap == 0 ? GT_ALT1_REF : GT_REF_ALT1;
if (set) {
this->calc_gts[var_idx] = hap == 0 ? GT_ALT1_REF : GT_REF_ALT1;
} else {
ERROR("Variant calc_gt already unset for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
}

} else if (this->calc_gts[var_idx] == GT_REF_ALT1) {
if (hap == 1) ERROR("Variant calc_gt already set for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
this->calc_gts[var_idx] = GT_ALT1_ALT1;
if (set) {
if (hap == 1) ERROR("Variant calc_gt already set for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
this->calc_gts[var_idx] = GT_ALT1_ALT1;
} else {
if (hap == 0) ERROR("Variant calc_gt already unset for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
this->calc_gts[var_idx] = GT_REF_REF;
}

} else if (this->calc_gts[var_idx] == GT_ALT1_REF) {
if (hap == 0) ERROR("Variant calc_gt already set for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
this->calc_gts[var_idx] = GT_ALT1_ALT1;
if (set) {
if (hap == 0) ERROR("Variant calc_gt already set for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
this->calc_gts[var_idx] = GT_ALT1_ALT1;
} else {
if (hap == 1) ERROR("Variant calc_gt already unset for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
this->calc_gts[var_idx] = GT_REF_REF;
}

} else if (this->calc_gts[var_idx] == GT_ALT1_ALT1) {
if (set) {
ERROR("Variant calc_gt already set for variant %d at %s:%d hap %d",
var_idx, this->ctg.data(), this->poss[var_idx], hap);
} else {
this->calc_gts[var_idx] = hap == 0 ? GT_REF_ALT1 : GT_ALT1_REF;
}

} else {
ERROR("Unexpected calc_gts value '%s' in set_var_calcgt_on_hap() for variant %d at %s:%d",
type_strs[this->calc_gts[var_idx]].data(), var_idx, this->ctg.data(), this->poss[var_idx]);
gt_strs[this->calc_gts[var_idx]].data(), var_idx, this->ctg.data(), this->poss[var_idx]);
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/variant.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ctgVariants {
void print_var_sample(FILE* out_vcf, int var_idx, int hap_idx, const std::string & gt, int sc_idx,
int phase_block, bool phase_switch, bool phase_flip, bool query = false);
bool var_on_hap(int var_idx, int hap_idx, bool calc = false) const;
void set_var_calcgt_on_hap(int var_idx, int hap);
void set_var_calcgt_on_hap(int var_idx, int hap, bool set = true);
bool calcgt_is_swapped(int var_idx) const;

// originally parsed data (size n)
Expand Down

0 comments on commit 5519868

Please sign in to comment.