From 24e4e3115041a55fe4f9dec7059576c9087d1781 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 7 Oct 2024 11:47:10 +0100 Subject: [PATCH] Fix CRAM embed_ref=2 with seqs overlapping ref end. If the sequences align off the end of the reference and we are creating consensus on the fly, then the consensus generated also steps beyond the reference length. Although this longer reference is embedded, it is trimmed back by the CRAM decoder which validates against the declared reference length in SQ LN, leading to Ns appearing in the decoder. Therefore we now validate in the encoder too, which also needed refs_from_header updates to parse the LN tag so the encoder can trim. Note we already overloaded r->length==0 for an indication that we've not parsed the fa/fai file yet, so we can't just naively fill this out from the SQ LN header. We could hold this information elsewhere via a proper flag and modify all the places that utilise that knowledge, but the simplest (and safest) fix is to have a separate variable used for this one specific case. An example of failure could be seen in: ./test/test_view -C -o embed_ref=2 test/c1#bounds.sam |\ ./test/test_view - --- cram/cram_encode.c | 4 +++- cram/cram_io.c | 8 +++++--- cram/cram_structs.h | 3 ++- test/test.pl | 8 ++++++++ 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 5d22db54d..3cad124ce 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1761,7 +1761,6 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { c->ref_start = ref_start+1; c->ref_end = ref_end+1; c->ref_free = 1; - return 0; err: @@ -1997,6 +1996,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { fd->no_ref_counter -= (fd->no_ref_counter > 0); pthread_mutex_unlock(&fd->ref_lock); } + + if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length) + c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length; } // Iterate through records creating the cram blocks for some diff --git a/cram/cram_io.c b/cram/cram_io.c index 7f7ffca49..94b31f0c4 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2787,10 +2787,12 @@ static int refs_from_header(cram_fd *fd) { /* Initialise likely filename if known */ if ((ty = sam_hrecs_find_type_id(h->hrecs, "SQ", "SN", h->hrecs->ref[i].name))) { - if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) { + if ((tag = sam_hrecs_find_key(ty, "M5", NULL))) r->ref_id[j]->fn = string_dup(r->pool, tag->str+3); - //fprintf(stderr, "Tagging @SQ %s / %s\n", r->ref_id[h]->name, r->ref_id[h]->fn); - } + + if ((tag = sam_hrecs_find_key(ty, "LN", NULL))) + // LN tag used when constructing consensus reference + r->ref_id[j]->LN_length = strtoll(tag->str+3, NULL, 0); } k = kh_put(refs, r->h_meta, r->ref_id[j]->name, &n); diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 9540b5618..49c2e81d5 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -671,7 +671,8 @@ struct cram_slice { typedef struct ref_entry { char *name; char *fn; - int64_t length; + int64_t length; // if 0 this indicates we haven't loaded it yet + int64_t LN_length; // @SQ LN length, used to trim consensus ref int64_t offset; int bases_per_line; int line_length; diff --git a/test/test.pl b/test/test.pl index b5f52bdfb..a286195c3 100755 --- a/test/test.pl +++ b/test/test.pl @@ -865,6 +865,14 @@ sub test_view testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; testv $opts, "./compare_sam.pl $ersam $ersam2"; + + $ersam = "c1#bounds.sam"; + $ercram = "c1#bounds_er.tmp.cram"; + $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { passed($opts, "embed_ref=2 tests"); } else {