Skip to content

Commit

Permalink
Add support for matching VCF lines by ID
Browse files Browse the repository at this point in the history
  • Loading branch information
pd3 committed Oct 2, 2024
1 parent c12496f commit 09251d2
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 5 deletions.
14 changes: 11 additions & 3 deletions bcf_sr_sort.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (C) 2017-2021 Genome Research Ltd.
Copyright (C) 2017-2024 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Expand Down Expand Up @@ -32,6 +32,7 @@
#include "htslib/khash_str2int.h"
#include "htslib/kbitset.h"

// Variant types and pair-wise compatibility of their combinations, see bcf_sr_init_scores()
#define SR_REF 1
#define SR_SNP 2
#define SR_INDEL 4
Expand Down Expand Up @@ -366,7 +367,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
// group VCFs into groups, each with a unique combination of variants in the duplicate lines
int ireader,ivar,irec,igrp,ivset,iact;
for (ireader=0; ireader<readers->nreaders; ireader++) srt->vcf_buf[ireader].nrec = 0;
for (iact=0; iact<srt->nactive; iact++)
for (iact=0; iact<srt->nactive; iact++) // process each of the active readers, ie which still have a record to process
{
ireader = srt->active[iact];
bcf_sr_t *reader = &readers->readers[ireader];
Expand All @@ -384,6 +385,11 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
srt->off[srt->noff++] = srt->str.l;
size_t beg = srt->str.l;
int end_pos = -1;
if ( srt->pair & BCF_SR_PAIR_ID )
{
kputs(line->d.id,&srt->str);
kputc(':',&srt->str);
}
for (ivar=1; ivar<line->n_allele; ivar++)
{
if ( ivar>1 ) kputc(',',&srt->str);
Expand Down Expand Up @@ -417,7 +423,8 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
}

// Create new variant or attach to existing one. But careful, there can be duplicate
// records with the same POS,REF,ALT (e.g. in dbSNP-b142)
// records with the same POS,REF,ALT (e.g. in dbSNP-b142). In such case, append a
// numeric index (var_idx)
char *var_str = beg + srt->str.s;
int ret, var_idx = 0, var_end = srt->str.l;
while ( 1 )
Expand All @@ -435,6 +442,7 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr,
}
if ( ret==-1 )
{
// the variant is not present, insert
ivar = srt->nvar++;
hts_expand0(var_t,srt->nvar,srt->mvar,srt->var);
srt->var[ivar].nvcf = 0;
Expand Down
10 changes: 9 additions & 1 deletion bcf_sr_sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ typedef struct
}
var_t;

// Group is a set of variants in duplicate records within one VCF. They are identified with a key (used only
// for debugging), such as C>A,C>G;C>T. Commas separate alleles in a multiallelic record, semicolons separate
// VCF lines.
typedef struct
{
char *key; // only for debugging
Expand All @@ -67,7 +70,7 @@ typedef struct
{
int nvar, mvar, *var; // list of compatible variants that can be output together
int cnt; // number of readers in this group
kbitset_t *mask; // which groups are populated in this set (replace with expandable bitmask)
kbitset_t *mask; // which groups are populated in this set
}
varset_t;

Expand Down Expand Up @@ -100,8 +103,13 @@ sr_sort_t;
sr_sort_t *bcf_sr_sort_init(sr_sort_t *srt);
void bcf_sr_sort_reset(sr_sort_t *srt);
int bcf_sr_sort_next(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, hts_pos_t pos);

// initialize a new position using the i-th reader
int bcf_sr_sort_set_active(sr_sort_t *srt, int i);

// add i-th reader with the same position, assumed bcf_sr_sort_set_active() was called with another reader
int bcf_sr_sort_add_active(sr_sort_t *srt, int i);

void bcf_sr_sort_destroy(sr_sort_t *srt);
void bcf_sr_sort_remove_reader(bcf_srs_t *readers, sr_sort_t *srt, int i);

Expand Down
3 changes: 2 additions & 1 deletion htslib/synced_bcf_reader.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/// @file htslib/synced_bcf_reader.h
/// Stream through multiple VCF files.
/*
Copyright (C) 2012-2017, 2019-2023 Genome Research Ltd.
Copyright (C) 2012-2017, 2019-2024 Genome Research Ltd.
Author: Petr Danecek <pd3@sanger.ac.uk>
Expand Down Expand Up @@ -89,6 +89,7 @@ extern "C" {
#define BCF_SR_PAIR_SNP_REF (1<<4) // allow REF-only records with SNPs
#define BCF_SR_PAIR_INDEL_REF (1<<5) // allow REF-only records with indels
#define BCF_SR_PAIR_EXACT (1<<6) // require the exact same set of alleles in all files
#define BCF_SR_PAIR_ID (1<<7) // require matching IDs (overlap)
#define BCF_SR_PAIR_BOTH (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS)
#define BCF_SR_PAIR_BOTH_REF (BCF_SR_PAIR_SNPS|BCF_SR_PAIR_INDELS|BCF_SR_PAIR_SNP_REF|BCF_SR_PAIR_INDEL_REF)

Expand Down

0 comments on commit 09251d2

Please sign in to comment.