Skip to content

Commit

Permalink
32bits reading
Browse files Browse the repository at this point in the history
  • Loading branch information
horta committed Sep 3, 2020
1 parent e79f82a commit 9347daa
Show file tree
Hide file tree
Showing 9 changed files with 468 additions and 123 deletions.
15 changes: 15 additions & 0 deletions include/bgen/genotype.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,21 @@ BGEN_EXPORT int bgen_genotype_read(struct bgen_genotype* genotype, double* proba
* @return `0` if it succeeds; `1` otherwise.
*/
BGEN_EXPORT int bgen_genotype_read64(struct bgen_genotype* genotype, double* probabilities);
/** Read the probabilities of each possible genotype (32-bits).
*
* The length of this array is equal to the product of the values obtained by calling
* the functions @ref bgen_file_nsamples and @ref bgen_genotype_ncombs.
* \rst
* .. seealso::
* Please, refer to the corresponding section **Probability data storage** of the
* |bgen format specification| for more information.
* \endrst
*
* @param genotype Variant genotype handler.
* @param probabilities Array of probabilities.
* @return `0` if it succeeds; `1` otherwise.
*/
BGEN_EXPORT int bgen_genotype_read32(struct bgen_genotype* genotype, float* probabilities);
/** Get the number of alleles.
*
* @param genotype Variant genotype handler.
Expand Down
13 changes: 13 additions & 0 deletions src/genotype.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,19 @@ int bgen_genotype_read64(struct bgen_genotype* genotype, double* probabilities)
return bgen_genotype_read(genotype, probabilities);
}

int bgen_genotype_read32(struct bgen_genotype* genotype, float* probabilities)
{
if (genotype->layout == 1) {
bgen_layout1_read_genotype32(genotype, probabilities);
} else if (genotype->layout == 2) {
bgen_layout2_read_genotype32(genotype, probabilities);
} else {
bgen_error("unrecognized layout type %d", genotype->layout);
return 1;
}
return 0;
}

uint16_t bgen_genotype_nalleles(struct bgen_genotype const* genotype)
{
return genotype->nalleles;
Expand Down
225 changes: 125 additions & 100 deletions src/layout2.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,10 @@

#define BIT(var, bit) ((var & (1 << bit)) != 0)

static void read_phased_genotype(struct bgen_genotype* genotype, double* probs);
static void read_unphased_genotype(struct bgen_genotype* genotype, double* probs);
static void read_phased_genotype64(struct bgen_genotype* genotype, double* probs);
static void read_phased_genotype32(struct bgen_genotype* genotype, float* probs);
static void read_unphased_genotype64(struct bgen_genotype* genotype, double* probs);
static void read_unphased_genotype32(struct bgen_genotype* genotype, float* probs);
static char* decompress(struct bgen_file* bgen_file);

static inline uint8_t read_ploidy(uint8_t ploidy_miss) { return ploidy_miss & 127; }
Expand All @@ -31,6 +33,12 @@ static inline void set_array_nan64(double* p, size_t n)
p[i] = NAN;
}

static inline void set_array_nan32(float* p, size_t n)
{
for (size_t i = 0; i < n; ++i)
p[i] = NAN;
}

int bgen_layout2_read_header(struct bgen_file* bgen_file, struct bgen_genotype* genotype)
{
uint32_t nsamples = 0;
Expand Down Expand Up @@ -123,112 +131,129 @@ int bgen_layout2_read_header(struct bgen_file* bgen_file, struct bgen_genotype*
void bgen_layout2_read_genotype64(struct bgen_genotype* genotype, double* probs)
{
if (genotype->phased)
read_phased_genotype(genotype, probs);
read_phased_genotype64(genotype, probs);
else
read_unphased_genotype(genotype, probs);
read_unphased_genotype64(genotype, probs);
}

static void read_phased_genotype(struct bgen_genotype* genotype, double* probs)
void bgen_layout2_read_genotype32(struct bgen_genotype* genotype, float* probs)
{
unsigned nbits = genotype->nbits;
unsigned nalleles = genotype->nalleles;
uint8_t max_ploidy = genotype->max_ploidy;
double denom = (double)((((uint64_t)1 << nbits)) - 1);

uint64_t sample_start = 0;
for (uint32_t j = 0; j < genotype->nsamples; ++j) {
unsigned ploidy = read_ploidy(genotype->ploidy_missingness[j]);
double* pend = probs + max_ploidy * nalleles;

if (read_missingness(genotype->ploidy_missingness[j]) != 0) {
set_array_nan64(probs, (size_t)(pend - probs));
probs = pend;
sample_start += nbits * ploidy * (nalleles - 1);
continue;
}

uint64_t haplo_start = 0;
for (uint8_t i = 0; i < ploidy; ++i) {

uint64_t uip_sum = 0;
uint64_t allele_start = 0;
for (uint16_t ii = 0; ii < nalleles - 1; ++ii) {

uint64_t ui_prob = 0;
uint64_t offset = sample_start + haplo_start + allele_start;

for (uint8_t bi = 0; bi < nbits; ++bi) {

if (get_bit(genotype->chunk_ptr, bi + offset)) {
ui_prob |= ((uint64_t)1 << bi);
}
}

*probs = (double)ui_prob / denom;
++probs;
uip_sum += ui_prob;
allele_start += nbits;
}
*probs = (denom - (double)uip_sum) / denom;
++probs;
haplo_start += nbits * (nalleles - 1);
}
sample_start += nbits * ploidy * (nalleles - 1);
set_array_nan64(probs, (size_t)(pend - probs));
probs = pend;
}
if (genotype->phased)
read_phased_genotype32(genotype, probs);
else
read_unphased_genotype32(genotype, probs);
}

static void read_unphased_genotype(struct bgen_genotype* genotype, double* probs)
{
uint8_t nbits = genotype->nbits;
uint16_t nalleles = genotype->nalleles;
uint8_t max_ploidy = genotype->max_ploidy;

double denom = (double)((((uint64_t)1 << nbits)) - 1);
unsigned max_ncombs =
choose(nalleles + (unsigned)(max_ploidy - 1), (unsigned)(nalleles - 1));

uint64_t sample_start = 0;
for (uint32_t j = 0; j < genotype->nsamples; ++j) {
double* pend = probs + max_ncombs;
uint8_t ploidy = read_ploidy(genotype->ploidy_missingness[j]);
unsigned ncombs = choose(nalleles + (unsigned)(ploidy - 1), (unsigned)(nalleles - 1));

if (read_missingness(genotype->ploidy_missingness[j]) != 0) {
set_array_nan64(probs, (size_t)(pend - probs));
probs = pend;
sample_start += nbits * (ncombs - 1);
continue;
}

uint64_t uip_sum = 0;

uint64_t geno_start = 0;
for (uint8_t i = 0; i < (size_t)(ncombs - 1); ++i) {

uint64_t ui_prob = 0;
uint64_t offset = sample_start + geno_start;

for (uint8_t bi = 0; bi < (size_t)nbits; ++bi) {

if (get_bit(genotype->chunk_ptr, bi + offset)) {
ui_prob |= ((uint64_t)1 << bi);
}
}
#define MAKE_READ_PHASED_GENOTYPE(BITS, FPTYPE) \
static void read_phased_genotype##BITS(struct bgen_genotype* genotype, FPTYPE* probs) \
{ \
unsigned nbits = genotype->nbits; \
unsigned nalleles = genotype->nalleles; \
uint8_t max_ploidy = genotype->max_ploidy; \
FPTYPE denom = (FPTYPE)((((uint64_t)1 << nbits)) - 1); \
\
uint64_t sample_start = 0; \
for (uint32_t j = 0; j < genotype->nsamples; ++j) { \
unsigned ploidy = read_ploidy(genotype->ploidy_missingness[j]); \
FPTYPE* pend = probs + max_ploidy * nalleles; \
\
if (read_missingness(genotype->ploidy_missingness[j]) != 0) { \
set_array_nan##BITS(probs, (size_t)(pend - probs)); \
probs = pend; \
sample_start += nbits * ploidy * (nalleles - 1); \
continue; \
} \
\
uint64_t haplo_start = 0; \
for (uint8_t i = 0; i < ploidy; ++i) { \
\
uint##BITS##_t uip_sum = 0; \
uint64_t allele_start = 0; \
for (uint16_t ii = 0; ii < nalleles - 1; ++ii) { \
\
uint64_t ui_prob = 0; \
uint64_t offset = sample_start + haplo_start + allele_start; \
\
for (uint8_t bi = 0; bi < nbits; ++bi) { \
\
if (get_bit(genotype->chunk_ptr, bi + offset)) { \
ui_prob |= ((uint64_t)1 << bi); \
} \
} \
\
*probs = (FPTYPE)ui_prob / denom; \
++probs; \
uip_sum += ui_prob; \
allele_start += nbits; \
} \
*probs = (denom - (FPTYPE)uip_sum) / denom; \
++probs; \
haplo_start += nbits * (nalleles - 1); \
} \
sample_start += nbits * ploidy * (nalleles - 1); \
set_array_nan##BITS(probs, (size_t)(pend - probs)); \
probs = pend; \
} \
}

*probs = (double)ui_prob / denom;
++probs;
uip_sum += ui_prob;
geno_start += nbits;
}
*probs = (denom - (double)uip_sum) / denom;
++probs;
sample_start += nbits * (ncombs - 1);
set_array_nan64(probs, (size_t)(pend - probs));
probs = pend;
MAKE_READ_PHASED_GENOTYPE(64, double)
MAKE_READ_PHASED_GENOTYPE(32, float)

#define MAKE_UNPHASED_GENOTYPE(BITS, FPTYPE) \
static void read_unphased_genotype##BITS(struct bgen_genotype* genotype, FPTYPE* probs) \
{ \
uint8_t nbits = genotype->nbits; \
uint16_t nalleles = genotype->nalleles; \
uint8_t max_ploidy = genotype->max_ploidy; \
\
FPTYPE denom = (FPTYPE)((((uint64_t)1 << nbits)) - 1); \
unsigned max_ncombs = \
choose(nalleles + (unsigned)(max_ploidy - 1), (unsigned)(nalleles - 1)); \
\
uint64_t sample_start = 0; \
for (uint32_t j = 0; j < genotype->nsamples; ++j) { \
FPTYPE* pend = probs + max_ncombs; \
uint8_t ploidy = read_ploidy(genotype->ploidy_missingness[j]); \
unsigned ncombs = \
choose(nalleles + (unsigned)(ploidy - 1), (unsigned)(nalleles - 1)); \
\
if (read_missingness(genotype->ploidy_missingness[j]) != 0) { \
set_array_nan##BITS(probs, (size_t)(pend - probs)); \
probs = pend; \
sample_start += nbits * (ncombs - 1); \
continue; \
} \
\
uint##BITS##_t uip_sum = 0; \
\
uint64_t geno_start = 0; \
for (uint8_t i = 0; i < (uint8_t)(ncombs - 1); ++i) { \
\
uint##BITS##_t ui_prob = 0; \
uint64_t offset = sample_start + geno_start; \
\
for (uint8_t bi = 0; bi < (uint8_t)nbits; ++bi) { \
\
if (get_bit(genotype->chunk_ptr, bi + offset)) { \
ui_prob |= ((uint##BITS##_t)1 << bi); \
} \
} \
\
*probs = (FPTYPE)ui_prob / denom; \
++probs; \
uip_sum += ui_prob; \
geno_start += nbits; \
} \
*probs = (denom - (FPTYPE)uip_sum) / denom; \
++probs; \
sample_start += nbits * (ncombs - 1); \
set_array_nan##BITS(probs, (size_t)(pend - probs)); \
probs = pend; \
} \
}
}

MAKE_UNPHASED_GENOTYPE(64, double)
MAKE_UNPHASED_GENOTYPE(32, float)

static char* decompress(struct bgen_file* bgen_file)
{
Expand Down
1 change: 1 addition & 0 deletions src/layout2.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ struct bgen_genotype;

int bgen_layout2_read_header(struct bgen_file* bgen_file, struct bgen_genotype* genotype);
void bgen_layout2_read_genotype64(struct bgen_genotype* genotype, double* probs);
void bgen_layout2_read_genotype32(struct bgen_genotype* genotype, float* probs);

#endif
24 changes: 16 additions & 8 deletions test/src/assert_interface_2.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,22 @@ int main(void)
cass_equal_int(bgen_genotype_ncombs(genotype), 3);
cass_equal_int(bgen_genotype_phased(genotype), 0);

double* probs = malloc(500 * 3 * sizeof(double));
bgen_genotype_read(genotype, probs);
cass_close(probs[0], 0.00488311054141488121);
cass_close(probs[1], 0.02838308002197399704);
cass_close(probs[2], 0.96673380943661113562);
cass_close(probs[3], 0.99047793444424092613);

free(probs);
double* probs64 = malloc(500 * 3 * sizeof(*probs64));
bgen_genotype_read(genotype, probs64);
cass_close(probs64[0], 0.00488311054141488121);
cass_close(probs64[1], 0.02838308002197399704);
cass_close(probs64[2], 0.96673380943661113562);
cass_close(probs64[3], 0.99047793444424092613);

float* probs32 = malloc(500 * 3 * sizeof(*probs32));
bgen_genotype_read32(genotype, probs32);
cass_close2(probs32[0], 0.00488311054141488121, 1e-7, 0);
cass_close2(probs32[1], 0.02838308002197399704, 1e-7, 0);
cass_close2(probs32[2], 0.96673380943661113562, 1e-7, 0);
cass_close2(probs32[3], 0.99047793444424092613, 1e-7, 0);

free(probs64);
free(probs32);
bgen_partition_destroy(partition);
bgen_genotype_close(genotype);
bgen_metafile_close(metafile);
Expand Down
Loading

0 comments on commit 9347daa

Please sign in to comment.