diff --git a/include/bgen/genotype.h b/include/bgen/genotype.h index 49b135a..c609c83 100644 --- a/include/bgen/genotype.h +++ b/include/bgen/genotype.h @@ -50,6 +50,21 @@ BGEN_EXPORT int bgen_genotype_read(struct bgen_genotype* genotype, double* proba * @return `0` if it succeeds; `1` otherwise. */ BGEN_EXPORT int bgen_genotype_read64(struct bgen_genotype* genotype, double* probabilities); +/** Read the probabilities of each possible genotype (32-bits). + * + * The length of this array is equal to the product of the values obtained by calling + * the functions @ref bgen_file_nsamples and @ref bgen_genotype_ncombs. + * \rst + * .. seealso:: + * Please, refer to the corresponding section **Probability data storage** of the + * |bgen format specification| for more information. + * \endrst + * + * @param genotype Variant genotype handler. + * @param probabilities Array of probabilities. + * @return `0` if it succeeds; `1` otherwise. + */ +BGEN_EXPORT int bgen_genotype_read32(struct bgen_genotype* genotype, float* probabilities); /** Get the number of alleles. * * @param genotype Variant genotype handler. diff --git a/src/genotype.c b/src/genotype.c index 47b49e9..668947b 100644 --- a/src/genotype.c +++ b/src/genotype.c @@ -30,6 +30,19 @@ int bgen_genotype_read64(struct bgen_genotype* genotype, double* probabilities) return bgen_genotype_read(genotype, probabilities); } +int bgen_genotype_read32(struct bgen_genotype* genotype, float* probabilities) +{ + if (genotype->layout == 1) { + bgen_layout1_read_genotype32(genotype, probabilities); + } else if (genotype->layout == 2) { + bgen_layout2_read_genotype32(genotype, probabilities); + } else { + bgen_error("unrecognized layout type %d", genotype->layout); + return 1; + } + return 0; +} + uint16_t bgen_genotype_nalleles(struct bgen_genotype const* genotype) { return genotype->nalleles; diff --git a/src/layout2.c b/src/layout2.c index 96d961c..9a68c44 100644 --- a/src/layout2.c +++ b/src/layout2.c @@ -11,8 +11,10 @@ #define BIT(var, bit) ((var & (1 << bit)) != 0) -static void read_phased_genotype(struct bgen_genotype* genotype, double* probs); -static void read_unphased_genotype(struct bgen_genotype* genotype, double* probs); +static void read_phased_genotype64(struct bgen_genotype* genotype, double* probs); +static void read_phased_genotype32(struct bgen_genotype* genotype, float* probs); +static void read_unphased_genotype64(struct bgen_genotype* genotype, double* probs); +static void read_unphased_genotype32(struct bgen_genotype* genotype, float* probs); static char* decompress(struct bgen_file* bgen_file); static inline uint8_t read_ploidy(uint8_t ploidy_miss) { return ploidy_miss & 127; } @@ -31,6 +33,12 @@ static inline void set_array_nan64(double* p, size_t n) p[i] = NAN; } +static inline void set_array_nan32(float* p, size_t n) +{ + for (size_t i = 0; i < n; ++i) + p[i] = NAN; +} + int bgen_layout2_read_header(struct bgen_file* bgen_file, struct bgen_genotype* genotype) { uint32_t nsamples = 0; @@ -123,112 +131,129 @@ int bgen_layout2_read_header(struct bgen_file* bgen_file, struct bgen_genotype* void bgen_layout2_read_genotype64(struct bgen_genotype* genotype, double* probs) { if (genotype->phased) - read_phased_genotype(genotype, probs); + read_phased_genotype64(genotype, probs); else - read_unphased_genotype(genotype, probs); + read_unphased_genotype64(genotype, probs); } -static void read_phased_genotype(struct bgen_genotype* genotype, double* probs) +void bgen_layout2_read_genotype32(struct bgen_genotype* genotype, float* probs) { - unsigned nbits = genotype->nbits; - unsigned nalleles = genotype->nalleles; - uint8_t max_ploidy = genotype->max_ploidy; - double denom = (double)((((uint64_t)1 << nbits)) - 1); - - uint64_t sample_start = 0; - for (uint32_t j = 0; j < genotype->nsamples; ++j) { - unsigned ploidy = read_ploidy(genotype->ploidy_missingness[j]); - double* pend = probs + max_ploidy * nalleles; - - if (read_missingness(genotype->ploidy_missingness[j]) != 0) { - set_array_nan64(probs, (size_t)(pend - probs)); - probs = pend; - sample_start += nbits * ploidy * (nalleles - 1); - continue; - } - - uint64_t haplo_start = 0; - for (uint8_t i = 0; i < ploidy; ++i) { - - uint64_t uip_sum = 0; - uint64_t allele_start = 0; - for (uint16_t ii = 0; ii < nalleles - 1; ++ii) { - - uint64_t ui_prob = 0; - uint64_t offset = sample_start + haplo_start + allele_start; - - for (uint8_t bi = 0; bi < nbits; ++bi) { - - if (get_bit(genotype->chunk_ptr, bi + offset)) { - ui_prob |= ((uint64_t)1 << bi); - } - } - - *probs = (double)ui_prob / denom; - ++probs; - uip_sum += ui_prob; - allele_start += nbits; - } - *probs = (denom - (double)uip_sum) / denom; - ++probs; - haplo_start += nbits * (nalleles - 1); - } - sample_start += nbits * ploidy * (nalleles - 1); - set_array_nan64(probs, (size_t)(pend - probs)); - probs = pend; - } + if (genotype->phased) + read_phased_genotype32(genotype, probs); + else + read_unphased_genotype32(genotype, probs); } -static void read_unphased_genotype(struct bgen_genotype* genotype, double* probs) -{ - uint8_t nbits = genotype->nbits; - uint16_t nalleles = genotype->nalleles; - uint8_t max_ploidy = genotype->max_ploidy; - - double denom = (double)((((uint64_t)1 << nbits)) - 1); - unsigned max_ncombs = - choose(nalleles + (unsigned)(max_ploidy - 1), (unsigned)(nalleles - 1)); - - uint64_t sample_start = 0; - for (uint32_t j = 0; j < genotype->nsamples; ++j) { - double* pend = probs + max_ncombs; - uint8_t ploidy = read_ploidy(genotype->ploidy_missingness[j]); - unsigned ncombs = choose(nalleles + (unsigned)(ploidy - 1), (unsigned)(nalleles - 1)); - - if (read_missingness(genotype->ploidy_missingness[j]) != 0) { - set_array_nan64(probs, (size_t)(pend - probs)); - probs = pend; - sample_start += nbits * (ncombs - 1); - continue; - } - - uint64_t uip_sum = 0; - - uint64_t geno_start = 0; - for (uint8_t i = 0; i < (size_t)(ncombs - 1); ++i) { - - uint64_t ui_prob = 0; - uint64_t offset = sample_start + geno_start; - - for (uint8_t bi = 0; bi < (size_t)nbits; ++bi) { - - if (get_bit(genotype->chunk_ptr, bi + offset)) { - ui_prob |= ((uint64_t)1 << bi); - } - } +#define MAKE_READ_PHASED_GENOTYPE(BITS, FPTYPE) \ + static void read_phased_genotype##BITS(struct bgen_genotype* genotype, FPTYPE* probs) \ + { \ + unsigned nbits = genotype->nbits; \ + unsigned nalleles = genotype->nalleles; \ + uint8_t max_ploidy = genotype->max_ploidy; \ + FPTYPE denom = (FPTYPE)((((uint64_t)1 << nbits)) - 1); \ + \ + uint64_t sample_start = 0; \ + for (uint32_t j = 0; j < genotype->nsamples; ++j) { \ + unsigned ploidy = read_ploidy(genotype->ploidy_missingness[j]); \ + FPTYPE* pend = probs + max_ploidy * nalleles; \ + \ + if (read_missingness(genotype->ploidy_missingness[j]) != 0) { \ + set_array_nan##BITS(probs, (size_t)(pend - probs)); \ + probs = pend; \ + sample_start += nbits * ploidy * (nalleles - 1); \ + continue; \ + } \ + \ + uint64_t haplo_start = 0; \ + for (uint8_t i = 0; i < ploidy; ++i) { \ + \ + uint##BITS##_t uip_sum = 0; \ + uint64_t allele_start = 0; \ + for (uint16_t ii = 0; ii < nalleles - 1; ++ii) { \ + \ + uint64_t ui_prob = 0; \ + uint64_t offset = sample_start + haplo_start + allele_start; \ + \ + for (uint8_t bi = 0; bi < nbits; ++bi) { \ + \ + if (get_bit(genotype->chunk_ptr, bi + offset)) { \ + ui_prob |= ((uint64_t)1 << bi); \ + } \ + } \ + \ + *probs = (FPTYPE)ui_prob / denom; \ + ++probs; \ + uip_sum += ui_prob; \ + allele_start += nbits; \ + } \ + *probs = (denom - (FPTYPE)uip_sum) / denom; \ + ++probs; \ + haplo_start += nbits * (nalleles - 1); \ + } \ + sample_start += nbits * ploidy * (nalleles - 1); \ + set_array_nan##BITS(probs, (size_t)(pend - probs)); \ + probs = pend; \ + } \ + } - *probs = (double)ui_prob / denom; - ++probs; - uip_sum += ui_prob; - geno_start += nbits; - } - *probs = (denom - (double)uip_sum) / denom; - ++probs; - sample_start += nbits * (ncombs - 1); - set_array_nan64(probs, (size_t)(pend - probs)); - probs = pend; +MAKE_READ_PHASED_GENOTYPE(64, double) +MAKE_READ_PHASED_GENOTYPE(32, float) + +#define MAKE_UNPHASED_GENOTYPE(BITS, FPTYPE) \ + static void read_unphased_genotype##BITS(struct bgen_genotype* genotype, FPTYPE* probs) \ + { \ + uint8_t nbits = genotype->nbits; \ + uint16_t nalleles = genotype->nalleles; \ + uint8_t max_ploidy = genotype->max_ploidy; \ + \ + FPTYPE denom = (FPTYPE)((((uint64_t)1 << nbits)) - 1); \ + unsigned max_ncombs = \ + choose(nalleles + (unsigned)(max_ploidy - 1), (unsigned)(nalleles - 1)); \ + \ + uint64_t sample_start = 0; \ + for (uint32_t j = 0; j < genotype->nsamples; ++j) { \ + FPTYPE* pend = probs + max_ncombs; \ + uint8_t ploidy = read_ploidy(genotype->ploidy_missingness[j]); \ + unsigned ncombs = \ + choose(nalleles + (unsigned)(ploidy - 1), (unsigned)(nalleles - 1)); \ + \ + if (read_missingness(genotype->ploidy_missingness[j]) != 0) { \ + set_array_nan##BITS(probs, (size_t)(pend - probs)); \ + probs = pend; \ + sample_start += nbits * (ncombs - 1); \ + continue; \ + } \ + \ + uint##BITS##_t uip_sum = 0; \ + \ + uint64_t geno_start = 0; \ + for (uint8_t i = 0; i < (uint8_t)(ncombs - 1); ++i) { \ + \ + uint##BITS##_t ui_prob = 0; \ + uint64_t offset = sample_start + geno_start; \ + \ + for (uint8_t bi = 0; bi < (uint8_t)nbits; ++bi) { \ + \ + if (get_bit(genotype->chunk_ptr, bi + offset)) { \ + ui_prob |= ((uint##BITS##_t)1 << bi); \ + } \ + } \ + \ + *probs = (FPTYPE)ui_prob / denom; \ + ++probs; \ + uip_sum += ui_prob; \ + geno_start += nbits; \ + } \ + *probs = (denom - (FPTYPE)uip_sum) / denom; \ + ++probs; \ + sample_start += nbits * (ncombs - 1); \ + set_array_nan##BITS(probs, (size_t)(pend - probs)); \ + probs = pend; \ + } \ } -} + +MAKE_UNPHASED_GENOTYPE(64, double) +MAKE_UNPHASED_GENOTYPE(32, float) static char* decompress(struct bgen_file* bgen_file) { diff --git a/src/layout2.h b/src/layout2.h index d27719e..0d65156 100644 --- a/src/layout2.h +++ b/src/layout2.h @@ -6,5 +6,6 @@ struct bgen_genotype; int bgen_layout2_read_header(struct bgen_file* bgen_file, struct bgen_genotype* genotype); void bgen_layout2_read_genotype64(struct bgen_genotype* genotype, double* probs); +void bgen_layout2_read_genotype32(struct bgen_genotype* genotype, float* probs); #endif diff --git a/test/src/assert_interface_2.c b/test/src/assert_interface_2.c index 5c146c4..e61646e 100644 --- a/test/src/assert_interface_2.c +++ b/test/src/assert_interface_2.c @@ -61,14 +61,22 @@ int main(void) cass_equal_int(bgen_genotype_ncombs(genotype), 3); cass_equal_int(bgen_genotype_phased(genotype), 0); - double* probs = malloc(500 * 3 * sizeof(double)); - bgen_genotype_read(genotype, probs); - cass_close(probs[0], 0.00488311054141488121); - cass_close(probs[1], 0.02838308002197399704); - cass_close(probs[2], 0.96673380943661113562); - cass_close(probs[3], 0.99047793444424092613); - - free(probs); + double* probs64 = malloc(500 * 3 * sizeof(*probs64)); + bgen_genotype_read(genotype, probs64); + cass_close(probs64[0], 0.00488311054141488121); + cass_close(probs64[1], 0.02838308002197399704); + cass_close(probs64[2], 0.96673380943661113562); + cass_close(probs64[3], 0.99047793444424092613); + + float* probs32 = malloc(500 * 3 * sizeof(*probs32)); + bgen_genotype_read32(genotype, probs32); + cass_close2(probs32[0], 0.00488311054141488121, 1e-7, 0); + cass_close2(probs32[1], 0.02838308002197399704, 1e-7, 0); + cass_close2(probs32[2], 0.96673380943661113562, 1e-7, 0); + cass_close2(probs32[3], 0.99047793444424092613, 1e-7, 0); + + free(probs64); + free(probs32); bgen_partition_destroy(partition); bgen_genotype_close(genotype); bgen_metafile_close(metafile); diff --git a/test/src/complex.c b/test/src/complex.c index b8316c0..9170129 100644 --- a/test/src/complex.c +++ b/test/src/complex.c @@ -2,15 +2,17 @@ #include "cass.h" #include -void test_complex(void); +void test_complex64(void); +void test_complex32(void); int main(void) { - test_complex(); + test_complex64(); + test_complex32(); return cass_status(); } -void test_complex(void) +void test_complex64(void) { const char filename[] = "data/complex.23bits.bgen"; struct bgen_file* bgen; @@ -114,7 +116,7 @@ void test_complex(void) struct bgen_genotype* vg = bgen_file_open_genotype(bgen, vm->genotype_offset); cass_cond(bgen_genotype_phased(vg) == phased[i]); - probabilities = malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(double)); + probabilities = malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(*probabilities)); bgen_genotype_read(vg, probabilities); double* p = probabilities; @@ -138,3 +140,132 @@ void test_complex(void) cass_cond(bgen_metafile_close(mf) == 0); bgen_file_close(bgen); } + +void test_complex32(void) +{ + const char filename[] = "data/complex.23bits.bgen"; + struct bgen_file* bgen; + uint32_t nsamples, nvariants; + float* probabilities; + + cass_cond((bgen = bgen_file_open(filename)) != NULL); + cass_cond((nsamples = bgen_file_nsamples(bgen)) == 4); + cass_cond((nvariants = bgen_file_nvariants(bgen)) == 10); + + struct bgen_samples* samples = bgen_file_read_samples(bgen); + + cass_cond(bgen_string_equal(BGEN_STRING("sample_0"), *bgen_samples_get(samples, 0))); + cass_cond(bgen_string_equal(BGEN_STRING("sample_3"), *bgen_samples_get(samples, 3))); + + bgen_samples_destroy(samples); + + struct bgen_metafile* mf = + bgen_metafile_create(bgen, "complex.tmp/complex.23bits.bgen.metafile", 1, 0); + + struct bgen_partition const* partition = bgen_metafile_read_partition(mf, 0); + cass_cond(nvariants == 10); + + struct bgen_variant const* vm = bgen_partition_get_variant(partition, 0); + cass_cond(bgen_string_equal(BGEN_STRING("V1"), *vm->rsid)); + vm = bgen_partition_get_variant(partition, 9); + cass_cond(bgen_string_equal(BGEN_STRING("M10"), *vm->rsid)); + + uint32_t position[] = {1, 2, 3, 4, 5, 7, 7, 8, 9, 10}; + uint16_t correct_nalleles[] = {2, 2, 2, 3, 2, 4, 6, 7, 8, 2}; + char* allele_ids[] = {"A", "G", "A", "G", "A", "G", "A", "G", + "T", "A", "G", "A", "G", "GT", "GTT", "A", + "G", "GT", "GTT", "GTTT", "GTTTT", "A", "G", "GT", + "GTT", "GTTT", "GTTTT", "GTTTTT", "A", "G", "GT", "GTT", + "GTTT", "GTTTT", "GTTTTT", "GTTTTTT", "A", "G"}; + + uint32_t jj = 0; + for (uint32_t i = 0; i < nvariants; ++i) { + vm = bgen_partition_get_variant(partition, i); + cass_cond(vm->nalleles == correct_nalleles[i]); + cass_cond(vm->position == position[i]); + + for (uint16_t j = 0; j < vm->nalleles; ++j) { + + cass_cond(bgen_string_equal(BGEN_STRING(allele_ids[jj]), *vm->allele_ids[j])); + ++jj; + } + } + + int phased[] = {0, 1, 1, 0, 1, 1, 1, 1, 0, 0}; + + int ploidys[] = {1, 2, 2, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 1, 3, 3, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 4, 4, 4, 4}; + float real_probs[] = { + 1.000000, 0.000000, NAN, 1.000000, 0.000000, 0.000000, 1.000000, 0.000000, + 0.000000, 0.000000, 1.000000, 0.000000, 1.000000, 0.000000, 1.000000, 0.000000, + 1.000000, 0.000000, 0.000000, 1.000000, 1.000000, 0.000000, NAN, NAN, + 0.000000, 1.000000, 0.000000, 1.000000, 1.000000, 0.000000, 1.000000, 0.000000, + 1.000000, 0.000000, 0.000000, 1.000000, 1.000000, 0.000000, 0.000000, NAN, + NAN, NAN, 1.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 1.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 1.000000, 0.000000, 0.000000, 1.000000, 0.000000, NAN, NAN, + NAN, NAN, 1.000000, 0.000000, 1.000000, 0.000000, 1.000000, 0.000000, + 1.000000, 0.000000, 1.000000, 0.000000, 0.000000, 1.000000, 1.000000, 0.000000, + 0.000000, 1.000000, NAN, NAN, 1.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 1.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 1.000000, 1.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 1.000000, 1.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, + 1.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 1.000000, 0.000000, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000, 0.000000, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, + NAN, NAN, NAN, NAN, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, + 1.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, + 0.000000, 0.000000, 0.000000, 0.000000, 1.000000, 0.000000, 0.000000, 0.000000, + 0.000000, 0.000000, 1.000000, 0.000000}; + + float* rp = real_probs; + + jj = 0; + for (uint32_t i = 0; i < nvariants; ++i) { + vm = bgen_partition_get_variant(partition, i); + struct bgen_genotype* vg = bgen_file_open_genotype(bgen, vm->genotype_offset); + cass_cond(bgen_genotype_phased(vg) == phased[i]); + + probabilities = malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(*probabilities)); + bgen_genotype_read32(vg, probabilities); + + float* p = probabilities; + for (uint16_t j = 0; j < nsamples; ++j) { + + cass_cond(ploidys[jj] == bgen_genotype_ploidy(vg, j)); + cass_cond(bgen_genotype_missing(vg, j) == 0); + + for (unsigned ii = 0; ii < bgen_genotype_ncombs(vg); ++ii) { + cass_cond(!(*rp != *p && !(isnan(*rp) && isnan(*p)))); + ++rp; + ++p; + } + ++jj; + } + bgen_genotype_close(vg); + free(probabilities); + } + + bgen_partition_destroy(partition); + cass_cond(bgen_metafile_close(mf) == 0); + bgen_file_close(bgen); +} diff --git a/test/src/haplotype.c b/test/src/haplotype.c index 15c0bcd..24387f5 100644 --- a/test/src/haplotype.c +++ b/test/src/haplotype.c @@ -5,15 +5,17 @@ #include #include -void test_haplotype(void); +void test_haplotype64(void); +void test_haplotype32(void); int main(void) { - test_haplotype(); + test_haplotype64(); + test_haplotype32(); return cass_status(); } -void test_haplotype(void) +void test_haplotype64(void) { const char filename[] = "data/haplotypes.bgen"; struct bgen_file* bgen; @@ -71,7 +73,8 @@ void test_haplotype(void) vm = bgen_partition_get_variant(partition, i); vg = bgen_file_open_genotype(bgen, vm->genotype_offset); - double* probabilities = malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(double)); + double* probabilities = + malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(*probabilities)); cass_cond(bgen_genotype_read(vg, probabilities) == 0); @@ -89,3 +92,81 @@ void test_haplotype(void) cass_cond(bgen_metafile_close(mf) == 0); bgen_file_close(bgen); } + +void test_haplotype32(void) +{ + const char filename[] = "data/haplotypes.bgen"; + struct bgen_file* bgen; + uint32_t nsamples, nvariants; + + cass_cond((bgen = bgen_file_open(filename)) != NULL); + cass_cond((nsamples = bgen_file_nsamples(bgen)) == 4); + cass_cond((nvariants = bgen_file_nvariants(bgen)) == 4); + + struct bgen_samples* samples = bgen_file_read_samples(bgen); + + cass_cond(bgen_string_equal(BGEN_STRING("sample_0"), *bgen_samples_get(samples, 0))); + + bgen_samples_destroy(samples); + + struct bgen_metafile* mf = + bgen_metafile_create(bgen, "haplotype.tmp/complex.23bits.bgen.metafile", 1, 0); + + cass_cond(mf != NULL); + cass_cond(bgen_metafile_npartitions(mf) == 1); + + struct bgen_partition const* partition = bgen_metafile_read_partition(mf, 0); + cass_cond(partition != NULL); + + struct bgen_variant const* vm = bgen_partition_get_variant(partition, 0); + cass_cond(bgen_string_equal(BGEN_STRING("RS1"), *vm->rsid)); + cass_cond(vm->nalleles == 2); + + for (uint32_t i = 0; i < nvariants; ++i) { + vm = bgen_partition_get_variant(partition, i); + cass_cond(bgen_string_equal(BGEN_STRING("A"), *vm->allele_ids[0])); + cass_cond(bgen_string_equal(BGEN_STRING("G"), *vm->allele_ids[1])); + } + + vm = bgen_partition_get_variant(partition, 1); + struct bgen_genotype* vg = bgen_file_open_genotype(bgen, vm->genotype_offset); + + cass_cond(bgen_genotype_ncombs(vg) == 4); + cass_cond(bgen_genotype_ploidy(vg, 0) == 2); + cass_cond(bgen_genotype_min_ploidy(vg) == 2); + cass_cond(bgen_genotype_max_ploidy(vg) == 2); + cass_cond(bgen_genotype_nalleles(vg) == 2); + cass_cond(bgen_genotype_phased(vg) == 1); + + bgen_genotype_close(vg); + + float real_probs[] = {1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0}; + + uint32_t jj = 0; + for (uint32_t i = 0; i < nvariants; ++i) { + vm = bgen_partition_get_variant(partition, i); + vg = bgen_file_open_genotype(bgen, vm->genotype_offset); + + float* probabilities = + malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(*probabilities)); + + cass_cond(bgen_genotype_read32(vg, probabilities) == 0); + + for (uint32_t ii = 0; ii < nsamples; ++ii) { + for (unsigned j = 0; j < bgen_genotype_ncombs(vg); ++j) { + cass_cond(probabilities[ii * bgen_genotype_ncombs(vg) + j] == real_probs[jj]); + jj++; + } + } + bgen_genotype_close(vg); + free(probabilities); + } + + bgen_partition_destroy(partition); + cass_cond(bgen_metafile_close(mf) == 0); + bgen_file_close(bgen); +} diff --git a/test/src/open_genotype.c b/test/src/open_genotype.c index 4b60349..570fae3 100644 --- a/test/src/open_genotype.c +++ b/test/src/open_genotype.c @@ -194,7 +194,7 @@ void test_geno(void) vg = bgen_file_open_genotype(bgen, vm->genotype_offset); double* probabilities = - malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(double)); + malloc(nsamples * bgen_genotype_ncombs(vg) * sizeof(*probabilities)); double* p = probabilities; bgen_genotype_read(vg, probabilities); diff --git a/test/src/v12_zlib_layout2.c b/test/src/v12_zlib_layout2.c index 9c1fdcc..e3b1824 100644 --- a/test/src/v12_zlib_layout2.c +++ b/test/src/v12_zlib_layout2.c @@ -10,8 +10,10 @@ unsigned get_nexamples(void); unsigned ipow(unsigned base, unsigned exp); void test_read_metadata(struct bgen_file* bgen, struct bgen_samples* samples, struct bgen_metafile* metafile); -void test_read_probabilities(struct bgen_file* bgen, struct bgen_metafile* metafile, - uint32_t nsamples, unsigned prec); +void test_read_probabilities64(struct bgen_file* bgen, struct bgen_metafile* metafile, + uint32_t nsamples, unsigned prec); +void test_read_probabilities32(struct bgen_file* bgen, struct bgen_metafile* metafile, + uint32_t nsamples, unsigned prec); void test_read(struct bgen_file* bgen, struct bgen_metafile* metafile, unsigned precision); int main(void) @@ -75,8 +77,8 @@ void test_read_metadata(struct bgen_file* bgen, struct bgen_samples* samples, bgen_partition_destroy(partition); } -void test_read_probabilities(struct bgen_file* bgen, struct bgen_metafile* metafile, - uint32_t nsamples, unsigned prec) +void test_read_probabilities64(struct bgen_file* bgen, struct bgen_metafile* metafile, + uint32_t nsamples, unsigned prec) { double prob[3]; double abs_tol = 1. / ipow(2, prec) + 1. / ipow(2, prec) / 3.; @@ -100,7 +102,7 @@ void test_read_probabilities(struct bgen_file* bgen, struct bgen_metafile* metaf unsigned ncombs = bgen_genotype_ncombs(vg); cass_cond(ncombs == 3); - double* probabilities = calloc(nsamples * ncombs, sizeof(double)); + double* probabilities = calloc(nsamples * ncombs, sizeof(*probabilities)); cass_cond(bgen_genotype_read(vg, probabilities) == 0); @@ -143,13 +145,82 @@ void test_read_probabilities(struct bgen_file* bgen, struct bgen_metafile* metaf fclose(f); } +void test_read_probabilities32(struct bgen_file* bgen, struct bgen_metafile* metafile, + uint32_t nsamples, unsigned prec) +{ + double prob[3]; + double abs_tol = 1. / ipow(2, prec) + 1. / ipow(2, prec) / 3.; + double rel_tol = 1e-09; + int e; + size_t i, j; + + FILE* f = fopen("data/example.matrix", "r"); + cass_cond(f != NULL); + + uint32_t ii = 0; + i = 0; + for (uint32_t part = 0; part < bgen_metafile_npartitions(metafile); ++part) { + struct bgen_partition const* partition = bgen_metafile_read_partition(metafile, part); + for (ii = 0; ii < bgen_partition_nvariants(partition); ++ii, ++i) { + struct bgen_variant const* vm = bgen_partition_get_variant(partition, ii); + struct bgen_genotype* vg = bgen_file_open_genotype(bgen, vm->genotype_offset); + + cass_cond(bgen_genotype_max_ploidy(vg) == 2); + + unsigned ncombs = bgen_genotype_ncombs(vg); + cass_cond(ncombs == 3); + + float* probabilities = calloc(nsamples * ncombs, sizeof(*probabilities)); + + cass_cond(bgen_genotype_read32(vg, probabilities) == 0); + + for (j = 0; j < 500; ++j) { + + char string[100]; + char* tmp = NULL; + + e = fscanf(f, "%s", string); + cass_cond(e == 1); + prob[0] = strtod(string, &tmp); + + e = fscanf(f, "%s", string); + cass_cond(e == 1); + prob[1] = strtod(string, &tmp); + + e = fscanf(f, "%s", string); + cass_cond(e == 1); + prob[2] = strtod(string, &tmp); + + if ((prob[0] == 0) && (prob[1] == 0) && (prob[2] == 0)) { + prob[0] = NAN; + prob[1] = NAN; + prob[2] = NAN; + cass_cond(isnan(probabilities[j * 3 + 0])); + cass_cond(isnan(probabilities[j * 3 + 1])); + cass_cond(isnan(probabilities[j * 3 + 2])); + } else { + cass_close2(probabilities[j * 3 + 0], prob[0], rel_tol, abs_tol); + cass_close2(probabilities[j * 3 + 1], prob[1], rel_tol, abs_tol); + cass_close2(probabilities[j * 3 + 2], prob[2], rel_tol, abs_tol); + } + } + free(probabilities); + bgen_genotype_close(vg); + } + bgen_partition_destroy(partition); + } + + fclose(f); +} + void test_read(struct bgen_file* bgen, struct bgen_metafile* metafile, unsigned precision) { struct bgen_samples* samples = bgen_file_read_samples(bgen); test_read_metadata(bgen, samples, metafile); bgen_samples_destroy(samples); - test_read_probabilities(bgen, metafile, 500, precision); + test_read_probabilities64(bgen, metafile, 500, precision); + test_read_probabilities32(bgen, metafile, 500, precision); } const char* examples[] = {"data/example.1bits.bgen", "data/example.14bits.bgen",