-
Notifications
You must be signed in to change notification settings - Fork 0
/
vcfReader.cpp
1361 lines (1070 loc) · 44.6 KB
/
vcfReader.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include "vcfReader.h"
#include "jgtmat.h"
#include "bootstrap.h"
#include "metadata.h"
void bblocks_match_contig(bblocks_t* bblocks, const char* this_contigName, const size_t vcf_contig_i) {
size_t bb_contig_i;
if (bblocks->contig_names->find(this_contigName, &bb_contig_i)) {
if (bb_contig_i != vcf_contig_i) {
if (PROGRAM_HAS_INPUT_BLOCKS) {
ERROR("Block contigs do not have the same order as the VCF contigs. Please make sure the contigs in the block file are in the same order as the contigs in the VCF file.");
} else {
NEVER; // we generated the blocks; something went wrong
}
} // else OK
} else {
if (PROGRAM_HAS_INPUT_BLOCKS) {
ERROR("Contig %s not found in block file", this_contigName);
} else {
NEVER; // we generated the blocks; something went wrong
}
}
return;
}
gldata_t* gldata_init(void) {
gldata_t* gldata = (gldata_t*)malloc(sizeof(gldata_t));
ASSERT(gldata != NULL);
gldata->n_gls = 0;
gldata->n_ind = 0;
gldata->size = 0;
gldata->step_size = 0;
gldata->type = ARG_GLDATA_TYPE_UNMOD;
gldata->d = NULL;
gldata->mis = NULL;
return(gldata);
}
static gldata_t* gldata_alloc(const uint8_t n_gls, const size_t n_ind, const size_t init_n_sites, const uint8_t type) {
gldata_t* gldata = gldata_init();
gldata->n_gls = n_gls;
gldata->n_ind = n_ind;
gldata->size = init_n_sites;
gldata->step_size = init_n_sites;
gldata->type = type;
if (n_gls != 3) {
NEVER;
}
gldata->d = (double***)malloc(n_ind * sizeof(double**));
ASSERT(gldata->d != NULL);
gldata->mis = (bool**)malloc(n_ind * sizeof(bool*));
ASSERT(gldata->mis != NULL);
for (size_t i = 0;i < n_ind;++i) {
gldata->d[i] = (double**)malloc(init_n_sites * sizeof(double*));
ASSERT(gldata->d[i] != NULL);
for (size_t s = 0;s < init_n_sites;++s) {
gldata->d[i][s] = NULL;
// alloc in-place if !mis[i][s]
}
gldata->mis[i] = NULL;
gldata->mis[i] = (bool*)malloc(init_n_sites * sizeof(bool));
ASSERT(gldata->mis[i] != NULL);
for (size_t j = 0;j < init_n_sites;++j) {
gldata->mis[i][j] = true; // init to true to exclude the unused sites at the end if array size is larger than the actual number of sites
}
}
return(gldata);
}
static void gldata_realloc(gldata_t* gldata) {
const size_t new_size = gldata->size + gldata->step_size;
for (size_t i = 0; i < gldata->n_ind; ++i) {
gldata->d[i] = (double**)realloc(gldata->d[i], new_size * sizeof(double*));
ASSERT(gldata->d[i] != NULL);
for (size_t j = gldata->size; j < new_size; ++j) {
gldata->d[i][j] = NULL;
}
gldata->mis[i] = (bool*)realloc(gldata->mis[i], new_size * sizeof(bool));
ASSERT(gldata->mis[i] != NULL);
for (size_t j = gldata->size; j < new_size; ++j) {
gldata->mis[i][j] = true;
}
}
gldata->size = new_size;
return;
}
/// @param indends - array of pointers to individual ends (ptr to the last allocated site)
void gldata_destroy(gldata_t* gldata) {
for (size_t i = 0; i < gldata->n_ind; ++i) {
for (size_t j = 0; j < gldata->size; ++j) {
if (NULL != gldata->d[i][j]) {
// isnull if mis[i][j]
FREE(gldata->d[i][j]);
}
}
FREE(gldata->d[i]);
FREE(gldata->mis[i]);
}
FREE(gldata->d);
FREE(gldata->mis);
FREE(gldata);
return;
}
const char* vcfData::get_contig_name(void) {
const char* ret = bcf_hdr_id2name(this->hdr, this->rec ? this->rec->rid : -1);
if (NULL == ret) {
ERROR("Could not retrieve contig name for record %d.", this->rec->rid);
}
return (ret);
}
const char* vcfData::get_contig_name(const int32_t i) {
const char* ret = bcf_hdr_id2name(this->hdr, this->rec ? i : -1);
if (NULL == ret) {
ERROR("Could not retrieve contig name for record %d.", i);
}
return (ret);
}
// -1 missing
// -2
int require_unpack() {
int val = 0;
// BCF_UN_STR: if we need to fetch alleles from bcf->d.allele
if (2 == args->doDist) {
if (val < BCF_UN_STR) {
val = BCF_UN_STR;
}
} else if (1 == args->doDist) {
if (val < BCF_UN_STR) {
val = BCF_UN_STR;
}
}
// TODO set to all for now
val = BCF_UN_ALL;
return(val);
}
int require_index(paramStruct* pars) {
if (PROGRAM_HAS_INPUT_VCF) {
if (NULL != args->in_region) {
return (IDX_CSI);
}
}
if (PROGRAM_HAS_INPUT_DM) {
return (IDX_NONE);
}
return (IDX_NONE);
}
extern const int acgt_charToInt[256] = {
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 15
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 31
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 47
0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 63
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 79
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 95
4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 111
4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 127
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 143
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 159
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 175
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 191
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 207
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 223
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 239
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 255
};
// return 1: skip site (for all individuals)
// assume ref=allele1 (major/anc) and alt=allele2 (minor/der)
int read_site_with_alleles_ref_alt(vcfData* vcfd, paramStruct* pars, const bool contig_changed) {
const int64_t this_pos = vcfd->rec->pos;
const int nAlleles = vcfd->rec->n_allele;
bool allow_invar_site = false; //TODO determine based on analysistype and args
if (0 == nAlleles) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: No data found at site.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
if (!allow_invar_site) {
if (1 == nAlleles) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site is not variable.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
}
vcfd->allele_unobserved = -1;
for (int i = 0; i < nAlleles; ++i) {
// if allele is single char
if ('\0' == vcfd->rec->d.allele[i][1]) {
if ('*' == vcfd->rec->d.allele[i][0]) {
// '*' symbol = allele missing due to overlapping deletion
// source: VCF specs v4.3
IO::vprint(2, "Skipping site at %s:%ld. Reason: REF allele is set to '*' (allele missing due to overlapping deletion).\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
if ('.' == vcfd->rec->d.allele[i][0]) {
// '.' symbol = MISSING value (no variant)
// source: VCF specs v4.3
IO::vprint(2, "Skipping site at %s:%ld. Reason: REF allele is set to '*' (allele missing due to overlapping deletion).\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
continue;
}
// -> allele is not a single char
// if not a single char, it can be one of these:
// 1) An unobserved allele notation (<*> or <NON_REF>)
// 2) Insertion/deletion
// otherwise we do not support such a site
if (vcfd->rec->d.allele[i][0] != '<') {
if (0 == i) {
// if at allele REF and all chars are one of ACGT, it is indel
int j = 0;
while (vcfd->rec->d.allele[i][j] != '\0') {
if (vcfd->rec->d.allele[i][j] != 'A' && vcfd->rec->d.allele[i][j] != 'C' && vcfd->rec->d.allele[i][j] != 'G' && vcfd->rec->d.allele[i][j] != 'T') {
ERROR("Found bad allele '%s' at %s:%ld. If you believe this is a valid allele, please contact the developers.\n", vcfd->rec->d.allele[i], vcfd->get_contig_name(), this_pos + 1);
break;
}
j++;
}
IO::vprint(2, "Skipping site at %s:%ld. Reason: REF allele is an INDEL.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
ERROR("Found bad allele '%s' at %s:%ld. If you believe this is a valid allele, please contact the developers.\n", vcfd->rec->d.allele[i], vcfd->get_contig_name(), this_pos + 1);
}
// -> allele starts with '<'
if (3 == strlen(vcfd->rec->d.allele[i])) {
// if unobserved allele <*>
if ('*' == vcfd->rec->d.allele[i][1] && '>' == vcfd->rec->d.allele[i][2]) {
vcfd->allele_unobserved = i;
continue;
} else {
ERROR("Found bad allele '%s' at %s:%ld. If you believe this is a valid allele, please contact the developers.\n", vcfd->rec->d.allele[i], vcfd->get_contig_name(), this_pos + 1);
}
} else if (9 == strlen(vcfd->rec->d.allele[i])) {
if ('N' == vcfd->rec->d.allele[i][1] && 'O' == vcfd->rec->d.allele[i][2] && 'N' == vcfd->rec->d.allele[i][3] && '_' == vcfd->rec->d.allele[i][4] && 'R' == vcfd->rec->d.allele[i][5] && 'E' == vcfd->rec->d.allele[i][6] && 'F' == vcfd->rec->d.allele[i][7] && '>' == vcfd->rec->d.allele[i][8]) {
// if unobserved allele <NON_REF>
vcfd->allele_unobserved = i;
continue;
} else {
ERROR("Found bad allele '%s' at %s:%ld. If you believe this is a valid allele, please contact the developers.\n", vcfd->rec->d.allele[i], vcfd->get_contig_name(), this_pos + 1);
}
}
NEVER;
}
// if REF==ALLELE_UNOBSERVED
if (0 == vcfd->allele_unobserved) {
if (PROGRAM_WILL_USE_ALLELES_REF_ALT1) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Program will use REF allele as major allele, but REF allele is set to <NON_REF> or <*>.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
if (nAlleles > 1) {
ERROR("Sites with REF allele set to <NON_REF> or <*> are not supported in the current version of the program.");
//TODO this is only supported if major minor file or refalt file or an estimation method for these is defined; add proper check and handling
}
// check if ALT is set to any allele, maybe site has data but no ref allele specified
// if 1==nAlleles ALT is not set to any allele, so no data at site
// this is already skipped above
if (2 == nAlleles) {
// ALT is set but REF is not, so probably an invariable site with no REF allele specified
if (args->rmInvarSites) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site is not variable.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
}
// b1 = BASE_UNOBSERVED;
// b2 = acgt_charToInt[(int)vcfd->rec->d.allele[1][0]];
} else if (1 == vcfd->allele_unobserved) {
// if ALT==ALLELE_UNOBSERVED
if (PROGRAM_WILL_USE_ALLELES_REF_ALT1) {
// sample one allele to set as minor that is not equal to the major
// int base_major=acgt_charToInt[(int)vcfd->rec->d.allele[0][0]];
// int base_minor;
// drand48 interval: [0.0, 1.0) ->OK
// while((base_minor=floor(4* drand48()))==base_major);
// N.B. currently we can just move forward using NON_REF as minor allele since this makes no difference
}
if (2 == nAlleles) {
if (args->rmInvarSites) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site is not variable.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
}
} else if (-1 == vcfd->allele_unobserved) {
// -> no unobserved alt allele is found
if (1 == nAlleles) {
if (args->rmInvarSites) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site is not variable.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
} else {
// b2 = acgt_charToInt[(int)vcfd->rec->d.allele[1][0]];
// a2 = 1;
}
// b1 = acgt_charToInt[(int)vcfd->rec->d.allele[0][0]];
// a1 = 0;
} else {
// unobserved allele notation is found but is not a1 or a2, so we have at least 3 alleles (0,1, unobserved)
// b1 = acgt_charToInt[(int)vcfd->rec->d.allele[0][0]];
// a1 = 0;
// b2 = acgt_charToInt[(int)vcfd->rec->d.allele[1][0]];
// a2 = 1;
}
if (args->rmInvarSites && PROGRAM_WILL_USE_BCF_FMT_GL) {
int32_t* ads = NULL;
int32_t n_ads = 0;
int32_t n = 0;
n = bcf_get_format_int32(vcfd->hdr, vcfd->rec, "AD", &ads, &n_ads);
if (n <= 0) {
ERROR("Could not read AD tag from the VCF file.");
}
// make sure that AD is non-0 for at least 2 alleles
int n_non0[nAlleles];
for (int i = 0; i < nAlleles; ++i) {
n_non0[i] = 0;
}
for (size_t i = 0; i < pars->names->len; ++i) {
for (size_t j = 0; j < (size_t)nAlleles; ++j) {
if (ads[i * nAlleles + j] > 0) {
n_non0[j]++;
}
}
}
int tot_non0 = 0;
for (size_t i = 0; i < (size_t)nAlleles; ++i) {
if (n_non0[i] > 0) {
tot_non0++;
}
}
FREE(ads);
if (tot_non0 < 2) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site is not variable.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
}
if (PROGRAM_WILL_USE_ALLELES_REF_ALT1) {
return(0); // pars->a1a2 already initted to {0,1}
}
static alleles_t* alleles = NULL;
if (pars->majorminor != NULL) {
alleles = pars->majorminor;
} else if (pars->ancder != NULL) {
alleles = pars->ancder;
}
size_t pos_search_start = pars->alleles_posidx + 1; // start from the next position
if (-1 == pars->alleles_contigidx) {
// -> first contig, first time we are using alleles
const char* contigname = bcf_hdr_id2name(vcfd->hdr, vcfd->rec ? vcfd->rec->rid : -1);
if (NULL == contigname) {
ERROR("Could not retrieve contig name for record %d.", vcfd->rec->rid);
}
size_t cnamesidx;
if (alleles->cnames->find(contigname, &cnamesidx)) {
pars->alleles_contigidx = cnamesidx;
pars->alleles_posidx = alleles->cposidx->d[cnamesidx];
pos_search_start = pars->alleles_posidx; // start from the first pos of the contig instead
} else {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Contig %s not found in alleles file.\n", contigname, this_pos + 1, contigname);
return(1);
}
} else {
if (contig_changed) {
const char* contigname = bcf_hdr_id2name(vcfd->hdr, vcfd->rec ? vcfd->rec->rid : -1);
if (NULL == contigname) {
ERROR("Could not retrieve contig name for record %d.", vcfd->rec->rid);
}
// find the new contig, start with the last contig idx+1
size_t cnamesidx;
if (alleles->cnames->find_from(contigname, &cnamesidx, pars->alleles_contigidx)) {
pars->alleles_contigidx = cnamesidx;
pars->alleles_posidx = alleles->cposidx->d[cnamesidx];
pos_search_start = pars->alleles_posidx; // start from the first pos of the contig instead
} else {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Contig %s not found in alleles file.\n", contigname, this_pos + 1, contigname);
return(1);
}
}
}
// if not the last contig, only search until the start of next contig
size_t pos_search_end = (pars->alleles_contigidx < (int)alleles->cnames->len - 1) ? alleles->cposidx->d[pars->alleles_contigidx + 1] : alleles->pos->len;
bool foundSite;
foundSite = false;
for (size_t posidx = pos_search_start; posidx < pos_search_end; posidx++) {
if (alleles->pos->d[posidx] == (size_t)this_pos) {
pars->alleles_posidx = posidx;
foundSite = true;
break;
}
}
if (!foundSite) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site not found in alleles file.\n", vcfd->get_contig_name(), this_pos + 1);
return(1);
}
char allele1[2];
char allele2[2];
// the 64bit block containing the alleles for the position
uint64_t tmp = alleles->d[(pars->alleles_posidx / 16)];
// the index of the position in the 64bit block
uint64_t j = (pars->alleles_posidx % 16) * 4;
int baseint1 = (tmp >> j) & 3;
int baseint2 = (tmp >> (j + 2)) & 3;
*allele1 = "ACGT"[baseint1];
*allele2 = "ACGT"[baseint2];
// -> reset
pars->a1a2[0] = -1;
pars->a1a2[1] = -1;
// index of vcf allele in vcf->rec->d.allele that is ref according to alleles_t
for (size_t vcfal = 0; vcfal < vcfd->rec->n_allele; ++vcfal) {
// no need to check if allele is not single char
if ('\0' == vcfd->rec->d.allele[vcfal][1]) {
if (vcfd->rec->d.allele[vcfal][0] == *allele1) {
pars->a1a2[0] = vcfal;
// DEVPRINT("The allele1 (%c)'s position in vcf rec->d.allele (REF:%s ALT:%s) is %d", *allele1, vcfd->rec->d.allele[0], vcfd->rec->d.allele[1], pars->a1a2[0]);
}
if (vcfd->rec->d.allele[vcfal][0] == *allele2) {
pars->a1a2[1] = vcfal;
// DEVPRINT("The allele2 (%c)'s position in vcf rec->d.allele (REF:%s ALT:%s) is %d", *allele2, vcfd->rec->d.allele[0], vcfd->rec->d.allele[1], pars->a1a2[1]);
}
}
}
// -> make sure allele1 allele2 exists in vcfd->rec->d.allele
if (pars->a1a2[0] == -1) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Allele1 (%c) not found in vcf rec->d.allele (REF:%s ALT:%s).\n", vcfd->get_contig_name(), this_pos + 1, *allele1, vcfd->rec->d.allele[0], vcfd->rec->d.allele[1]);
return(1);
}
if (pars->a1a2[1] == -1) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Allele2 (%c) not found in vcf rec->d.allele (REF:%s ALT:%s).\n", vcfd->get_contig_name(), this_pos + 1, *allele2, vcfd->rec->d.allele[0], vcfd->rec->d.allele[1]);
return(1);
}
ASSERT(pars->a1a2[0] != pars->a1a2[1]);
return(0);
}
// return 1: skip site for all individuals
static int site_read_3GL(vcfData* vcfd, paramStruct* pars, const size_t block_i, const bool contig_changed) {
int ret;
if (0 != (ret = read_site_with_alleles_ref_alt(vcfd, pars, contig_changed))) {
return(ret);
}
const size_t site_idx = pars->nSites;
const int ngt_to_use = vcfd->nGT;
// DEVPRINT("n_gls: %d nGT:%d", lgls.n_gls, vcfd->nGT);
// ngls is observed number of genotypes as they appear in gl data
// ngt_to_use is what we need to construct the pairwise genotype counts matrix (i.e. 3 or 10)
float* lgls = NULL;
int size_e = 0;
int n_gls = 0;
int n_missing_ind = 0;
int n_values = bcf_get_format_float(vcfd->hdr, vcfd->rec, "GL", &lgls, &size_e);
if (n_values <= 0) {
ERROR("Could not read GL tag from the VCF file.");
}
if (n_values % pars->names->len != 0) {
NEVER;
}
n_gls = n_values / pars->names->len;
if (n_gls < ngt_to_use) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Number of GLs is less than the number of GLs needed to perform the specified analysis (%d).\n", vcfd->get_contig_name(), vcfd->rec->pos + 1, ngt_to_use);
return(1);
}
float* sample_lgls = NULL;
int skipSite = 0;
bool** isMissing = vcfd->gldata->mis;
uint64_t ordered_gt_fetch[3];
ordered_gt_fetch[0] = bcf_alleles2gt(pars->a1a2[0], pars->a1a2[0]);
ordered_gt_fetch[1] = bcf_alleles2gt(pars->a1a2[0], pars->a1a2[1]);
ordered_gt_fetch[2] = bcf_alleles2gt(pars->a1a2[1], pars->a1a2[1]);
double* iptr;
double max;
const size_t nInd = pars->names->len;
for (size_t i = 0; i < nInd; i++) {
isMissing[i][site_idx] = false;
sample_lgls = lgls + i * n_gls;
// ----------------------------------------------------------------- //
// -> check for missing data
if (bcf_float_is_missing(sample_lgls[0])) {
// missing data check 1
isMissing[i][site_idx] = true;
} else {
// missing data check 2
int z = 1;
for (int j = 1; j < n_gls; ++j) {
if (sample_lgls[0] == sample_lgls[j]) {
++z;
}
}
if (z == n_gls) {
// missing data (all gl values are the same)
isMissing[i][site_idx] = true;
}
}
if (isMissing[i][site_idx]) {
// if minInd 0 (the program will only use sites shared across all ividuals)
// skip site when you encounter any missing data
if (0 == args->minInd) {
skipSite = 1;
break;
}
// skip site if there are 2 inds and at least one of them is missing
if (2 == nInd) {
skipSite = 1;
break;
}
n_missing_ind++;
if (((int)i == (int)nInd - 1) && ((int)nInd == n_missing_ind)) {
// check if all ividuals are missing
skipSite = 1;
break;
}
// skip site if minInd is defined and #non-missing inds=<nInd
if (args->minInd != 2) {
if (((int)nInd - n_missing_ind) < args->minInd) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Number of non-missing ividuals is less than the minimum number of ividuals -minInd.\n", vcfd->get_contig_name(), vcfd->rec->pos + 1);
skipSite = 1;
break;
}
}
continue;
}
// -> not missing
DEVASSERT(!isMissing[i][site_idx]);
sample_lgls = lgls + (i * n_gls);
vcfd->gldata->d[i][site_idx] = (double*)malloc(n_gls * sizeof(double));
ASSERT(vcfd->gldata->d[i][site_idx] != NULL);
iptr = vcfd->gldata->d[i][site_idx];
if (vcfd->gldata->type == ARG_GLDATA_TYPE_UNMOD) {
iptr[0] = (double)sample_lgls[ordered_gt_fetch[0]];
iptr[1] = (double)sample_lgls[ordered_gt_fetch[1]];
iptr[2] = (double)sample_lgls[ordered_gt_fetch[2]];
} else {
if (vcfd->gldata->type == ARG_GLDATA_TYPE_LOG10GL) {
iptr[0] = (double)sample_lgls[ordered_gt_fetch[0]];
iptr[1] = (double)sample_lgls[ordered_gt_fetch[1]];
iptr[2] = (double)sample_lgls[ordered_gt_fetch[2]];
} else if (vcfd->gldata->type == ARG_GLDATA_TYPE_LNGL) {
iptr[0] = (double)LOG2LN(sample_lgls[ordered_gt_fetch[0]]);
iptr[1] = (double)LOG2LN(sample_lgls[ordered_gt_fetch[1]]);
iptr[2] = (double)LOG2LN(sample_lgls[ordered_gt_fetch[2]]);
} else {
NEVER;
}
max = iptr[0];
for (size_t i = 1; i < 3; i++) {
max = (max < iptr[i]) ? iptr[i] : max;
}
for (size_t i = 0; i < 3; i++) {
iptr[i] -= max;
}
}
}
FREE(lgls);
return(skipSite);
}
// return 1: skip site for all individuals
int site_read_GT(jgtmat_t* jgtmat, vcfData* vcfd, paramStruct* pars, const bool contig_changed) {
int ret;
if (0 != (ret = read_site_with_alleles_ref_alt(vcfd, pars, contig_changed))) {
return(ret);
}
const int nInd = pars->names->len;
int n_missing_ind = 0;
int32_t* gts = NULL;
int size_e = 0;
int n_values = bcf_get_genotypes(vcfd->hdr, vcfd->rec, >s, &size_e);
if (n_values <= 0) {
ERROR("Could not read GT tag from the VCF file.");
}
if (n_values % pars->names->len != 0) {
NEVER;
}
if ((n_values / pars->names->len) != PROGRAM_PLOIDY) {
ERROR("Ploidy %ld not supported.", n_values / pars->names->len);
}
int32_t* sample_gts = NULL;
bool hasData[nInd];
int skipSite = 0;
int32_t* i1_gts = NULL;
int32_t* i2_gts = NULL;
int8_t i1_nder = -1;
int8_t i2_nder = -1;
for (int indi = 0; indi < nInd; ++indi) {
hasData[indi] = false;
sample_gts = gts + indi * PROGRAM_PLOIDY;
if (1 == bcf_gt_is_missing(sample_gts[0]) || 1 == bcf_gt_is_missing(sample_gts[1])) {
// minInd 0
// only use sites shared across all individuals
// so skip site when you first encounter nan
if (0 == args->minInd) {
skipSite = 1;
break;
}
// skip site if there are 2 inds and at least one of them is missing
if (2 == nInd) {
skipSite = 1;
break;
}
n_missing_ind++;
// all individuals are missing
if (nInd == n_missing_ind) {
skipSite = 1;
break;
}
// skip site if minInd is defined and #non-missing inds=<nInd
if (args->minInd != 2) {
if ((nInd - n_missing_ind) < args->minInd) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Number of non-missing individuals is less than the minimum number of individuals -minInd.\n", vcfd->get_contig_name(), vcfd->rec->pos + 1);
skipSite = 1;
break;
}
}
continue;
} else {
// -> not missing
hasData[indi] = true;
}
}
if (!skipSite && args->rmInvarSites != 0) {
// -> rm invar sites based on GT
// @dragon
int tot_nder = 0;
for (int i = 0; i < nInd; ++i) {
if (!hasData[i]) {
continue;
}
i1_gts = gts + i * PROGRAM_PLOIDY;
if (bcf_gt_allele(i1_gts[0]) == pars->a1a2[1]) {
tot_nder++;
} else if (bcf_gt_allele(i1_gts[0]) == pars->a1a2[0]) {
//
} else {
continue;
}
if (bcf_gt_allele(i1_gts[1]) == pars->a1a2[1]) {
tot_nder++;
} else if (bcf_gt_allele(i1_gts[1]) == pars->a1a2[0]) {
//
} else {
continue;
}
}
if (tot_nder == 0) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site is not variable.\n", vcfd->get_contig_name(), vcfd->rec->pos + 1);
skipSite = 1;
} else if (tot_nder == nInd * PROGRAM_PLOIDY) {
IO::vprint(2, "Skipping site at %s:%ld. Reason: Site is not variable.\n", vcfd->get_contig_name(), vcfd->rec->pos + 1);
skipSite = 1;
}
}
do {
if (skipSite) {
break;
}
int gt1 = -1;
int gt2 = -1;
int pidx = 0;
int skipInd;
for (int i1 = 1; i1 < nInd; ++i1) {
if (!hasData[i1]) {
++pidx;
continue;
}
skipInd = 0;
i1_nder = 0;
i1_gts = gts + i1 * PROGRAM_PLOIDY;
gt1 = bcf_gt_allele(i1_gts[0]);
gt2 = bcf_gt_allele(i1_gts[1]);
if (gt1 == pars->a1a2[0]) {
// first base in genotype is ref allele
} else if (gt1 == pars->a1a2[1]) {
i1_nder++;
} else {
skipInd = 1;
}
if (gt2 == pars->a1a2[0]) {
// second base in genotype is ref allele
} else if (gt2 == pars->a1a2[1]) {
i1_nder++;
} else {
skipInd = 2;
}
if (skipInd) {
IO::vprint(2, "Skipping individual %d (%s) at site %s:%ld. Reason: Allele %d in genotype (%c) is not one of alleles to use (%c %c).", i1, vcfd->hdr->samples[i1], vcfd->get_contig_name(), vcfd->rec->pos + 1, skipInd, vcfd->rec->d.allele[skipInd][0], vcfd->rec->d.allele[0][0], vcfd->rec->d.allele[1][0]);
hasData[i1] = false;
++pidx;
continue;
}
for (int i2 = 0;i2 < i1; ++i2) {
if (!hasData[i2]) {
++pidx;
continue;
}
skipInd = 0;
i2_nder = 0;
i2_gts = gts + i2 * PROGRAM_PLOIDY;
gt1 = bcf_gt_allele(i2_gts[0]);
gt2 = bcf_gt_allele(i2_gts[1]);
if (gt1 == pars->a1a2[0]) {
// first base in genotype is ref allele
} else if (gt1 == pars->a1a2[1]) {
i2_nder++;
} else {
skipInd = 1;
}
if (gt2 == pars->a1a2[0]) {
// second base in genotype is ref allele
} else if (gt2 == pars->a1a2[1]) {
i2_nder++;
} else {
skipInd = 2;
}
if (skipInd) {
IO::vprint(2, "Skipping individual %d (%s) at site %s:%ld. Reason: Allele %d in genotype (%c) is not one of alleles to use (%c %c).", i1, vcfd->hdr->samples[i1], vcfd->get_contig_name(), vcfd->rec->pos + 1, skipInd, vcfd->rec->d.allele[skipInd][0], vcfd->rec->d.allele[0][0], vcfd->rec->d.allele[1][0]);
hasData[i2] = false;
++pidx;
continue;
}
jgtmat->m[pidx][(i1_nder * 3) + i2_nder]++;
++pidx;
}
}
} while (0);
FREE(gts);
return(skipSite);
}
vcfData* vcfData_init(paramStruct* pars, metadata_t* metadata) {
vcfData* vcfd = new vcfData;
vcfd->DO_BCF_UNPACK = require_unpack();
vcfd->in_fp = bcf_open(args->in_vcf_fn, "r");
if (vcfd->in_fp == NULL) {
ERROR("Could not open bcf file: %s\n", args->in_vcf_fn);
}
vcfd->hdr = bcf_hdr_read(vcfd->in_fp);
vcfd->rec = bcf_init();
if (require_index(pars) & IDX_CSI) {
vcfd->idx = IO::load_bcf_csi_idx(args->in_vcf_fn);
}
if (require_index(pars) & IDX_TBI) {
vcfd->tbx = IO::load_vcf_tabix_idx(args->in_vcf_fn);
vcfd->itr = tbx_itr_querys(vcfd->tbx, args->in_region);
}
if (NULL != args->in_region) {
vcfd->itr = bcf_itr_querys(vcfd->idx, vcfd->hdr, args->in_region);
if (NULL == vcfd->itr) {
ERROR("Could not parse region: %s. Please make sure region exists and defined in the correct format.", args->in_region);
}
}
if (NULL != args->in_regions_bed_fn) {
ERROR("Not implemented yet");
}
if (NULL != args->in_regions_tab_fn) {
ERROR("Not implemented yet");
}
if ((ARG_DODIST_CALCULATE == args->doDist)||(ARG_DODIST_WITH_BOOTSTRAP == args->doDist)) {
// EM using 3 GL values
if (args->doEM == 1) {
vcfd->nGT = 3;
vcfd->nJointClasses = 9;
}
// EM using 10 GL values
else if (args->doEM == 2) {
NEVER;//TODO
// vcfd->nGT = 10;
// vcfd->nJointClasses = 100;
} else {
if (args->bcfSrc & ARG_INTPLUS_BCFSRC_FMT_GL) {
NEVER; //TODO check this before here
}
}
} else if (2 == args->doDist) {
NEVER;
} else if (1 == args->doIbd) {
// using 3 GL values
vcfd->nGT = 3;
vcfd->nJointClasses = 9;
} else if (2 == args->doIbd) {
// GT
vcfd->nGT = 3;
vcfd->nJointClasses = 9;
} else {
NEVER;
}
const size_t in_vcf_nind = bcf_hdr_nsamples(vcfd->hdr);
if (metadata == NULL) {
pars->names = strArray_alloc(in_vcf_nind);
for (size_t i = 0;i < in_vcf_nind;++i) {
pars->names->add(vcfd->hdr->samples[i]);
}
DEVASSERT(pars->names->len == in_vcf_nind);
LOG("Found %ld individuals in the VCF file", pars->names->len);
} else {
// -> set pars with vcfdata
const size_t mtd_nind = metadata->indNames->len;
size_t midx;
bool badorder = false;
uint64_t samples_metad2vcf[mtd_nind];
size_t newvidx = 0;
for (size_t vidx = 0;vidx < in_vcf_nind;++vidx) {
if (metadata->indNames->find(vcfd->hdr->samples[vidx], &midx)) {
samples_metad2vcf[midx] = vidx;
if (newvidx != midx) {
badorder = true;
break;
}
++newvidx;
} else {
LOG("Skipping individual %s in the VCF file. Reason: Individual not present in metadata file.", vcfd->hdr->samples[vidx]);
}
}
if (newvidx != mtd_nind) {
ERROR("All individuals listed in metadata file must be present in the VCF file. Program could only find %ld out of %ld individuals.", newvidx, mtd_nind);
}
if (badorder) {
ERROR("The order of individuals in the VCF file does not match the order of individuals in the metadata file. Please sort the individuals in the metadata file to match the order of individuals in the VCF file.");
}
pars->names = metadata->indNames;
kstring_t tmp = KS_INITIALIZE;