From e19d38869a0dbcad89ceda989df23d7ae2cd5842 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Tue, 5 Jan 2016 12:04:31 +0000 Subject: [PATCH 01/17] added contamination src directory --- src/contamination/get_mixture.c | 498 ++++++++++++++++++++++++++++++++ 1 file changed, 498 insertions(+) create mode 100755 src/contamination/get_mixture.c diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c new file mode 100755 index 0000000..fcc5ffe --- /dev/null +++ b/src/contamination/get_mixture.c @@ -0,0 +1,498 @@ + /* File: get_mixture.c // computes maf distribution over variant positions from vcfq files + * Authors: designed by Irina Abnizova (ia1)edited by Steve Leonard (srl) + * + + Last edited: + + 28 April-to remove training printfs, to output mixture=0 if not any + 25 April: incorporate info about (100-mix)% to increase proly of detection + 24 April-output in infoAF 4 fields for each variant position, last is its actual coverage sDP4 + 22 April: max sginif or first max if not signif + all 100 AAF to see 'complimentary' + 10 April: if ( DP4[0] >= THR[0] && DP4[1]>= THR[0] && DP4[2]>= THR[1] && DP4[3]>= THR[1])// 0 ref alt are ok + thr_sk + 10 March 2014 for skewness to first 25% AAF + 3 feb-reads thr from a file; 29 Jan 2014 - get autom thr, starting with default ones 1,5 + still old fields of vcf input + 21 Nov 2013, for two thresholds + + *------------------------------------------------------------------- + * Description: computes maf distrib over variant positions from just extracted 6 fileld + + * Exported functions: + * HISTORY: + +skewness to first 25% of AAF ----------introduced 10 March +computes AF (alternative to Ref allele frequency) for each error base call from vcf + +applies minimum depth threshold, both for Ref and Alternative alleles fwd rev (total min depth will be +4*MIN_DEPTH + +input1 thrfile (thRR thAA) +input2: extracted vcf, + + +output1: info AF.txt= + +pos %AF Q30frac sumDP4 (for this position) +1546 100 0.89 101 +4469 100 0.86 90 +4532 100 0.78 87 +4559 97 0.95 60 + +output2: distrAF.tsigt=histogram of AF with significance for each bin +% count signif +8 0 0 +9 0 0 +10 385 1 +11 780 3 + + +usage:./get_mixture ra_i_j.thr extracted.vcf .infAF .distrAF + +*/ + +#include +//#include "conio.h" +#include +#include +#include + +// ******** predefined user constants, 22 october + +#define NRID 6 // Number of variant ids=fields: pos, chromosome, Depth,DP4,(no PV4+maf) in esigtracted_vcf files +#define NPAR 5 // masig Number of PARameters and arguments +#define Nthr 2 // Number of thresholds + + +// ******** declarations of functions +int GetAf (int []);// computes AF percent for a variant position +float GetMu( int data[]); +float GetStd( int data[]); + +int GetMaxPeak2(int data[]); +float ComplimentaryPeaks(int data[], int perc); + +//=====================================================MAIN + +int main (int argc, char *argcv[]) { + + // flags + int firstSixAreOK = 1; + int canWriteAF = 1; + int canWriteDistrib = 1; + + FILE *extract_vcfFile, *afFile, *distribFile, *thrFile; //file handles + + int n,i,perc;//, cntZero=0, counter=0; + //char chrom[50]; + int count_afterF=0;// after filtering with min depths + int count_beforeF=0;// before filtering with min depths + int DP4[4];// + int D,pos,AF;// Depth, genome pos of error + int sumD=0,sumDP4=0;//across all pos! + int sumDbefore=0; + int thR,thA;// from file + int less50; + int more25;// sum AF (25,50) + int less25; + int bw25_40; + int sDP4;//act coverage of a base + + float sDP4f;//act coverage of a base as float + float mu,std;//mean and std of alternative frequency, AF + float avDbefore; + float avD,avDP4,Q30frac; + + float perc_left; + float sig,conf;//confidence (0,1) + float sk25;//skewness to first 25% of AAF ----------introduced 10 March + float thr_sk; + float fprolly; + + int histAF[100];// to store mafs percentages + int THR[2];// define from input1 thr file + + + if(argc < NPAR)//four input_output files are submitted + { + printf("not enough of parms |input_output files\n"); + printf("usage:./get_signifAF name.thr input2 output1 output2\n"); + return -1; + } + + + thrFile=fopen(argcv[1],"r"); + if (thrFile == NULL) { + printf("cannot open first input _threshold file %s\n", argcv[1]); + return -1; + } + + extract_vcfFile=fopen(argcv[2],"r"); + if (extract_vcfFile == NULL) { + printf("cannot open first input _vcf.tsigt file %s\n", argcv[2]); + return -1; + } + + afFile=fopen(argcv[3],"w"); + if (afFile == NULL) { + printf("cannot open first output1 infoAF.tsigt file %s for writing\n", argcv[3]); + return -1; + } + + distribFile=fopen(argcv[4],"w"); + if (distribFile == NULL) { + printf("cannot open second output file distAF %s for writing\n", argcv[4]); + return -1; + } + + printf("get_peak_thrFile\n"); + + // give value for thr_sk= 0.55 for human; 1.0 for pathogens + thr_sk=0.6; + + // initiate zero vector for histograme + for(i=0;i<100;i++)// + { + histAF[i]=0; + } + // initiate zero vector for threshold + for(i=0;i<1;i++)// + { + THR[i]=0; + } + + // scan thr file and assign current precomputed thresholds for filtering + while( (n = fscanf(thrFile,"%d %d", &thR, &thA )) >= 0) + // until the end of the input threshold file + { + if (n != Nthr) // incorrect format, NCOLS=number of columns in pipeCT input + { + printf ("corrupted input thrFile format\n"); + return -1; + } + + THR[0]=thR; + THR[1]=thA; + } + + printf("recommended precomputed Ref threshold= %d\n", thR); + printf("recommended precomputed Alternative threshold= %d\n", thA); + + //6 fields of input file // no PV4 + + while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1 && canWriteAF == 1) + // read the Read Title + { + if( n != NRID ) // incorrect format + { + firstSixAreOK = 0; + break; + } + count_beforeF++; + sumDbefore=sumDbefore+D; + + // f1 Josie : filter for DP4 separately for ref and alternative alleles + + if ( DP4[0] >= THR[0] && DP4[1]>= THR[0] && DP4[2]>= THR[1] && DP4[3]>= THR[1])// 0 ref alt are ok + { + AF = GetAf(DP4); + histAF[AF-1]++; + /*for(i=0;i<100;i++)// + { + if (AF==i+1) + histAF[i]=histAF[i]+1;// sh be +1 + } + */ + // compute average run depth default (Q13), D across pos,sumD + // compute average run depth actual after Q30-25, sum(DP4) across pos,sumDP4 + count_afterF++; + sumD=sumD+D; + sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3]; + sumDP4=sumDP4+sDP4; + + sDP4f=sDP4; + Q30frac=sDP4f/D; + + // output check:4 columns position AF% highQ frac, actual depth=sDP4 + if( fprintf(afFile,"%d %d %.2f %d\n", pos, AF, Q30frac,sDP4) < 0 ) + { + canWriteAF = 0; + break; + } + + }// end min_depth filter of DP4 + + if (canWriteAF == 0) {break;} + + // removing space symbols before carriage return + do { + n = fgetc (extract_vcfFile); + } + while ((char)n != '\n'); + + } //END of outer while loop across vcfq file + + // to find significat peak of AF histo! + mu=GetMu(histAF); + std=GetStd(histAF); + + + less50=histAF[0]; + for(i=0;i<101;i++)//we used to consider only 49, because we need only lower freq, esp for diploid organisms + { + sig=0;// significance of histo bin + if (std > 0) { + sig=(histAF[i]-mu)/std; + less50+=histAF[i]; + } + + if( fprintf(distribFile,"%d %d %.2f\n",i, histAF[i],sig) <= 0 ) { + canWriteDistrib = 0; + } + + }// for i=100 cycle + + less25=histAF[0]; + for(i=0;i<25;i++)//only because we need only lower freq, esp for diploid organisms + { + less25+=histAF[i]; + } + bw25_40=histAF[26]; + for(i=27;i<40;i++)//only because we need only lower freq, esp for diploid organisms + { + bw25_40+=histAF[i]; + } + +//compute 25% AF skewness + //if (bw25_40==0){ + sk25=0.00;// if no bases for bw25_40 + //} + if (bw25_40 >0 ) + { + sk25=1.0*less25/bw25_40;// if proportion of less 25% AF is high (more 1?), more likely a mixture + } + + // find first max peak among first 25% AAF + + perc=GetMaxPeak2(histAF);//peaks in first 25% AAF + + // condition on mixture percent: + //if more than 25, it is likely to be diploid allele : 17 Nov+ amount AF<25% skewed (too high) + + if (perc < 25 && sk25 >= thr_sk)// thr_sk was 0.95 for path, 0.62 for human + { + printf("percent mixture= %d\n", perc); + } + if (perc >=25 || sk25 < thr_sk ) + { + perc=0.0; + printf("percent mixture= %d\n", perc); + printf("not likely there is a low percent mixture here at this min depth\n"); + } + + // compute avarege depth :default and after filtering + avD=sumD/count_afterF; + avDP4=sumDP4/count_afterF;//actual after RA filter + avDbefore=sumDbefore/count_beforeF; + perc_left=(100.0*count_afterF/count_beforeF); + printf("actual average depth= %.2f\n", avDP4); + + // confidence of mixture estimating + conf=(1-1/avDP4)*0.7; + + if (sk25>0.8)// larger any how! + {conf=(1-1/avDP4)*0.9;//(sigsig[perc]+1)*(1-1/avDP4);// 0.8 is MY constant + } + + fprolly=ComplimentaryPeaks(histAF, perc); + + conf=conf*fprolly; + if (conf>1) + {conf=1.0;} + + printf("Confidence of non-zero mixture = %.2f\n", conf); + printf("skewness of first 25 perc AF = %.2f\n", sk25); + + fclose(distribFile); + fclose(extract_vcfFile); + fclose(afFile); + + if( firstSixAreOK == 0 || + canWriteAF == 0 || canWriteDistrib ==0) { + printf ("Error during execution. Details: \n"); + printf ("\tfirstSixAreOK %d\n", firstSixAreOK); + printf ("\tcanWriteAF %d\n", canWriteAF); + printf ("\tcanWriteDistrib %d\n", canWriteDistrib); + printf ("Execution aborted\n"); + return -1; + } + + printf(" done.\n"); + return 0; +}//main + +/***************************************************** Functions******************************/ + + +//////////////////////////////////////////////////// +// Calculates AF=alternative allele frequebcy from data=DP4 counts +// input - array (4-vector) of DP4 counts +//output -one float number from (0,1) +//////////////////////////////////////////////////// +int GetAf (int data[]) +{ + float Isum,AF1; + int i,AF; + + Isum = data[0]; + for (i=1; i<4; i++) + { + Isum += data[i]; + } + AF1=(data[2]+data[3])/Isum; + AF=(long) (100*AF1+0.5); + //(long) (sig+0.5) + return AF; +} +// GetMu.c calculates mu for all non-zero elements of 100-vector +float GetMu( int data[]) + { + float mu,Isum; + int i; + int cnz=0; + + Isum = data[0]; + //Isum2 = data[0]*data[0]; + for (i=1; i<101; i++) + { + if (data[i] > 0) + { + cnz++; + Isum += data[i]; + //Isum2 += data[i]*data[i]; + } + } + mu=Isum/cnz; + //std=Isum2/cnz-mu*mu; + return mu; +} + +// ========================stand deviation +float GetStd( int data[]) + { + float mu,std,Isum,Isum2; + int i; + int cnz=0; + + Isum = data[0]; + Isum2 = data[0]*data[0]; + for (i=1; i<101; i++) + { + if (data[i] > 0) + { + cnz++; + Isum += data[i]; + Isum2 += data[i]*data[i]; + } + } + mu=Isum/cnz; + std=sqrt(Isum2/cnz-mu*mu); + return std; +} + + +//----------------------max from the first 25 + int GetMaxPeak2(int data[]) + { + + int Imax; + int i,perc,k; + int peak[25], per_peak[25]; + + // initialise perc + + perc=0.0; + +// find max peak among peaks, first 25 percent only + + // find peaks + k=0; + for (i=1; i<25; i++) + { + if ( ((data[i]-data[i-1]) >0 ) && ((data[i+1]-data[i]) <=0))// && ( Imax < data[i]))//here peak in [i] + { + + peak[k] = data[i]; + per_peak[k]=i; + //printf("peak = %d\n", peak[k]); + //printf("per_peak = %d\n", per_peak[k]); + k=k+1; + + } + } +// find max peak + Imax = peak[0]; + + for (i=1; i0 ) && ((data[i+1]-data[i]) <=0))// && ( Imax < data[i]))//here peak in [i] + { + + peak[k] = data[i]; + per_peak[k]=i; + //printf("peak = %d\n", peak[k]); + //printf("per_peak = %d\n", per_peak[k]); + k=k+1; + + } + } + + // see if these peaks are complimentary to detected perc=% mixture <25% + for (i=0; i (100-perc-around)) && (per_peak[i] < (100-perc+around))) + fprolly=1.1; + } + //&& dp(i,1)>(100-perc-around) & dp(i,1)<(100-perc_ms+m1) + + + return fprolly; + +} \ No newline at end of file From 2eaf06a52b23e9672e8d428dfe2ed6ff46093f8e Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Thu, 18 Feb 2016 09:32:08 +0000 Subject: [PATCH 02/17] introduced two modes for contamination check: mode1: mix <25% mode2: mix from (25%,50%) --- src/contamination/get_mixture.c | 866 +++++++++++++++++++++++--------- 1 file changed, 639 insertions(+), 227 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index fcc5ffe..5fbfcb7 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -1,53 +1,59 @@ - /* File: get_mixture.c // computes maf distribution over variant positions from vcfq files + /* File: get_mixture.c // computes and analyses AAF( maf) distribution over variant positions from vcfq files * Authors: designed by Irina Abnizova (ia1)edited by Steve Leonard (srl) * Last edited: + 25 jan simpler confidences: conf25 + + 11 Jan 2016 : look at ploidy, get hard codes inside corresponding functions + void get_bins_ploidy (int bins_ploid[], int plo);// // how to consider first n25 depending on ploidy + void get_skew_ploidy (float thr_sk[], int plo);// // how to consider skew25, skew75 depending on ploidy + + 5 January 2016: picks up automaticaly if it is diploid or haploid + 17 December 2015: two modes 1. less 25% 2. b/w 30 and 40 percentage + 13 June -output sk25,sk75 + 12 june-- final mixture file; modified as output name.mix: mix_percentage=17 +possible_mix_percentage= 17 +confidence_nonzeroMix=0.64 +AvActDepth=52 (avDP4) +min_depthR=2 +min_depthA=2 + + 30 May last 75 implemented - 28 April-to remove training printfs, to output mixture=0 if not any - 25 April: incorporate info about (100-mix)% to increase proly of detection - 24 April-output in infoAF 4 fields for each variant position, last is its actual coverage sDP4 - 22 April: max sginif or first max if not signif + all 100 AAF to see 'complimentary' - 10 April: if ( DP4[0] >= THR[0] && DP4[1]>= THR[0] && DP4[2]>= THR[1] && DP4[3]>= THR[1])// 0 ref alt are ok - thr_sk - 10 March 2014 for skewness to first 25% AAF - 3 feb-reads thr from a file; 29 Jan 2014 - get autom thr, starting with default ones 1,5 - still old fields of vcf input - 21 Nov 2013, for two thresholds *------------------------------------------------------------------- - * Description: computes maf distrib over variant positions from just extracted 6 fileld + * Description: computes AAF distrib over variant positions from vcf extracted 6 fields, stores histogram file 'distr', + computes possible percentage of contaminated mixture and its confidence * Exported functions: * HISTORY: skewness to first 25% of AAF ----------introduced 10 March -computes AF (alternative to Ref allele frequency) for each error base call from vcf +computes AF (alternative to Ref allele frequency) for each error base call from extracted vcf applies minimum depth threshold, both for Ref and Alternative alleles fwd rev (total min depth will be 4*MIN_DEPTH -input1 thrfile (thRR thAA) -input2: extracted vcf, +input1 thrfile (thRR thAA mu std ploidy ) + 3 2 82 110 2 +input2: extracted vcf: pos, D , DP4 (DP4 is four fields) -output1: info AF.txt= -pos %AF Q30frac sumDP4 (for this position) -1546 100 0.89 101 -4469 100 0.86 90 -4532 100 0.78 87 -4559 97 0.95 60 +output1: mix_conf AF.txt= +mixture confidence +12 0.87 -output2: distrAF.tsigt=histogram of AF with significance for each bin -% count signif -8 0 0 -9 0 0 -10 385 1 -11 780 3 +output2: .distrAF=histogram of AF with significance for each bin +% count +8 0 +9 0 +10 385 +11 780 -usage:./get_mixture ra_i_j.thr extracted.vcf .infAF .distrAF +usage:./get_mixture rams.thr extracted.vcf name.mix name.distrAF */ @@ -61,16 +67,28 @@ usage:./get_mixture ra_i_j.thr extracted.vcf .infAF .distrAF #define NRID 6 // Number of variant ids=fields: pos, chromosome, Depth,DP4,(no PV4+maf) in esigtracted_vcf files #define NPAR 5 // masig Number of PARameters and arguments -#define Nthr 2 // Number of thresholds +#define Nthr 5 // Number of thresholds and muD1 stdD1 12 June + ploidy=1 (haploid) (or 2-diploid) // ******** declarations of functions -int GetAf (int []);// computes AF percent for a variant position +int GetAf (int []);// computes AF percentage for a variant position +void get_bins_ploidy (int bins_ploid[], int plo);// // how to consider first n25 depending on ploidy +void get_skew_ploidy (float thr_sk[], int plo);// // how to consider skew25, skew75 depending on ploidy float GetMu( int data[]); float GetStd( int data[]); +float GetSkew25( int data[],int n25, int n49); +float GetSkew75( int data[],int n51, int n75); + +float Confidence25(float sk25,float sk75,int mix25,int mixc25,float thr25,float thr75, float avDP4); +float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50,float thr25,float thr75, float avDP4); + + +int GetMaxPeak25(int data[], int n25); +int GetPeakComplement25(int data[], int n75); + +int GetMaxPeak25_50(int data[], int n25, int n49); +int GetPeakComplement25_50(int data[], int n51, int n75); -int GetMaxPeak2(int data[]); -float ComplimentaryPeaks(int data[], int perc); //=====================================================MAIN @@ -78,49 +96,71 @@ int main (int argc, char *argcv[]) { // flags int firstSixAreOK = 1; - int canWriteAF = 1; + int canWriteMix = 1; int canWriteDistrib = 1; - FILE *extract_vcfFile, *afFile, *distribFile, *thrFile; //file handles - - int n,i,perc;//, cntZero=0, counter=0; - //char chrom[50]; - int count_afterF=0;// after filtering with min depths - int count_beforeF=0;// before filtering with min depths - int DP4[4];// - int D,pos,AF;// Depth, genome pos of error - int sumD=0,sumDP4=0;//across all pos! - int sumDbefore=0; - int thR,thA;// from file - int less50; - int more25;// sum AF (25,50) - int less25; - int bw25_40; - int sDP4;//act coverage of a base - - float sDP4f;//act coverage of a base as float - float mu,std;//mean and std of alternative frequency, AF - float avDbefore; - float avD,avDP4,Q30frac; - - float perc_left; - float sig,conf;//confidence (0,1) - float sk25;//skewness to first 25% of AAF ----------introduced 10 March - float thr_sk; - float fprolly; - - int histAF[100];// to store mafs percentages - int THR[2];// define from input1 thr file + FILE *thrFile, *extract_vcfFile, *mixFile, *distribFile ; //file handles + + int n,i; + int dist;// distance bewteen mix and (100-mix_compl): should be 0 ideally + + // ---------- params from input1 rams(now four+1 values thrR thrA muD stdD ploidy) + int muD,stdD; + int thR,thA, ploid,plo; + + int THR[2];// compute from input1 thr file + int msD[2]; + int bins_ploid[4];// 4-vector of bins depending on ploidy + float thr_sk[2];//2-vector of sk25,sk75 depending on ploidy + + + //---------------params from input2 vcfq + int DP4[4];// + int D,pos;// Depth, genome pos of error + + //----------stats to compute + int avDP4;// of actual sDP4 depth after RA & bad regions filtering + int sumDP4=0;//across all pos for mu std + int sumDbefore=0; + float avDbefore; + + float sDP4; + float mix25_left; + // ----------------settings for skewness and their likelihoods: hardcoded within functions + // mode 1: mix<=25% + int n25,n49; + int n51,n75; + // mode 2 : 30< mix < 45 + int n2_25,n2_49; + int n2_51,n2_75; + // skewness thresholds + float thr25,thr75; + + //---------------results to compute: + int count_afterF=0;// after filtering with min depths + int count_beforeF=0;// before filtering with min depths + + int mix25,mixc25; + int mix25_update; + int mix25_50,mixc25_50; + int AF; + float conf,conf25,conf25_50;//confidence (0,1) + float sk25,sk75;//skewness to first 25% of AAF ----------introduced 10 March + float likely75,likely25; + + //arrays + int histAF[100];// to store mafs percentages + +//-----------------------------------------open files for read/write if(argc < NPAR)//four input_output files are submitted { printf("not enough of parms |input_output files\n"); - printf("usage:./get_signifAF name.thr input2 output1 output2\n"); + printf("usage:./get_mixture input1.thr input2.vcfq output1.mix output2.distr\n"); return -1; } - thrFile=fopen(argcv[1],"r"); if (thrFile == NULL) { printf("cannot open first input _threshold file %s\n", argcv[1]); @@ -129,13 +169,13 @@ int main (int argc, char *argcv[]) { extract_vcfFile=fopen(argcv[2],"r"); if (extract_vcfFile == NULL) { - printf("cannot open first input _vcf.tsigt file %s\n", argcv[2]); + printf("cannot open first input _.vcfq file %s\n", argcv[2]); return -1; } - afFile=fopen(argcv[3],"w"); - if (afFile == NULL) { - printf("cannot open first output1 infoAF.tsigt file %s for writing\n", argcv[3]); + mixFile=fopen(argcv[3],"w"); + if (mixFile == NULL) { + printf("cannot open first output1 mix_conf file %s for writing\n", argcv[3]); return -1; } @@ -145,43 +185,73 @@ int main (int argc, char *argcv[]) { return -1; } - printf("get_peak_thrFile\n"); - - // give value for thr_sk= 0.55 for human; 1.0 for pathogens - thr_sk=0.6; + printf("get_mixture: parameters\n"); + // ===SETTINGS parameters for skewness measurements and peak consideretion: if Diploid then more noise around 50% + dist=10; // initiate zero vector for histograme for(i=0;i<100;i++)// { histAF[i]=0; } - // initiate zero vector for threshold + // initiate zero vectors for thresholds and constant for ploidy for(i=0;i<1;i++)// { THR[i]=0; + msD[i]=0; + thr_sk[i]=0.0; } + plo=0;// initial ploidy + //initial bins + for(i=0;i<4;i++)// + { + bins_ploid[i]=0; + } - // scan thr file and assign current precomputed thresholds for filtering - while( (n = fscanf(thrFile,"%d %d", &thR, &thA )) >= 0) + // scan thr file and assign current precomputed thresholds mu sd ploidy for filtering + while( (n = fscanf(thrFile,"%d %d %d %d %d", &thR, &thA,&muD, &stdD,&ploid)) >= 0) // until the end of the input threshold file { - if (n != Nthr) // incorrect format, NCOLS=number of columns in pipeCT input + if (n != Nthr) // incorrect format input { printf ("corrupted input thrFile format\n"); return -1; } - THR[0]=thR; THR[1]=thA; + msD[0]=muD; + msD[1]=stdD; + plo=ploid; } + printf("ploidy= %d\n", plo); + + //===============================get precomputed bins for skewness dep on ploidy + get_bins_ploidy(bins_ploid,plo);// + + for(i=0;i<4;i++){ + printf("ploid_bin= %d\n", bins_ploid[i]); - printf("recommended precomputed Ref threshold= %d\n", thR); - printf("recommended precomputed Alternative threshold= %d\n", thA); + } + // mode 1: mix<25% diploid + n25=bins_ploid[0]; + n49=bins_ploid[1]; + n51=bins_ploid[2]; + n75=bins_ploid[3]; + + get_skew_ploidy (thr_sk, plo); + for(i=0;i<2;i++){ + printf("ploid_skew= %.2f\n", thr_sk[i]); + + } + // give value for thr25= 0.55 for human diploid; 1.0, 0.8 for pathogens-haploid + thr25=thr_sk[0]; + thr75=thr_sk[1];// complementary peak's (100-mix) skewness - //6 fields of input file // no PV4 + //6 fields of input2 file // no PV4: do it to compute AAF=AF + + + while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1 && canWriteMix == 1) - while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1 && canWriteAF == 1) - // read the Read Title { if( n != NRID ) // incorrect format { @@ -191,151 +261,115 @@ int main (int argc, char *argcv[]) { count_beforeF++; sumDbefore=sumDbefore+D; - // f1 Josie : filter for DP4 separately for ref and alternative alleles + // f1 Josie : filter for DP4 separately for ref and alternative alleles+ filter abnormal Depth - if ( DP4[0] >= THR[0] && DP4[1]>= THR[0] && DP4[2]>= THR[1] && DP4[3]>= THR[1])// 0 ref alt are ok + if ( DP4[0] >= THR[0] && DP4[1]>= THR[0] && DP4[2]>= THR[1] && DP4[3]>= THR[1] && D < (msD[0]+msD[1]))// 0 ref alt are ok { AF = GetAf(DP4); histAF[AF-1]++; - /*for(i=0;i<100;i++)// - { - if (AF==i+1) - histAF[i]=histAF[i]+1;// sh be +1 - } - */ - // compute average run depth default (Q13), D across pos,sumD - // compute average run depth actual after Q30-25, sum(DP4) across pos,sumDP4 - count_afterF++; - sumD=sumD+D; + + // for EACH POSITION, compute depths after thrRA filtering: + // average run depth after Q30, sum(DP4) across pos,= sumDP4 + count_afterF++; sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3]; sumDP4=sumDP4+sDP4; + }// end min_depth filter of DP4 - sDP4f=sDP4; - Q30frac=sDP4f/D; - - // output check:4 columns position AF% highQ frac, actual depth=sDP4 - if( fprintf(afFile,"%d %d %.2f %d\n", pos, AF, Q30frac,sDP4) < 0 ) - { - canWriteAF = 0; - break; - } + // -------------output stupidly deep positions if (D >= (msD[0]+msD[1])) { fill the file - }// end min_depth filter of DP4 - - if (canWriteAF == 0) {break;} - - // removing space symbols before carriage return + // removing space symbols before carriage return do { n = fgetc (extract_vcfFile); } while ((char)n != '\n'); - } //END of outer while loop across vcfq file - - // to find significat peak of AF histo! - mu=GetMu(histAF); - std=GetStd(histAF); + } //END of outer while loop across vcfq file AF is computed! - - less50=histAF[0]; - for(i=0;i<101;i++)//we used to consider only 49, because we need only lower freq, esp for diploid organisms + //====================== output2 write AF + for(i=0;i<101;i++) { - sig=0;// significance of histo bin - if (std > 0) { - sig=(histAF[i]-mu)/std; - less50+=histAF[i]; + if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { + canWriteDistrib = 0; } - if( fprintf(distribFile,"%d %d %.2f\n",i, histAF[i],sig) <= 0 ) { - canWriteDistrib = 0; - } - }// for i=100 cycle +//=======================================analysis of AF distribution to find mixture sample - less25=histAF[0]; - for(i=0;i<25;i++)//only because we need only lower freq, esp for diploid organisms - { - less25+=histAF[i]; - } - bw25_40=histAF[26]; - for(i=27;i<40;i++)//only because we need only lower freq, esp for diploid organisms - { - bw25_40+=histAF[i]; - } + sk25=GetSkew25(histAF,n25,n49); + sk75=GetSkew75(histAF,n51,n75); -//compute 25% AF skewness - //if (bw25_40==0){ - sk25=0.00;// if no bases for bw25_40 - //} - if (bw25_40 >0 ) - { - sk25=1.0*less25/bw25_40;// if proportion of less 25% AF is high (more 1?), more likely a mixture - } +//===================================mode 1: +// find what percentage AAF gives first max peak among first 25% AAF + mix25=GetMaxPeak25(histAF,n25);//peaks in first 25% AAF + mixc25=GetPeakComplement25(histAF,n75);// complement to 25% peaks (100-mix25) - // find first max peak among first 25% AAF + //-----==================================mode2 + mix25_50=GetMaxPeak25_50(histAF,n25,n49);//peaks in (25,49) percentage AAF + mixc25_50=GetPeakComplement25_50(histAF,n51,n75); - perc=GetMaxPeak2(histAF);//peaks in first 25% AAF - // condition on mixture percent: - //if more than 25, it is likely to be diploid allele : 17 Nov+ amount AF<25% skewed (too high) +// stats after filtering :compute average depth :default and after filtering - if (perc < 25 && sk25 >= thr_sk)// thr_sk was 0.95 for path, 0.62 for human - { - printf("percent mixture= %d\n", perc); - } - if (perc >=25 || sk25 < thr_sk ) - { - perc=0.0; - printf("percent mixture= %d\n", perc); - printf("not likely there is a low percent mixture here at this min depth\n"); - } + avDP4=ceil(sumDP4/count_afterF);//actual average cov after Filt + mix25_left=(100.0*count_afterF/count_beforeF);// percentage of filteerd out regions ( ? do we need?) - // compute avarege depth :default and after filtering - avD=sumD/count_afterF; - avDP4=sumDP4/count_afterF;//actual after RA filter - avDbefore=sumDbefore/count_beforeF; - perc_left=(100.0*count_afterF/count_beforeF); - printf("actual average depth= %.2f\n", avDP4); +// make decision about on mixture percentage: update it to zero if needed + // mix25_update=MakeDecisionMix(mix25, sk25, thr25,sk75,thr75); - // confidence of mixture estimating - conf=(1-1/avDP4)*0.7; - if (sk25>0.8)// larger any how! - {conf=(1-1/avDP4)*0.9;//(sigsig[perc]+1)*(1-1/avDP4);// 0.8 is MY constant - } - fprolly=ComplimentaryPeaks(histAF, perc); + // confidences of mixture estimating + + conf25=Confidence25(sk25,sk75,mix25,mixc25, thr25, thr75, avDP4); + conf25_50=Confidence25_50(sk25,sk75,mix25_50,mixc25_50, thr25, thr75, avDP4); + + printf("Results:\n"); + //printf("updated percentage mixture= %d\n", mix25_update); + printf("possible low percentage mixture= %d\n", mix25); + //printf("possible complement low percentage mixture= %d\n", mixc25); + + if (( mix25-100+mixc25)1) - {conf=1.0;} + printf("confidence of low freq mixture = %.4f\n", conf25); - printf("Confidence of non-zero mixture = %.2f\n", conf); - printf("skewness of first 25 perc AF = %.2f\n", sk25); + printf("possible high percentage mixture= %d\n", mix25_50); + printf("confidence of high freq mixture = %.4f\n", conf25_50); + //printf("possible complement high percentage mixture= %d\n", mixc25_50); + //printf("Confidence of non-zero mixture = %.2f\n", conf); + //printf("skewness of first 25 mix25 AF = %.2f\n", sk25); + //printf("skewness of last 25 mix25 AF = %.2f\n", sk75); + // printf("likelihood of last 25 mix25 AF = %.2f\n", likely75); + + // MAIN output: mixFile + if (fprintf(mixFile,"mix_percentage=%d\npossible_mix_percentage= %d\nhigh_freq_possible_mix_percentage= %d\nconfidence_nonzeroMix=%.2f\nAvActDepth=%d\nmin_depthR=%d\nmin_depthA=%d\nskew25=%.2f\nskew75=%.2f\n", mix25_update,mix25,mix25_50,conf,avDP4,thR,thA,sk25,sk75)<= 0 ) + { + canWriteMix = 0; + //return -1; + } fclose(distribFile); fclose(extract_vcfFile); - fclose(afFile); + fclose(mixFile); if( firstSixAreOK == 0 || - canWriteAF == 0 || canWriteDistrib ==0) { + canWriteMix == 0 || canWriteDistrib ==0) { printf ("Error during execution. Details: \n"); printf ("\tfirstSixAreOK %d\n", firstSixAreOK); - printf ("\tcanWriteAF %d\n", canWriteAF); + printf ("\tcanWriteMix %d\n", canWriteMix); printf ("\tcanWriteDistrib %d\n", canWriteDistrib); printf ("Execution aborted\n"); return -1; } - printf(" done.\n"); + printf(" get mixture is performed.\n"); return 0; }//main /***************************************************** Functions******************************/ - -//////////////////////////////////////////////////// -// Calculates AF=alternative allele frequebcy from data=DP4 counts + //////////////////////////////////////////////////// +// Calculates AF=alternative allele frequency from data=DP4 counts // input - array (4-vector) of DP4 counts //output -one float number from (0,1) //////////////////////////////////////////////////// @@ -345,13 +379,12 @@ int GetAf (int data[]) int i,AF; Isum = data[0]; - for (i=1; i<4; i++) + for (i=1; i<4; i++)//there are four conts for each base { Isum += data[i]; } AF1=(data[2]+data[3])/Isum; AF=(long) (100*AF1+0.5); - //(long) (sig+0.5) return AF; } // GetMu.c calculates mu for all non-zero elements of 100-vector @@ -362,18 +395,16 @@ float GetMu( int data[]) int cnz=0; Isum = data[0]; - //Isum2 = data[0]*data[0]; - for (i=1; i<101; i++) + for (i=1; i<101; i++) { if (data[i] > 0) { cnz++; Isum += data[i]; - //Isum2 += data[i]*data[i]; } } mu=Isum/cnz; - //std=Isum2/cnz-mu*mu; + return mu; } @@ -399,100 +430,481 @@ float GetStd( int data[]) std=sqrt(Isum2/cnz-mu*mu); return std; } +//================================================================== +//----------------------max from the 25-49 +//----------------------max from the ( 25, 45) + int GetMaxPeak25_50(int data[], int n25, int n49) + { + + // input data=AAF + int Imax; + int i,mix25_50,k; + int peak[n25], mix25_peak[n25]; + + // initialise mix25_50, in case no peaks exist + + mix25_50=0.0; + +// find max peak among peaks, first 25 percentage only, data is maf(AAF) vector + +//what if no peaks? + peak[0]=0; + + // find peaks + k=0; + for (i=n25; i0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] + { + + peak[k] = data[i]; + mix25_peak[k]=i; + k=k+1; + + } + } +// found max peak/s if any exist + + if (peak[0]>0) + { + Imax = peak[1]; + + for (i=1; i0)-exist any peak <25% + printf("Mode 2: percentage giving this max = %d\n", mix25_50); + return mix25_50; + +}//================================================================== //----------------------max from the first 25 - int GetMaxPeak2(int data[]) + int GetMaxPeak25(int data[], int n25) { + // input data=AAF int Imax; - int i,perc,k; - int peak[25], per_peak[25]; + int i,mix25,k; + int peak[n25], mix25_peak[n25]; + + // initialise mix25, in case no peaks exist - // initialise perc + mix25=0.0; - perc=0.0; +// find max peak among peaks, first 25 percentage only, data is maf(AAF) vector -// find max peak among peaks, first 25 percent only +//what if no peaks? + peak[0]=0; // find peaks k=0; - for (i=1; i<25; i++) + for (i=1; i0 ) && ((data[i+1]-data[i]) <=0))// && ( Imax < data[i]))//here peak in [i] + if ( ((data[i]-data[i-1]) >0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] { peak[k] = data[i]; - per_peak[k]=i; - //printf("peak = %d\n", peak[k]); - //printf("per_peak = %d\n", per_peak[k]); + mix25_peak[k]=i; k=k+1; } } -// find max peak - Imax = peak[0]; +// find max peak/s if any exist + + if (peak[0]>0) + { + Imax = peak[1]; for (i=1; i0)-exist any peak <25% + printf("MOde1: percentage giving this max = %d\n", mix25); + return mix25; } +//==================================================================================== + +//=================================== +float GetSkew25( int data[],int n25, int n49) + { -//=================================================fihg AAF complimentary peaks, (100-perc)AAF if any -float ComplimentaryPeaks(int data[], int perc) + float sk25; + int i,k; + int less25; + int bw25_50; + // sum up from 0 to n25 % + less25=data[0]; + for(i=0;i0 ) + { + sk25=1.0*less25/bw25_50;// if proportion of less 25% AF is high (more 1?), more likely to be a mixture + } + return sk25; +} +// ---------------------float GetSkew( int data[],int n51, int n75); +float GetSkew75( int data[],int n51, int n75) { - float fprolly; + float sk75; int i,k; - int peak[25], per_peak[25]; - int around; + int more75; + int bw75_50; + int last; + + last=5; + // sum up from n75 to almost 100% + more75=data[n75]; + for(i=n75;i<(100-last);i++) + { + more75+=data[i]; + } + // sum 50 to 75 + bw75_50=data[n51]; + for(i=n51;i0 ) + { + sk75=1.0*more75/bw75_50;// if proportion of less 75% AF is high (more 1?), more likely to be a mixture + } + return sk75; +} + + +// make decision if mix25 noise or mixture: update it to zero if needed +//if mix25 is more than n25 (here =25), it is likely to be diploid allele noise : 17 Nov+ amount AF<25% skewed (too high) +//int MakeDecisionMix(int mix25, float sk25, float thr25) +int MakeDecisionMix(int mix25, float sk25, float thr25,float sk75,float thr75)// update percentage mix + +{ + int mix25_update; + mix25_update=0.0; + if ((sk25 >= thr25) && (sk75 >= thr75))// thr of mixture=mix25 ;thr25 was 0.95 for path, 0.62 for human + { + mix25_update=mix25; + //printf("percentage mixture= %d\n", mix25); + } + + + return mix25_update; +} + - around=10;// frequency may vary up to this amount of % - fprolly=1.0; -// find max peak among peaks, last 75% (because as mixture was first 25 percent only) +//==========================================fill the thr bins for skewness +void get_bins_ploidy (int bins_ploid[], int plo)// +{ + if (plo>=2){ + + bins_ploid[0]=24; + bins_ploid[1]=45; + bins_ploid[2]=55; + bins_ploid[3]=77; + + // mode 1: mix<25% diploid + //n25=26; + // n49=46; + // n51=60; + // n75=83; + + + } + + if (plo<2){ + + bins_ploid[0]=26; + bins_ploid[1]=50; + bins_ploid[2]=51; + bins_ploid[3]=75; + + // mode 1: mix<25% haploid + //n25=26; + // n49=50; + // n51=51; + // n75=79; + + + } +} +//--------------------skewness dep on ploidy +void get_skew_ploidy (float thr_sk[], int plo)// // how to consider skew25, skew75 depending on ploidy +{ + if (plo<2) + { + + thr_sk[0]=0.8; + thr_sk[1]=0.6; + } + + // give value for thr25= 0.55 for human diploid; 1.0, 0.8 for pathogens-haploid + //thr25=0.25; + //thr75=0.5;//complementary skewness threshold + + + if (plo>=2) + { + + thr_sk[0]=0.5; + thr_sk[1]=0.4; + } + + +} +//==================================== + int GetPeakComplement25(int data[], int n75) + { + + // input data=AAF + int Imax; + int i,mixc25,k,n; + int mixcc25;//100-mix + int peak[25], perc_peak[25]; + + // initialise mixc25, in case no peaks exist + + mixc25=0.0; + mixcc25=0.0; + +// find max peak among peaks, first 25 percent only, data is maf(AAF) vector + n=6;// not look at last n bins- noise for 100% SNPs +//what if no peaks? + peak[0]=0; // find peaks k=0; - for (i=75; i<101; i++) + for (i=n75; i<100-n; i++) { - if ( ((data[i]-data[i-1]) >0 ) && ((data[i+1]-data[i]) <=0))// && ( Imax < data[i]))//here peak in [i] + if ( ((data[i]-data[i-1]) >0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] { peak[k] = data[i]; - per_peak[k]=i; - //printf("peak = %d\n", peak[k]); - //printf("per_peak = %d\n", per_peak[k]); + perc_peak[k]=i; k=k+1; } } +// find max peak/s if any exist - // see if these peaks are complimentary to detected perc=% mixture <25% - for (i=0; i (100-perc-around)) && (per_peak[i] < (100-perc+around))) - fprolly=1.1; + if (peak[0]>0) + { + Imax = peak[1]; + + for (i=1; i0)-exist any peak <25% + + mixcc25=100-mixc25; + printf("Mode 1 com:percentage giving this complement max = %d\n", mixcc25); + return mixcc25; + +} +//=========================== +//================================================================== +//----------------------max peak from the 50- 75 + int GetPeakComplement25_50(int data[], int n51, int n75) + { + + // input data=AAF + int Imax; + int i,mixc25_50,k,n; + int mixcc25_50;// 100-mixc + int peak[25], perc_peak[25]; + + // initialise mixc25_50, in case no peaks exist + + mixc25_50=0.0; + mixcc25_50=0.0; + +// find max peak among peaks, first 25 percent only, data is maf(AAF) vector + +//what if no peaks? + peak[0]=0; + + // find peaks + k=0; + for (i=n51; i0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] + { + + peak[k] = data[i]; + perc_peak[k]=i; + k=k+1; + + } } - //&& dp(i,1)>(100-perc-around) & dp(i,1)<(100-perc_ms+m1) +// find max peak/s if any exist + + if (peak[0]>0) + { + Imax = peak[1]; + + for (i=1; i0)-exist any peak <25% + mixcc25_50=100-mixc25_50; + printf("Mode 2 com: percentage giving this complement max = %d\n", mixcc25_50); + return mixcc25_50; +} +// ============================================MODE1 +// confidence of mixture <25%-mode1 +float Confidence25(float sk25,float sk75,int mix25,int mixc25, float thr25, float thr75, float avDP4) +{ +//likelihoods + + + float conf,lik1,likc1; + float E1,EC1; + + //-------------------- + printf("Mode1:\n "); + conf=1.0;//initial + + E1=0.0; + if (mix25 >0 ) E1=1.0;// peak <25 does Exist + printf("exist mix25 = %.2f\n", E1); + + EC1=0.0; + if (mixc25 >0 ) EC1=1.0; + printf(" exist complem mix25 = %.2f\n", EC1); + + lik1=sk25*sk25; + if (sk25 >= thr25) + {lik1=sk25;} + printf(" likely mix25 = %.2f\n", lik1); + + likc1=sk75*sk75; + if (sk75 >= thr75) + {likc1=sk75;} + printf(" likely complem mix25 = %.2f\n", likc1); + + // larger any how! + conf=(1-1/avDP4)*(E1+EC1)*0.5*lik1*likc1; + + if (conf>1) + {conf=1.0;} + + return conf; +} - return fprolly; -} \ No newline at end of file + +//====================== +// confidence of mixture 25%-50% mode2 +float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float thr25, float thr75, float avDP4) +{ +//likelihoods + + + float conf,lik1,likc1; + float E1,EC1; + + //-------------------- + printf("Mode2:\n "); + conf=1.0;//initial + + E1=0.0; + if (mix25_50 >0 ) E1=1.0;// peak <25 does Exist + printf("exist mix25_50 = %.2f\n", E1); + + EC1=0.0; + if (mixc25_50 >0 ) EC1=1.0; + printf(" exist complem mix25_50 = %.2f\n", EC1); + + lik1=sk25*sk25; + if (sk25 < thr25)// humph 25-20 is larger than humph25=sk25! + {lik1=1-sk25;} + printf(" likely mix25_50 = %.2f\n", lik1); + + likc1=sk75*sk75; + if (sk75 < thr75) + {likc1=1-sk75;} + printf(" likely complem mix25_50 = %.2f\n", likc1); + + // larger any how! + conf=(1-1/avDP4)*(E1+EC1)*0.5*lik1*likc1; + + if (conf>1) + {conf=1.0;} + + return conf; +} From bcf45bd251f0fd3f732ecac356ef4074cf18eb85 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Fri, 19 Feb 2016 15:05:26 +0000 Subject: [PATCH 03/17] ntroduced peak_dist=8 between complementary mix peaks, into conf functions --- src/contamination/get_mixture.c | 68 +++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 11 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index 5fbfcb7..e4be535 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -3,7 +3,10 @@ * Last edited: - 25 jan simpler confidences: conf25 + 19 Jan confidences + mix1 conf1 low freq allele mixture <25% + mix2 conf2 high freq allele mixture (25% , 50%) + 11 Jan 2016 : look at ploidy, get hard codes inside corresponding functions void get_bins_ploidy (int bins_ploid[], int plo);// // how to consider first n25 depending on ploidy @@ -19,12 +22,12 @@ AvActDepth=52 (avDP4) min_depthR=2 min_depthA=2 - 30 May last 75 implemented + 30 May last 75 bins implemented *------------------------------------------------------------------- * Description: computes AAF distrib over variant positions from vcf extracted 6 fields, stores histogram file 'distr', - computes possible percentage of contaminated mixture and its confidence + computes possible percentage of contaminated mixture (two modes) and its confidence * Exported functions: * HISTORY: @@ -328,8 +331,8 @@ int main (int argc, char *argcv[]) { printf("possible low percentage mixture= %d\n", mix25); //printf("possible complement low percentage mixture= %d\n", mixc25); - if (( mix25-100+mixc25)0 ) EC1=1.0; + if (abs(mixc25-mix25) < peak_dist ) // close enough + EC1=1.0; + //if (mixc25 >0 ) EC1=1.0; printf(" exist complem mix25 = %.2f\n", EC1); lik1=sk25*sk25; if (sk25 >= thr25) {lik1=sk25;} + + printf(" likely mix25 = %.2f\n", lik1); likc1=sk75*sk75; @@ -860,6 +876,9 @@ float Confidence25(float sk25,float sk75,int mix25,int mixc25, float thr25, floa // larger any how! conf=(1-1/avDP4)*(E1+EC1)*0.5*lik1*likc1; + // if (abs(mixc25-mix25) < peak_dist )// close enough + // conf=conf*1.1; + if (conf>1) {conf=1.0;} @@ -877,6 +896,9 @@ float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float th float conf,lik1,likc1; float E1,EC1; + int peak_dist; + + peak_dist=8; //-------------------- printf("Mode2:\n "); @@ -887,7 +909,11 @@ float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float th printf("exist mix25_50 = %.2f\n", E1); EC1=0.0; - if (mixc25_50 >0 ) EC1=1.0; + // if (mixc25_50 >0 ) EC1=1.0; + + if (abs(mixc25_50-mix25_50) < peak_dist ) // close enough + EC1=1.0; + printf(" exist complem mix25_50 = %.2f\n", EC1); lik1=sk25*sk25; @@ -908,3 +934,23 @@ float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float th return conf; } +//------------------------- +int let2int(char letter) +{ + + int result; + + //initiate + result=0; + + if (letter=='a' || letter=='A') + result=1; + if (letter=='c' || letter=='C') + result=2; + if (letter=='g' || letter=='G') + result=3; + if (letter=='t' || letter=='T') + result=4; + + return result; + } From 64a74f81158c278f025f732c139e8c8c6e53eca1 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Mon, 22 Feb 2016 06:13:05 +0000 Subject: [PATCH 04/17] adjusted likelihood for the second mode: mix > 25% --- src/contamination/get_mixture.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index e4be535..72ee123 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -916,14 +916,14 @@ float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float th printf(" exist complem mix25_50 = %.2f\n", EC1); - lik1=sk25*sk25; + lik1=sk25*sk25;//assume sk25<1 if (sk25 < thr25)// humph 25-20 is larger than humph25=sk25! - {lik1=1-sk25;} + {lik1=sk25;} printf(" likely mix25_50 = %.2f\n", lik1); likc1=sk75*sk75; if (sk75 < thr75) - {likc1=1-sk75;} + {likc1=sk75;} printf(" likely complem mix25_50 = %.2f\n", likc1); // larger any how! From 73fe12b37de1053e4d763dece6fc893ce95df47c Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Wed, 24 Feb 2016 10:25:34 +0000 Subject: [PATCH 05/17] introduced likelyhood --- src/contamination/get_mixture.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index 72ee123..a76ca79 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -66,7 +66,7 @@ usage:./get_mixture rams.thr extracted.vcf name.mix name.distrAF #include #include -// ******** predefined user constants, 22 october +// ******** predefined user constants, #define NRID 6 // Number of variant ids=fields: pos, chromosome, Depth,DP4,(no PV4+maf) in esigtracted_vcf files #define NPAR 5 // masig Number of PARameters and arguments @@ -135,8 +135,6 @@ int main (int argc, char *argcv[]) { int n25,n49; int n51,n75; // mode 2 : 30< mix < 45 - int n2_25,n2_49; - int n2_51,n2_75; // skewness thresholds float thr25,thr75; @@ -351,7 +349,7 @@ int main (int argc, char *argcv[]) { // canWriteMix = 0; //return -1; //} - if (fprintf(mixFile,"mix lowfreq=%d\nconfidence low freq= %.4f\nmix high freq= %d\nconfidence high freq=%.4f\nAvActDepth=%d\nmin_depthR=%d\nmin_depthA=%d\nskew25=%.2f\nskew75=%.2f\n", mix25,conf25,mix25_50,conf25_50,avDP4,thR,thA,sk25,sk75)<= 0 ) + if (fprintf(mixFile,"mix low freq=%d\nconfidence low freq= %.4f\nmix high freq= %d\nconfidence high freq=%.4f\nAvActDepth=%d\nmin_depthR=%d\nmin_depthA=%d\nskew25=%.2f\nskew75=%.2f\n", mix25,conf25,mix25_50,conf25_50,avDP4,thR,thA,sk25,sk75)<= 0 ) { canWriteMix = 0; //return -1; @@ -555,7 +553,7 @@ float GetStd( int data[]) break;} } }// if (peak[1]>0)-exist any peak <25% - printf("MOde1: percentage giving this max = %d\n", mix25); + printf("Mode1: percentage giving this max = %d\n", mix25); return mix25; } @@ -630,7 +628,9 @@ float GetSkew75( int data[],int n51, int n75) // make decision if mix25 noise or mixture: update it to zero if needed //if mix25 is more than n25 (here =25), it is likely to be diploid allele noise : 17 Nov+ amount AF<25% skewed (too high) -//int MakeDecisionMix(int mix25, float sk25, float thr25) + + +//int MakeDecisionMix(int mix25, float sk25, float thr25)---------------------to update mix int MakeDecisionMix(int mix25, float sk25, float thr25,float sk75,float thr75)// update percentage mix { @@ -647,7 +647,6 @@ int MakeDecisionMix(int mix25, float sk25, float thr25,float sk75,float thr75)// } - //==========================================fill the thr bins for skewness void get_bins_ploidy (int bins_ploid[], int plo)// { @@ -663,8 +662,6 @@ void get_bins_ploidy (int bins_ploid[], int plo)// // n49=46; // n51=60; // n75=83; - - } if (plo<2){ @@ -679,8 +676,6 @@ void get_bins_ploidy (int bins_ploid[], int plo)// // n49=50; // n51=51; // n75=79; - - } } //--------------------skewness dep on ploidy @@ -916,7 +911,7 @@ float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float th printf(" exist complem mix25_50 = %.2f\n", EC1); - lik1=sk25*sk25;//assume sk25<1 + lik1=sk25*sk25; if (sk25 < thr25)// humph 25-20 is larger than humph25=sk25! {lik1=sk25;} printf(" likely mix25_50 = %.2f\n", lik1); From 11ba91dd76b92b36387e223fa6d66b3e3715cfa1 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Wed, 24 Feb 2016 10:26:26 +0000 Subject: [PATCH 06/17] new program for automated ploidy detection and calc of filter threshold and stats output is filtered vcfq file --- src/contamination/get_thr_ploidy.c | 353 +++++++++++++++++++++++++++++ 1 file changed, 353 insertions(+) create mode 100755 src/contamination/get_thr_ploidy.c diff --git a/src/contamination/get_thr_ploidy.c b/src/contamination/get_thr_ploidy.c new file mode 100755 index 0000000..84993f0 --- /dev/null +++ b/src/contamination/get_thr_ploidy.c @@ -0,0 +1,353 @@ +/* File: get_thr_ploidy.c // filters 1 1 vcfq file and computes av std Depth depending on Depth of vcfq file + + * Authors: designed by Irina Abnizova (ia1) and edited by Steve Leonard(srl) + * + Last edited: + 18 Jan 2016 - real std + 11 June: one thr file. output is e.g. 2 2 41 113 + 9 June 2014 :n1 from pipe of three:filters 1 1 vcfq file and computes av std Depth depending on Depth of vcfq file + 28 May : not output filtered vcfq + 22 May get_filters 1 1 .vcfq RA_ms.filters + 21 May filters bad varians: abnormal Depth + 30 April: adds Filteref 1,1 vcfq + distr_1_1 + 29 April + // computes suitable thresholds bases on avDP4=actual (not cov before as earlier!!! + + 3 Feb-put thrR thrA into file thrD ;29 Jan 2014 - get autom thr, starting with default ones 1,5 + still old fields of vcf input + + *------------------------------------------------------------------- + * Description: computes computes threshold file depending on Depth of vcfq file + + * Exported functions: + * HISTORY: + +computes minimum depth threshold, both for Ref and Alternative alleles fwd rev (total min depth will be +4*MIN_DEPTH + + +input: MIN_DEPTH_R(integer)=1[default] MIN_DEPTH_A(integer)=1[default] extracted=vcfq, + +output1: +thrR thrA +3 1 +output2: .distAF_1_1 +output3: .vcfqF_1_1 filtered 1,1 +usage:./get_thrRA MIN_DEPTH_R(integer) MIN_DEPTH_A(integer) name.vcf ra_i_j.thr(Depth) name.distr name.vcfqF_1_1 + +*/ + + +#include +#include +#include +#include + +// ******** predefined user constants, + +#define NRID 6 // Nunber of variant ids=fields: pos, Depth,DP4,(no PV4+maf) in extracted_vcf files +#define NPAR 7 // Number of PARameters and arguments + + +// ********* Global variables min_depths for Ref and Alternative alleles +int thR = 0; +int thA = 0; + +// ******** declarations of functions + +int GetAf (int []);// computes AF percent for a variant position +int GetMax (int []);// percent giving max AAF +int ploidy (int ind);// defines ploidy from results of GetMax ploid=2 if ~50% +int GetMax50 (int data[]);// local max around 50% + +int main (int argc, char *argcv[]) { + + // flags + int firstSixAreOK = 1; + int canWriteDistrib = 1; + int canWriteFilteredVCF = 1; + + FILE *extract_vcfFile, *thrFile, *distribFile, *filtered_vcfFile; //file handles + + int n,i,ind,ploid, ind50;// + int count_afterF=0;// after filtering with min depths + int count_beforeF=0;// before filtering with min depths + int DP4[4];// + int D,pos;// Depth, genome pos of error + int AF; + + //arrays + int histAF[100];// to store mafs percentages + + + //stats + int sumDP4=0,sumDP4_2=0;//across all pos for mu std + int sumDbefore=0; + float avDbefore; + float Q30frac; + float sDP4; + float perc_left; + + //to compute + int avDP4,stdDP4,stdDP4_1,avDP41;// of actual sDP4 depth after 1 1 filtering + int thRR, thAA;// recommended based on actual depth avDP4 and stdDP4 + float ratio; + + if(argc < NPAR)//five input_output files are submitted + { + printf("not enough of parms |input_output files\n"); + printf("usage:./get_thr thrR thrA input2 output1 output2 output3\n"); + return -1; + } + + if (sscanf(argcv[1],"%d",&thR) == EOF) { + printf("Failed to convert the first argument %s to integer\n", argcv[1]); + } + + + if (sscanf(argcv[2],"%d",&thA) == EOF) { + printf("Failed to convert the second argument %s to integer\n", argcv[2]); + } + + + extract_vcfFile=fopen(argcv[3],"r"); + if (extract_vcfFile == NULL) { + printf("cannot open first input _vcf.txt file %s\n", argcv[3]); + return -1; + } + // two outputs + thrFile=fopen(argcv[4],"w"); + if (thrFile == NULL) { + printf("cannot open first output file thrD %s\n", argcv[4]); + return -1; + } + + distribFile=fopen(argcv[5],"w"); + if (distribFile == NULL) { + printf("cannot open second output file distAF %s\n", argcv[5]); + return -1; + } + + filtered_vcfFile=fopen(argcv[6],"w"); + if (filtered_vcfFile == NULL) { + printf("cannot open second output file filteredVCF %s\n", argcv[6]); + return -1; + } + + printf("get_thr_ploidy\n"); + + // initiate zero vector for histograme + for(i=0;i<100;i++)// + { + histAF[i]=0; + } + + + //6 fields of input file,CSV reading + while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1)// && canWriteAF == 1) + // read the Read Title + { + if( n != NRID ) // incorrect format + { + firstSixAreOK = 0; + break; + } + + count_beforeF++; + sumDbefore=sumDbefore+D; + + + // f1 Josie : filter for DP4 separately for ref and alternative alleles + + if ( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA)// && D < 10)// 0 ref are ok + { + + // compute AAF histo + AF = GetAf(DP4); + histAF[AF-1]++; + + // for EACH POSITION, compute actual depths sDP4 after thrRA filtering: + + // average run depth after Q25, sum(DP4) across pos,= sumDP4 + count_afterF++; + sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3];// for one position + sumDP4=sumDP4+sDP4;//for av + sumDP4_2=sumDP4_2+sDP4*sDP4;//for std + + Q30frac=sDP4/D;// how actual quality Depth differs from default + + if( fprintf(filtered_vcfFile,"%d,%d,%d,%d,%d,%d\n",pos, D, DP4[0], DP4[1],DP4[2], DP4[3]) <= 0 ) { + canWriteDistrib = 0; + //"%d,%d,%d,%d,%d,%d", pos, D, DP4[0], DP4[1],DP4[2], DP4[3])) + } + + }// end filter DP4 + + // removing space symbols before carriage return + do { + n = fgetc (extract_vcfFile); + } + while ((char)n != '\n'); + + } //END of outer while loop for all vcfq file, distrAF is computed + + // check ploidy + ind=GetMax(histAF); + printf("percent of max AAF= %d\n", ind); + + ind50=GetMax50(histAF); + printf("percent of max AAF around 50= %d\n", ind50); + + ploid=ploidy(ind); + printf("ploidy= %d\n", ploid); + //====================== output2 write AF + for(i=0;i<101;i++) + { + if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { + canWriteDistrib = 0; + } + + }// for i=100 cycle + // compute average depth : default and actual after 1,1 and q25 filtering + + // stats after 1,1 filtering + + avDP4=ceil(sumDP4/count_afterF);//actual average cov after Filt + //stdDP4=ceil(sqrt(sumDP4_2/count_afterF));// why did I approximate this like that?... + stdDP4=ceil(sqrt(sumDP4_2/count_afterF -avDP4*avDP4));// + avDbefore=sumDbefore/count_beforeF; + perc_left=(100.0*count_afterF/count_beforeF); + printf("average Depth after filtering= %d\n", avDP4); + printf("std Depth appr= %d\n", stdDP4); + printf("percent data left after q25 & min_depth filtering thrR thrA = %.2f\n", perc_left); + + // adjust actual depth to Bad (deep covered) regions and recommend thr RA + + avDP41=avDP4; + ratio=(sqrt(sumDP4_2/count_afterF))/(sumDP4/count_afterF);//stdDP4/avDP4; + printf("cv actual depth = %.2f\n", ratio); + if ( ratio > 1.5) //bad regions + {avDP41=ceil(0.8*avDP4); + printf("moderately deep covered regions here\n"); + printf("adjusted(avDP4) = %d\n", avDP41); + } + if (ratio > 3) //very bad regions + {avDP41=ceil(0.7*avDP4); + printf("extremely deep covered regions here\n"); + printf("adjusted(avDP4) = %d\n", avDP41); + } + + // recommendation for thrRR AA + if ( avDP41 > 0 && avDP41 < 4 ) + { thRR = 100;thAA=100;//stupidly high? + printf("better not to bother to look for mixture: too low actual coverage=%d\n",avDP4); + } + if ( avDP41 >= 4 && avDP41 < 10 ) + { thRR = 1;thAA=1; + } + if ( avDP41 >= 10 && avDP41 < 70 ) + { thRR = 2;thAA=2; + } + if ( avDP41 >= 70 && avDP41 < 90 ) + { thRR = 3;thAA=2; + } + if ( avDP41 >= 90 ) + { thRR = 3;thAA=3; + } + + // fill in the output files + + //====================== output1 write thrs, avD, stdD and ploidy + fprintf(thrFile,"%d %d %d %d %d\n",thRR,thAA,avDP4,stdDP4,ploid); + + fclose(extract_vcfFile); + fclose(thrFile); + fclose(distribFile); + fclose(filtered_vcfFile); + + // checking write/read + if( firstSixAreOK == 0)// || canWriteFilteredVCF == 0) + { + printf ("Error during execution. Details: \n"); + printf ("\tfirstSixAreOK %d\n", firstSixAreOK); + printf ("\tcanWriteDistrib %d\n", canWriteDistrib); + printf ("\tcanWriteFilteredVCF %d\n", canWriteFilteredVCF); + printf ("Execution aborted\n"); + return -1; + } + + printf(" done get_thr_ploidy.\n"); + return 0; +}//main + +//////////////////////////////////////////////////// +// Calculates AF=alternative allele frequency from data=DP4 counts +// input - array (4-vector) of DP4 counts +//output -one float number from (0,1) +//////////////////////////////////////////////////// +int GetAf (int data[]) +{ + float Isum,AF1; + int i,AF; + + Isum = data[0]; + for (i=1; i<4; i++)//there are four conts for each base + { + Isum += data[i]; + } + AF1=(data[2]+data[3])/Isum; + AF=(long) (100*AF1+0.5); + //(long) (sig+0.5) + return AF; +} +//--------------fins max value and its bin/index; returns bin; data is histo + +int GetMax (int data[]) +{ + int Imax; + int i,ind; + + Imax = data[0]; + for (i=1; i<80; i++)// for histo: not latest bins (for bad reference ~100) + { + if( Imax < data[i]) + { + Imax = data[i]; + ind=i;// bin/index for max value + } + } + return ind; +} + +//-----------------------identify ploidy +int ploidy (int ind) +{ + + int ploid; + + ploid=1; + + if ( ind >40 && ind < 60) + ploid=2; + + return ploid; +} + +//-----------------------double check local max around 50+-10 + +int GetMax50 (int data[]) +{ + int Imax; + int i,ind; + + Imax = data[0]; + for (i=40; i<60; i++)// for histo: not latest bins (for bad reference ~100) + { + if( Imax < data[i]) + { + Imax = data[i]; + ind=i;// bin/index for max value + } + } + return ind; +} From 4d6b3f64f7e43baa0a73aac87f6fa04ad805885b Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Wed, 24 Feb 2016 11:53:10 +0000 Subject: [PATCH 07/17] changes to build contamination binaries --- src/Makefile.am | 4 ++-- src/configure.ac | 5 +++-- src/contamination/Makefile.am | 11 +++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) create mode 100644 src/contamination/Makefile.am diff --git a/src/Makefile.am b/src/Makefile.am index 4e48a1c..6718cd0 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,2 +1,2 @@ - -SUBDIRS = shared calibration_pu predictor_pu spatial_filter error_analysis +ACLOCAL_AMFLAGS = -I ac_stubs +SUBDIRS = shared calibration_pu predictor_pu spatial_filter error_analysis contamination diff --git a/src/configure.ac b/src/configure.ac index b5b2a61..4436ee5 100644 --- a/src/configure.ac +++ b/src/configure.ac @@ -1,11 +1,12 @@ dnl Process this file with autoconf to produce a configure script. AC_INIT(pb_cal, 8.0.0) AC_CONFIG_HEADERS(pb_config.h) -AM_INIT_AUTOMAKE +AM_INIT_AUTOMAKE([foreign]) AM_MAINTAINER_MODE dnl Checks for programs. AC_PROG_CC +AC_PROG_LIBTOOL AC_PROG_INSTALL AC_PROG_RANLIB @@ -30,4 +31,4 @@ AX_LIB_IO_LIB(1.12.1, AX_LIB_ZLIB(,[have_zlib=yes], [AC_MSG_ERROR([Abort: no zlib. Please rerun configure using the --with-zlib=DIR option.])]) AX_LIB_SAMTOOLS(0.1.3,[have_bam=yes],[AC_MSG_ERROR([Abort: no samtools. Please rerun configure using the ---with-samtools=DIR option.])]) -AC_OUTPUT(Makefile shared/Makefile calibration_pu/Makefile predictor_pu/Makefile spatial_filter/Makefile error_analysis/Makefile) +AC_OUTPUT(Makefile shared/Makefile calibration_pu/Makefile predictor_pu/Makefile spatial_filter/Makefile error_analysis/Makefile contamination/Makefile) diff --git a/src/contamination/Makefile.am b/src/contamination/Makefile.am new file mode 100644 index 0000000..d65b356 --- /dev/null +++ b/src/contamination/Makefile.am @@ -0,0 +1,11 @@ +bin_PROGRAMS = get_mixture get_thr_ploidy + +get_mixture_SOURCES = get_mixture.c + +get_mixture_LDADD = @IO_LIB_LDFLAGS@ @SAMTOOLS_LDFLAGS@ -L../shared -lhelper_funcs + +get_thr_ploidy_SOURCES = get_thr_ploidy.c + +get_thr_ploidy_LDADD = @IO_LIB_LDFLAGS@ @SAMTOOLS_LDFLAGS@ -L../shared -lhelper_funcs + +AM_CPPFLAGS = @IO_LIB_CFLAGS@ @SAMTOOLS_CFLAGS@ -I@top_srcdir@/shared From 95981cf05822b03c2760c49499f1612231034de2 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Fri, 26 Feb 2016 12:03:59 +0000 Subject: [PATCH 08/17] more realistic likelihood of a mixture --- src/contamination/get_mixture.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index a76ca79..2ac94e4 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -3,7 +3,9 @@ * Last edited: - 19 Jan confidences + 26 Feb smake likelihood a little bigger: sk=sk*0.9, instead of sk*sk + + Jan confidences mix1 conf1 low freq allele mixture <25% mix2 conf2 high freq allele mixture (25% , 50%) @@ -856,14 +858,14 @@ float Confidence25(float sk25,float sk75,int mix25,int mixc25, float thr25, floa //if (mixc25 >0 ) EC1=1.0; printf(" exist complem mix25 = %.2f\n", EC1); - lik1=sk25*sk25; + lik1=sk25*0.9; if (sk25 >= thr25) {lik1=sk25;} printf(" likely mix25 = %.2f\n", lik1); - likc1=sk75*sk75; + likc1=sk75*0.9; if (sk75 >= thr75) {likc1=sk75;} printf(" likely complem mix25 = %.2f\n", likc1); @@ -911,12 +913,12 @@ float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float th printf(" exist complem mix25_50 = %.2f\n", EC1); - lik1=sk25*sk25; + lik1=sk25*0.9; if (sk25 < thr25)// humph 25-20 is larger than humph25=sk25! {lik1=sk25;} printf(" likely mix25_50 = %.2f\n", lik1); - likc1=sk75*sk75; + likc1=sk75*0.9; if (sk75 < thr75) {likc1=sk75;} printf(" likely complem mix25_50 = %.2f\n", likc1); From 0ef55eead6c138d5f7bca6b79963d8806ce1c4e1 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Mon, 14 Mar 2016 15:04:30 +0000 Subject: [PATCH 09/17] introduced three mode's option: low, middle, high AAF with corresponding confidences --- src/contamination/get_mixture.c | 768 +++++++------------------------- 1 file changed, 166 insertions(+), 602 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index 2ac94e4..f2a3a1a 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -3,28 +3,7 @@ * Last edited: - 26 Feb smake likelihood a little bigger: sk=sk*0.9, instead of sk*sk - - Jan confidences - mix1 conf1 low freq allele mixture <25% - mix2 conf2 high freq allele mixture (25% , 50%) - - - 11 Jan 2016 : look at ploidy, get hard codes inside corresponding functions - void get_bins_ploidy (int bins_ploid[], int plo);// // how to consider first n25 depending on ploidy - void get_skew_ploidy (float thr_sk[], int plo);// // how to consider skew25, skew75 depending on ploidy - - 5 January 2016: picks up automaticaly if it is diploid or haploid - 17 December 2015: two modes 1. less 25% 2. b/w 30 and 40 percentage - 13 June -output sk25,sk75 - 12 june-- final mixture file; modified as output name.mix: mix_percentage=17 -possible_mix_percentage= 17 -confidence_nonzeroMix=0.64 -AvActDepth=52 (avDP4) -min_depthR=2 -min_depthA=2 - - 30 May last 75 bins implemented + 14 March - three modes and likelihoods: low, middle , high *------------------------------------------------------------------- @@ -34,7 +13,6 @@ min_depthA=2 * Exported functions: * HISTORY: -skewness to first 25% of AAF ----------introduced 10 March computes AF (alternative to Ref allele frequency) for each error base call from extracted vcf applies minimum depth threshold, both for Ref and Alternative alleles fwd rev (total min depth will be @@ -74,26 +52,15 @@ usage:./get_mixture rams.thr extracted.vcf name.mix name.distrAF #define NPAR 5 // masig Number of PARameters and arguments #define Nthr 5 // Number of thresholds and muD1 stdD1 12 June + ploidy=1 (haploid) (or 2-diploid) - // ******** declarations of functions int GetAf (int []);// computes AF percentage for a variant position -void get_bins_ploidy (int bins_ploid[], int plo);// // how to consider first n25 depending on ploidy -void get_skew_ploidy (float thr_sk[], int plo);// // how to consider skew25, skew75 depending on ploidy float GetMu( int data[]); float GetStd( int data[]); -float GetSkew25( int data[],int n25, int n49); -float GetSkew75( int data[],int n51, int n75); - -float Confidence25(float sk25,float sk75,int mix25,int mixc25,float thr25,float thr75, float avDP4); -float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50,float thr25,float thr75, float avDP4); - - -int GetMaxPeak25(int data[], int n25); -int GetPeakComplement25(int data[], int n75); - -int GetMaxPeak25_50(int data[], int n25, int n49); -int GetPeakComplement25_50(int data[], int n51, int n75); +void skewnesses (int data[], float sk[], float skc[], int st[], int stc[]); +void MixMode(int mix[], int st[], int data[], int d); +int GetMaxPeakInterval(int data[], int n1, int n2);//------max peak from the interval +void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4); //=====================================================MAIN @@ -107,17 +74,11 @@ int main (int argc, char *argcv[]) { FILE *thrFile, *extract_vcfFile, *mixFile, *distribFile ; //file handles int n,i; - int dist;// distance bewteen mix and (100-mix_compl): should be 0 ideally // ---------- params from input1 rams(now four+1 values thrR thrA muD stdD ploidy) int muD,stdD; int thR,thA, ploid,plo; - int THR[2];// compute from input1 thr file - int msD[2]; - int bins_ploid[4];// 4-vector of bins depending on ploidy - float thr_sk[2];//2-vector of sk25,sk75 depending on ploidy - //---------------params from input2 vcfq int DP4[4];// @@ -127,34 +88,28 @@ int main (int argc, char *argcv[]) { int avDP4;// of actual sDP4 depth after RA & bad regions filtering int sumDP4=0;//across all pos for mu std int sumDbefore=0; - float avDbefore; float sDP4; - float mix25_left; + float filtered_left; - // ----------------settings for skewness and their likelihoods: hardcoded within functions - // mode 1: mix<=25% - int n25,n49; - int n51,n75; - // mode 2 : 30< mix < 45 - // skewness thresholds - float thr25,thr75; + // ----------------settings for skewnesses and their likelihoods: hardcoded within functions + int d;//distance , bin width for hist to compute skewness //---------------results to compute: int count_afterF=0;// after filtering with min depths int count_beforeF=0;// before filtering with min depths - int mix25,mixc25; - int mix25_update; - int mix25_50,mixc25_50; int AF; - float conf,conf25,conf25_50;//confidence (0,1) - float sk25,sk75;//skewness to first 25% of AAF ----------introduced 10 March - float likely75,likely25; //arrays int histAF[100];// to store mafs percentages + int peak[100]; + int perc_peak[100]; + int st[4], stc [4];// starts of summing intervals for histo + float sk[3],skc[3];// three skewnesses for mode0,1,2 + int mix[3],mixc[3]; //mixtures three modes and their complements + float Lik[3];// likelihood or confidences three modes //-----------------------------------------open files for read/write if(argc < NPAR)//four input_output files are submitted @@ -191,25 +146,33 @@ int main (int argc, char *argcv[]) { printf("get_mixture: parameters\n"); // ===SETTINGS parameters for skewness measurements and peak consideretion: if Diploid then more noise around 50% - dist=10; - // initiate zero vector for histograme + + // initiate zero vector for histograme and its peaks for(i=0;i<100;i++)// { histAF[i]=0; + peak[i]=0; + perc_peak[i]=0; } - // initiate zero vectors for thresholds and constant for ploidy - for(i=0;i<1;i++)// - { - THR[i]=0; - msD[i]=0; - thr_sk[i]=0.0; - } + plo=0;// initial ploidy - //initial bins - for(i=0;i<4;i++)// + d=11; + //starts for skew + for(i=0;i<4;i++)// + { + st[i]=0; + stc[i]=0; + } + + // initial skwnesses, mix, conf for modes 0,1,2 + for(i=0;i<3;i++)// { - bins_ploid[i]=0; - } + sk[i]=0.0; + skc[i]=0.0; + mix[i]=0; + mixc[i]=0; + Lik[i]=0.0; + } // scan thr file and assign current precomputed thresholds mu sd ploidy for filtering while( (n = fscanf(thrFile,"%d %d %d %d %d", &thR, &thA,&muD, &stdD,&ploid)) >= 0) @@ -220,38 +183,15 @@ int main (int argc, char *argcv[]) { printf ("corrupted input thrFile format\n"); return -1; } - THR[0]=thR; - THR[1]=thA; - msD[0]=muD; - msD[1]=stdD; + plo=ploid; } printf("ploidy= %d\n", plo); - //===============================get precomputed bins for skewness dep on ploidy - get_bins_ploidy(bins_ploid,plo);// - - for(i=0;i<4;i++){ - printf("ploid_bin= %d\n", bins_ploid[i]); - - } - // mode 1: mix<25% diploid - n25=bins_ploid[0]; - n49=bins_ploid[1]; - n51=bins_ploid[2]; - n75=bins_ploid[3]; - - get_skew_ploidy (thr_sk, plo); - for(i=0;i<2;i++){ - printf("ploid_skew= %.2f\n", thr_sk[i]); - - } - // give value for thr25= 0.55 for human diploid; 1.0, 0.8 for pathogens-haploid - thr25=thr_sk[0]; - thr75=thr_sk[1];// complementary peak's (100-mix) skewness - //6 fields of input2 file // no PV4: do it to compute AAF=AF +// read main vcf file + //6 fields of input2 file // no PV4: do it to compute AAF=AF while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1 && canWriteMix == 1) @@ -266,7 +206,7 @@ int main (int argc, char *argcv[]) { // f1 Josie : filter for DP4 separately for ref and alternative alleles+ filter abnormal Depth - if ( DP4[0] >= THR[0] && DP4[1]>= THR[0] && DP4[2]>= THR[1] && DP4[3]>= THR[1] && D < (msD[0]+msD[1]))// 0 ref alt are ok + if ( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA && D < (muD+2*stdD))// 0 ref alt are ok { AF = GetAf(DP4); histAF[AF-1]++; @@ -298,65 +238,34 @@ int main (int argc, char *argcv[]) { }// for i=100 cycle //=======================================analysis of AF distribution to find mixture sample - sk25=GetSkew25(histAF,n25,n49); - sk75=GetSkew75(histAF,n51,n75); - -//===================================mode 1: -// find what percentage AAF gives first max peak among first 25% AAF - mix25=GetMaxPeak25(histAF,n25);//peaks in first 25% AAF - mixc25=GetPeakComplement25(histAF,n75);// complement to 25% peaks (100-mix25) - - //-----==================================mode2 - mix25_50=GetMaxPeak25_50(histAF,n25,n49);//peaks in (25,49) percentage AAF - mixc25_50=GetPeakComplement25_50(histAF,n51,n75); - - // stats after filtering :compute average depth :default and after filtering avDP4=ceil(sumDP4/count_afterF);//actual average cov after Filt - mix25_left=(100.0*count_afterF/count_beforeF);// percentage of filteerd out regions ( ? do we need?) - -// make decision about on mixture percentage: update it to zero if needed - // mix25_update=MakeDecisionMix(mix25, sk25, thr25,sk75,thr75); + filtered_left=(100.0*count_afterF/count_beforeF);// percentage of filteerd out regions ( ? do we need?) + printf("percentage left after filtering = %.2f\n",filtered_left); + printf("coverage after filtering = %d\n",avDP4); + //skewnesses (histAF, sk, skc); + skewnesses (histAF, sk, skc, st,stc); + for (i=0;i<3;i++) + printf("sk = %.2f\n",sk[i]); + for (i=0;i<3;i++) + printf("skc = %.2f\n",skc[i]); - - // confidences of mixture estimating - - conf25=Confidence25(sk25,sk75,mix25,mixc25, thr25, thr75, avDP4); - conf25_50=Confidence25_50(sk25,sk75,mix25_50,mixc25_50, thr25, thr75, avDP4); + // mixtures per mode; confidences per mixture printf("Results:\n"); - //printf("updated percentage mixture= %d\n", mix25_update); - printf("possible low percentage mixture= %d\n", mix25); - //printf("possible complement low percentage mixture= %d\n", mixc25); - - //if (( mix25-100+mixc25)0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] - { - - peak[k] = data[i]; - mix25_peak[k]=i; - k=k+1; + st[i]=st[i-1]+d; + stc[i]=stc[i-1]+d; + //printf(" starts=%d\n",st[i]); + //printf(" starts com=%d\n",stc[i]); + } - } - } -// found max peak/s if any exist + //2. sums between starts: 3 sums and 3 sums complementary - if (peak[0]>0) - { - Imax = peak[1]; - - for (i=1; i0)-exist any peak <25% - printf("Mode 2: percentage giving this max = %d\n", mix25_50); - return mix25_50; - -}//================================================================== -//----------------------max from the first 25 - int GetMaxPeak25(int data[], int n25) - { - - // input data=AAF - int Imax; - int i,mix25,k; - int peak[n25], mix25_peak[n25]; - - // initialise mix25, in case no peaks exist - - mix25=0.0; - -// find max peak among peaks, first 25 percentage only, data is maf(AAF) vector - -//what if no peaks? - peak[0]=0; - - // find peaks - k=0; - for (i=1; i0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] + for(i=0;i < 4;i++) + { + n1=st[i]; + n2=stc[i]; + s[i]=data[n1]; + sc[i]=data[n2]; + for (j=n1+1;j0) - { - Imax = peak[1]; - - for (i=1; i0)-exist any peak <25% - printf("Mode1: percentage giving this max = %d\n", mix25); - return mix25; - -} -//==================================================================================== - -//=================================== -float GetSkew25( int data[],int n25, int n49) - { - - float sk25; - int i,k; - int less25; - int bw25_50; - // sum up from 0 to n25 % - less25=data[0]; - for(i=0;i0 ) - { - sk25=1.0*less25/bw25_50;// if proportion of less 25% AF is high (more 1?), more likely to be a mixture - } - return sk25; -} -// ---------------------float GetSkew( int data[],int n51, int n75); -float GetSkew75( int data[],int n51, int n75) - { - - float sk75; - int i,k; - int more75; - int bw75_50; - int last; - - last=5; - // sum up from n75 to almost 100% - more75=data[n75]; - for(i=n75;i<(100-last);i++) - { - more75+=data[i]; - } - // sum 50 to 75 - bw75_50=data[n51]; - for(i=n51;i0 ) - { - sk75=1.0*more75/bw75_50;// if proportion of less 75% AF is high (more 1?), more likely to be a mixture - } - return sk75; -} - - -// make decision if mix25 noise or mixture: update it to zero if needed -//if mix25 is more than n25 (here =25), it is likely to be diploid allele noise : 17 Nov+ amount AF<25% skewed (too high) - - -//int MakeDecisionMix(int mix25, float sk25, float thr25)---------------------to update mix -int MakeDecisionMix(int mix25, float sk25, float thr25,float sk75,float thr75)// update percentage mix + //3. // compute skewnesses -{ - int mix25_update; - mix25_update=0.0; - if ((sk25 >= thr25) && (sk75 >= thr75))// thr of mixture=mix25 ;thr25 was 0.95 for path, 0.62 for human - { - mix25_update=mix25; - //printf("percentage mixture= %d\n", mix25); - } + for (i=0;i<3;i++) + { + if (s[i+1]>0) sk[i]=s[i]/s[i+1]; + if (sc[i]>0) skc[i]=sc[i+1]/sc[i]; + } - return mix25_update; } +//2============================================ -//==========================================fill the thr bins for skewness -void get_bins_ploidy (int bins_ploid[], int plo)// -{ - if (plo>=2){ - - bins_ploid[0]=24; - bins_ploid[1]=45; - bins_ploid[2]=55; - bins_ploid[3]=77; - - // mode 1: mix<25% diploid - //n25=26; - // n49=46; - // n51=60; - // n75=83; - } - - if (plo<2){ - - bins_ploid[0]=26; - bins_ploid[1]=50; - bins_ploid[2]=51; - bins_ploid[3]=75; +void MixMode(int mix[], int st[], int data[], int d) - // mode 1: mix<25% haploid - //n25=26; - // n49=50; - // n51=51; - // n75=79; - } -} -//--------------------skewness dep on ploidy -void get_skew_ploidy (float thr_sk[], int plo)// // how to consider skew25, skew75 depending on ploidy { - if (plo<2) - { - - thr_sk[0]=0.8; - thr_sk[1]=0.6; - } - - // give value for thr25= 0.55 for human diploid; 1.0, 0.8 for pathogens-haploid - //thr25=0.25; - //thr75=0.5;//complementary skewness threshold - - - if (plo>=2) - { - - thr_sk[0]=0.5; - thr_sk[1]=0.4; - } + int i; + int n1,n2; + int m; + for (i=0;i< 3;i++) + { + n1=st[i]; + n2=n1+d; + m=GetMaxPeakInterval(data, n1, n2);//2.1 function + mix[i]=m; + printf(" mixes in modes=%d\n",mix[i]); + } } -//==================================== - int GetPeakComplement25(int data[], int n75) +//2.1================================================================== +//----------------------max peak from the interval + int GetMaxPeakInterval(int data[], int n1, int n2) { + // input data=AAF + int max_peak; + int i,mixx,k,j; + int peak[100], perc_peak[100]; + // initialise mix, in case no peaks exist + mixx=0.0; +// find max peak among peaks in the interval only, data is maf(AAF) vector - // input data=AAF - int Imax; - int i,mixc25,k,n; - int mixcc25;//100-mix - int peak[25], perc_peak[25]; - - // initialise mixc25, in case no peaks exist - - mixc25=0.0; - mixcc25=0.0; - -// find max peak among peaks, first 25 percent only, data is maf(AAF) vector - n=6;// not look at last n bins- noise for 100% SNPs //what if no peaks? peak[0]=0; + perc_peak[0]=0; // find peaks k=0; - for (i=n75; i<100-n; i++) + for (i=n1; i0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] + if ( ((data[i]-data[i-1]) >0 ) && ((data[i+1]-data[i]) < 0))// && ( max_peak < data[i]))//here peak in [i] { peak[k] = data[i]; perc_peak[k]=i; + //printf("peaks= %d\n", peak[k]); + //printf("peaks percent= %d\n", perc_peak[k]); k=k+1; - } } -// find max peak/s if any exist +// found max peak/s if any exist - if (peak[0]>0) +//if one peak + if (k==1) { - Imax = peak[1]; - - for (i=1; i0)-exist any peak <25% - - mixcc25=100-mixc25; - printf("Mode 1 com:percentage giving this complement max = %d\n", mixcc25); - return mixcc25; - -} -//=========================== -//================================================================== -//----------------------max peak from the 50- 75 - int GetPeakComplement25_50(int data[], int n51, int n75) - { - - // input data=AAF - int Imax; - int i,mixc25_50,k,n; - int mixcc25_50;// 100-mixc - int peak[25], perc_peak[25]; - - // initialise mixc25_50, in case no peaks exist - - mixc25_50=0.0; - mixcc25_50=0.0; - -// find max peak among peaks, first 25 percent only, data is maf(AAF) vector - -//what if no peaks? - peak[0]=0; - - // find peaks - k=0; - for (i=n51; i0 ) && ((data[i+1]-data[i]) < 0))// && ( Imax < data[i]))//here peak in [i] - { - - peak[k] = data[i]; - perc_peak[k]=i; - k=k+1; - - } - } -// find max peak/s if any exist + max_peak = peak[0]; + mixx=perc_peak[0]; + // printf("Mode i:max peak = %d\n", max_peak); + // printf("Mode i: percentage giving this max = %d\n", mixx); + } - if (peak[0]>0) + if (k>1)//more than one peak { - Imax = peak[1]; + max_peak = peak[0]; - for (i=1; i0)-exist any peak <25% - mixcc25_50=100-mixc25_50; - printf("Mode 2 com: percentage giving this complement max = %d\n", mixcc25_50); - return mixcc25_50; - -} -// ============================================MODE1 -// confidence of mixture <25%-mode1 -float Confidence25(float sk25,float sk75,int mix25,int mixc25, float thr25, float thr75, float avDP4) -{ -//likelihoods - - - float conf,lik1,likc1; - float E1,EC1; - int peak_dist; - - peak_dist=8; - - //-------------------- - printf("Mode1:\n "); - conf=1.0;//initial - - E1=0.0; - if (mix25 >0 ) E1=1.0;// peak <25 does Exist - printf("exist mix25 = %.2f\n", E1); - - EC1=0.0; - if (abs(mixc25-mix25) < peak_dist ) // close enough - EC1=1.0; - //if (mixc25 >0 ) EC1=1.0; - printf(" exist complem mix25 = %.2f\n", EC1); - - lik1=sk25*0.9; - if (sk25 >= thr25) - {lik1=sk25;} - - - printf(" likely mix25 = %.2f\n", lik1); - - likc1=sk75*0.9; - if (sk75 >= thr75) - {likc1=sk75;} - printf(" likely complem mix25 = %.2f\n", likc1); - - // larger any how! - conf=(1-1/avDP4)*(E1+EC1)*0.5*lik1*likc1; - - // if (abs(mixc25-mix25) < peak_dist )// close enough - // conf=conf*1.1; - - if (conf>1) - {conf=1.0;} - - return conf; -} + return mixx; +}//================================================================== -//====================== -// confidence of mixture 25%-50% mode2 -float Confidence25_50(float sk25,float sk75,int mix25_50,int mixc25_50, float thr25, float thr75, float avDP4) +//--confidences modes0,1,2 +void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4) { -//likelihoods - - - float conf,lik1,likc1; - float E1,EC1; - int peak_dist; + int i; + float l,lc,lp; - peak_dist=8; - - //-------------------- - printf("Mode2:\n "); - conf=1.0;//initial - - E1=0.0; - if (mix25_50 >0 ) E1=1.0;// peak <25 does Exist - printf("exist mix25_50 = %.2f\n", E1); - - EC1=0.0; - // if (mixc25_50 >0 ) EC1=1.0; - - if (abs(mixc25_50-mix25_50) < peak_dist ) // close enough - EC1=1.0; - - printf(" exist complem mix25_50 = %.2f\n", EC1); - - lik1=sk25*0.9; - if (sk25 < thr25)// humph 25-20 is larger than humph25=sk25! - {lik1=sk25;} - printf(" likely mix25_50 = %.2f\n", lik1); + for (i=0;i<3;i++) + { + lp=0.0; + if (mixc[i]>0) + lp=skc[2-i]; + l=sk[i]; + lc=skc[2-i]; + Lik[i]=(1-1/avDP4)*(l+lc+lp)/3; + printf("Mode i: likelihood of mix = %.2f\n", Lik[i]); - likc1=sk75*0.9; - if (sk75 < thr75) - {likc1=sk75;} - printf(" likely complem mix25_50 = %.2f\n", likc1); +} +} - // larger any how! - conf=(1-1/avDP4)*(E1+EC1)*0.5*lik1*likc1; - if (conf>1) - {conf=1.0;} - return conf; -} -//------------------------- -int let2int(char letter) -{ - int result; - //initiate - result=0; - if (letter=='a' || letter=='A') - result=1; - if (letter=='c' || letter=='C') - result=2; - if (letter=='g' || letter=='G') - result=3; - if (letter=='t' || letter=='T') - result=4; - return result; - } From 306f28f8d573bc679c4a457720840d3a4c5c40f8 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Thu, 14 Apr 2016 07:48:57 +0100 Subject: [PATCH 10/17] introduced three modes; merged outputs with Steven's --- src/contamination/get_mixture.c | 231 ++++++++++++++++++-------------- 1 file changed, 131 insertions(+), 100 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index f2a3a1a..54c17a3 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -3,6 +3,8 @@ * Last edited: + 13 April- added Steven's corrections and outputs + April - if empty vcfq file; if small data in histo thr_tot=200 variants 14 March - three modes and likelihoods: low, middle , high @@ -24,9 +26,13 @@ input1 thrfile (thRR thAA mu std ploidy ) input2: extracted vcf: pos, D , DP4 (DP4 is four fields) -output1: mix_conf AF.txt= -mixture confidence -12 0.87 +output1: mix= +mixture mode1 likelihood1 +12 0.87 +mixture mode2 likelihood2 +12 0.87 +mixture mode3 likelihood3 +12 0.87 output2: .distrAF=histogram of AF with significance for each bin % count @@ -36,12 +42,13 @@ output2: .distrAF=histogram of AF with significance for each bin 11 780 -usage:./get_mixture rams.thr extracted.vcf name.mix name.distrAF +usage:./get_mixture name.thr extracted.vcf name.mix name.distrAF */ #include -//#include "conio.h" +#include +#include #include #include #include @@ -58,7 +65,8 @@ float GetMu( int data[]); float GetStd( int data[]); void skewnesses (int data[], float sk[], float skc[], int st[], int stc[]); -void MixMode(int mix[], int st[], int data[], int d); +void MixMode(int mix[], int st[], int data[], int dist); +void MixModeComplement(int mixc[], int stc[], int data[], int dist); int GetMaxPeakInterval(int data[], int n1, int n2);//------max peak from the interval void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4); @@ -66,7 +74,7 @@ void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4); int main (int argc, char *argcv[]) { - // flags + // ---------------------flags int firstSixAreOK = 1; int canWriteMix = 1; int canWriteDistrib = 1; @@ -94,7 +102,7 @@ int main (int argc, char *argcv[]) { // ----------------settings for skewnesses and their likelihoods: hardcoded within functions - int d;//distance , bin width for hist to compute skewness + int dist;//distance , bin width for hist to compute skewness //---------------results to compute: int count_afterF=0;// after filtering with min depths @@ -102,169 +110,174 @@ int main (int argc, char *argcv[]) { int AF; - //arrays + //---------------------------arrays int histAF[100];// to store mafs percentages - int peak[100]; - int perc_peak[100]; int st[4], stc [4];// starts of summing intervals for histo float sk[3],skc[3];// three skewnesses for mode0,1,2 int mix[3],mixc[3]; //mixtures three modes and their complements float Lik[3];// likelihood or confidences three modes -//-----------------------------------------open files for read/write + //-----------------------------------------open files for read/write + if(argc < NPAR)//four input_output files are submitted - { - printf("not enough of parms |input_output files\n"); - printf("usage:./get_mixture input1.thr input2.vcfq output1.mix output2.distr\n"); - return -1; + { + fprintf(stderr, "not enough of parms |input_output files\n"); + fprintf(stderr, "usage:./get_mixture input1.thr input2.vcfq output1.mix output2.distr\n"); + exit(EXIT_FAILURE); } thrFile=fopen(argcv[1],"r"); - if (thrFile == NULL) { - printf("cannot open first input _threshold file %s\n", argcv[1]); - return -1; + if (thrFile == NULL) { + fprintf(stderr, "cannot open first input _threshold file %s: %s\n", argcv[1], strerror(errno)); + exit(EXIT_FAILURE); } + extract_vcfFile=fopen(argcv[2],"r"); - if (extract_vcfFile == NULL) { - printf("cannot open first input _.vcfq file %s\n", argcv[2]); - return -1; + if (extract_vcfFile == NULL) { + fprintf(stderr, "cannot open first input _.vcfq file %s: %s\n", argcv[2], strerror(errno)); + exit(EXIT_FAILURE); } mixFile=fopen(argcv[3],"w"); - if (mixFile == NULL) { - printf("cannot open first output1 mix_conf file %s for writing\n", argcv[3]); - return -1; + if (mixFile == NULL) { + fprintf(stderr, "cannot open first output mix_conf file %s for writing: %s\n", argcv[3], strerror(errno)); + exit(EXIT_FAILURE); } distribFile=fopen(argcv[4],"w"); - if (distribFile == NULL) { - printf("cannot open second output file distAF %s for writing\n", argcv[4]); - return -1; + if (distribFile == NULL) { + fprintf(stderr, "cannot open second output distAF file %s for writing: %s\n", argcv[4], strerror(errno)); + exit(EXIT_FAILURE); } - printf("get_mixture: parameters\n"); + + fprintf(stderr, "get_mixture: parameters\n"); + // ===SETTINGS parameters for skewness measurements and peak consideretion: if Diploid then more noise around 50% - // initiate zero vector for histograme and its peaks + plo=0;// initial ploidy + dist=11; + avDP4 = 0;//initial average depth + + // -----------initiate zero vector for histograme and its peaks for(i=0;i<100;i++)// { histAF[i]=0; - peak[i]=0; - perc_peak[i]=0; - } + } - plo=0;// initial ploidy - d=11; - //starts for skew + //--------------starts for skews and complement skews bins for(i=0;i<4;i++)// { st[i]=0; stc[i]=0; } - - // initial skwnesses, mix, conf for modes 0,1,2 - for(i=0;i<3;i++)// - { + // ------------initial skwnesses, mix, conf for modes 0,1,2 + for(i=0;i<3;i++)// + { sk[i]=0.0; skc[i]=0.0; mix[i]=0; mixc[i]=0; Lik[i]=0.0; - } - - // scan thr file and assign current precomputed thresholds mu sd ploidy for filtering - while( (n = fscanf(thrFile,"%d %d %d %d %d", &thR, &thA,&muD, &stdD,&ploid)) >= 0) - // until the end of the input threshold file - { - if (n != Nthr) // incorrect format input - { - printf ("corrupted input thrFile format\n"); - return -1; } - plo=ploid; - } - printf("ploidy= %d\n", plo); + //1. scan-read thr file and assign current precomputed thresholds mu sd ploidy for filtering + while( (n = fscanf(thrFile,"%d %d %d %d %d", &thR, &thA,&muD, &stdD,&ploid)) >= 0) + { + if (n != Nthr) // incorrect format input thr file + { + fprintf (stderr, "corrupted input thrFile format\n"); + exit(EXIT_FAILURE); + } + plo=ploid; + } + fprintf(stderr, "ploidy= %d\n", plo); -// read main vcf file - //6 fields of input2 file // no PV4: do it to compute AAF=AF + //2. scan-read main vcf file:6 fields of input2 file // no PV4: do it to compute AAF=AF while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1 && canWriteMix == 1) - { - if( n != NRID ) // incorrect format - { - firstSixAreOK = 0; - break; - } - count_beforeF++; - sumDbefore=sumDbefore+D; - - // f1 Josie : filter for DP4 separately for ref and alternative alleles+ filter abnormal Depth + if( n != NRID ) // incorrect format vcfq file + { + fprintf(stderr,"reading strange line= %d\n", n); + continue;//skips this line if not end + } + count_beforeF++; + sumDbefore=sumDbefore+D; + // filter for DP4 separately for ref and alternative alleles+ filter abnormal large Depth if ( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA && D < (muD+2*stdD))// 0 ref alt are ok { AF = GetAf(DP4); histAF[AF-1]++; - // for EACH POSITION, compute depths after thrRA filtering: + // for EACH POSITION, compute depths after thrRA and stupid Depth filtering: // average run depth after Q30, sum(DP4) across pos,= sumDP4 - count_afterF++; - sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3]; - sumDP4=sumDP4+sDP4; - }// end min_depth filter of DP4 + count_afterF++; + sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3]; + sumDP4=sumDP4+sDP4; + }// end filters and AAF computing for one line DP4 - // -------------output stupidly deep positions if (D >= (msD[0]+msD[1])) { fill the file + //? -------------output stupidly deep positions if (D >= (msD[0]+msD[1])) { fill the file - // removing space symbols before carriage return + // removing space symbols before carriage return do { n = fgetc (extract_vcfFile); } while ((char)n != '\n'); - } //END of outer while loop across vcfq file AF is computed! + } //END of while loop across vcfq file AF is computed! - //====================== output2 write AF - for(i=0;i<101;i++) - { - if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { - canWriteDistrib = 0; - } - }// for i=100 cycle -//=======================================analysis of AF distribution to find mixture sample + fprintf(stderr,"count after filtering = %d\n",count_afterF); -// stats after filtering :compute average depth :default and after filtering + if ( count_afterF >200 )// non empty, large enough after filtering vcfq + { + //=======================================analysis of AF distribution to find mixture sample + + // stats after filtering :compute average depth :default and after filtering avDP4=ceil(sumDP4/count_afterF);//actual average cov after Filt filtered_left=(100.0*count_afterF/count_beforeF);// percentage of filteerd out regions ( ? do we need?) - printf("percentage left after filtering = %.2f\n",filtered_left); - printf("coverage after filtering = %d\n",avDP4); + fprintf(stderr,"percentage left after filtering = %.2f\n",filtered_left); + fprintf(stderr,"coverage after filtering = %d\n",avDP4); //skewnesses (histAF, sk, skc); skewnesses (histAF, sk, skc, st,stc); for (i=0;i<3;i++) - printf("sk = %.2f\n",sk[i]); + fprintf(stderr,"sk = %.2f\n",sk[i]); for (i=0;i<3;i++) - printf("skc = %.2f\n",skc[i]); + fprintf(stderr,"skc = %.2f\n",skc[i]); // mixtures per mode; confidences per mixture - printf("Results:\n"); - MixMode(mix, st, histAF,d); - MixMode(mixc, stc, histAF,d); + fprintf(stderr,"Results:\n"); + MixMode(mix, st, histAF,dist); + MixModeComplement(mixc, stc, histAF,dist); Likely (Lik, sk,skc, mixc,avDP4); - //MAIN output mix file + }// if not empty data + +//==========================================================OUTPUTS + //====================== output2 write AF + for(i=0;i<101;i++) + { + if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { + canWriteDistrib = 0; + } + + }// for i=100 cycle + + //=================MAIN output1 mix file if (fprintf(mixFile,"mix low freq=%d\nconfidence low freq= %.4f\nmix middle freq= %d\nconfidence middle freq=%.4f\nmix high freq= %d\nconfidence high freq=%.4f\nAvActDepth=%d\nmin_depthR=%d\nmin_depthA=%d\n", mix[0],Lik[0],mix[1],Lik[1],mix[2],Lik[2],avDP4,thR,thA)<= 0 ) { canWriteMix = 0; - //return -1; + } fclose(distribFile); fclose(extract_vcfFile); @@ -277,10 +290,10 @@ int main (int argc, char *argcv[]) { printf ("\tcanWriteMix %d\n", canWriteMix); printf ("\tcanWriteDistrib %d\n", canWriteDistrib); printf ("Execution aborted\n"); - return -1; + exit(EXIT_FAILURE); } - printf(" get mixture is performed.\n"); + fprintf(stderr," get mixture is performed.\n"); return 0; }//main @@ -376,14 +389,14 @@ void skewnesses (int data[], float sk[], float skc[], int st[], int stc[]) { n1=st[i]; n2=stc[i]; - s[i]=data[n1]; - sc[i]=data[n2]; - for (j=n1+1;j Date: Thu, 14 Apr 2016 12:55:03 +0100 Subject: [PATCH 11/17] I modified reading vcfq file to skip a bad line# Please enter the commit message for your changes. Lines starting --- src/contamination/get_thr_ploidy.c | 283 ++++++++++++++++------------- 1 file changed, 160 insertions(+), 123 deletions(-) diff --git a/src/contamination/get_thr_ploidy.c b/src/contamination/get_thr_ploidy.c index 84993f0..7312bf1 100755 --- a/src/contamination/get_thr_ploidy.c +++ b/src/contamination/get_thr_ploidy.c @@ -1,8 +1,12 @@ -/* File: get_thr_ploidy.c // filters 1 1 vcfq file and computes av std Depth depending on Depth of vcfq file +/* File: get_thr_ploidy_vcfa_ch.c // filters 1 1 vcfq file and computes av std Depth depending on Depth of vcfq file * Authors: designed by Irina Abnizova (ia1) and edited by Steve Leonard(srl) * Last edited: + + 14 April- merge with Steve's changes + + March 2016- prepared to read chr names and letters; skip bad lines 18 Jan 2016 - real std 11 June: one thr file. output is e.g. 2 2 41 113 9 June 2014 :n1 from pipe of three:filters 1 1 vcfq file and computes av std Depth depending on Depth of vcfq file @@ -13,11 +17,11 @@ 29 April // computes suitable thresholds bases on avDP4=actual (not cov before as earlier!!! - 3 Feb-put thrR thrA into file thrD ;29 Jan 2014 - get autom thr, starting with default ones 1,5 - still old fields of vcf input + 3 Feb-put thrR thrA into file thrD ; + - get autom thr, starting with default ones 1,5 still old fields of vcf input *------------------------------------------------------------------- - * Description: computes computes threshold file depending on Depth of vcfq file + * Description: computes computes threshold file depending on Depth of vcfq file, and some extra outputs * Exported functions: * HISTORY: @@ -33,19 +37,23 @@ thrR thrA 3 1 output2: .distAF_1_1 output3: .vcfqF_1_1 filtered 1,1 + usage:./get_thrRA MIN_DEPTH_R(integer) MIN_DEPTH_A(integer) name.vcf ra_i_j.thr(Depth) name.distr name.vcfqF_1_1 */ #include +#include +#include #include #include #include + // ******** predefined user constants, -#define NRID 6 // Nunber of variant ids=fields: pos, Depth,DP4,(no PV4+maf) in extracted_vcf files +#define NRID 6 // Number of variant ids=fields: pos,Depth,DP4,(no PV4+maf) in extracted_vcf files #define NPAR 7 // Number of PARameters and arguments @@ -59,203 +67,211 @@ int GetAf (int []);// computes AF percent for a variant position int GetMax (int []);// percent giving max AAF int ploidy (int ind);// defines ploidy from results of GetMax ploid=2 if ~50% int GetMax50 (int data[]);// local max around 50% +int let2int(char letter); + int main (int argc, char *argcv[]) { - // flags + // -----error flags int firstSixAreOK = 1; int canWriteDistrib = 1; int canWriteFilteredVCF = 1; FILE *extract_vcfFile, *thrFile, *distribFile, *filtered_vcfFile; //file handles - int n,i,ind,ploid, ind50;// + // ---------------to read vcfa/vcfq + static const int line_size = 8192;//stupidly large + char line[line_size]; + + int k,i,ind,ploid, ind50;// int count_afterF=0;// after filtering with min depths int count_beforeF=0;// before filtering with min depths + + //----------from vcfa/vcfq input int DP4[4];// - int D,pos;// Depth, genome pos of error - int AF; + int pos,D;// Depth, genome pos of error - //arrays + //--------------arrays int histAF[100];// to store mafs percentages - //stats + //-----------------stats int sumDP4=0,sumDP4_2=0;//across all pos for mu std int sumDbefore=0; - float avDbefore; - float Q30frac; - float sDP4; + float sDP4; float perc_left; - //to compute - int avDP4,stdDP4,stdDP4_1,avDP41;// of actual sDP4 depth after 1 1 filtering + //-----------to compute fot outputs + + int AF; + int avDP4,stdDP4,avDP41;// of actual sDP4 depth after 1 1 filtering int thRR, thAA;// recommended based on actual depth avDP4 and stdDP4 float ratio; - if(argc < NPAR)//five input_output files are submitted + +//--------------------------opening + if(argc < NPAR)//five input_output files are submitted { - printf("not enough of parms |input_output files\n"); - printf("usage:./get_thr thrR thrA input2 output1 output2 output3\n"); - return -1; + fprintf(stderr, "not enough of parms |input_output files\n"); + fprintf(stderr, "usage:./get_thr thrR thrA input2 output1 output2 output3\n"); + exit(EXIT_FAILURE); } if (sscanf(argcv[1],"%d",&thR) == EOF) { - printf("Failed to convert the first argument %s to integer\n", argcv[1]); - } + fprintf(stderr, "Failed to convert the first argument %s to integer\n", argcv[1]); + } + if (sscanf(argcv[2],"%d",&thA) == EOF) { + fprintf(stderr, "Failed to convert the second argument %s to integer\n", argcv[2]); + } - if (sscanf(argcv[2],"%d",&thA) == EOF) { - printf("Failed to convert the second argument %s to integer\n", argcv[2]); - } + extract_vcfFile=fopen(argcv[3],"r"); + if (extract_vcfFile == NULL) { + fprintf(stderr, "cannot open first input _vcf.txt file %s: %s\n", argcv[3], strerror(errno)); + exit(EXIT_FAILURE); + } + // -----------------------three outputs - extract_vcfFile=fopen(argcv[3],"r"); - if (extract_vcfFile == NULL) { - printf("cannot open first input _vcf.txt file %s\n", argcv[3]); - return -1; - } - // two outputs thrFile=fopen(argcv[4],"w"); - if (thrFile == NULL) { - printf("cannot open first output file thrD %s\n", argcv[4]); - return -1; - } + if (thrFile == NULL) { + fprintf(stderr, "cannot open first output file thrD %s: %s\n", argcv[4], strerror(errno)); + exit(EXIT_FAILURE); + } distribFile=fopen(argcv[5],"w"); - if (distribFile == NULL) { - printf("cannot open second output file distAF %s\n", argcv[5]); - return -1; + if (distribFile == NULL) { + fprintf(stderr, "cannot open second output distAF file %s: %s\n", argcv[5], strerror(errno)); + exit(EXIT_FAILURE); } filtered_vcfFile=fopen(argcv[6],"w"); - if (filtered_vcfFile == NULL) { - printf("cannot open second output file filteredVCF %s\n", argcv[6]); - return -1; + if (filtered_vcfFile == NULL) { + fprintf(stderr, "cannot open third output filteredVCF file %s: %s\n", argcv[6], strerror(errno)); + exit(EXIT_FAILURE); } - printf("get_thr_ploidy\n"); + fprintf(stderr,"get_thr_ploidy\n"); - // initiate zero vector for histograme + // initiate zero vector for histograme for(i=0;i<100;i++)// { histAF[i]=0; } - //6 fields of input file,CSV reading - while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1)// && canWriteAF == 1) - // read the Read Title - { - if( n != NRID ) // incorrect format - { - firstSixAreOK = 0; - break; - } + //read main vcfq/a file: 6 fields of input file,CSV reading + while (fgets(line, line_size, extract_vcfFile)) + { + // read what is in this string=line + k = sscanf(line, "%d,%d,%d,%d,%d,%d", &pos,&D, &DP4[0], &DP4[1], &DP4[2], &DP4[3]); // && firstSixAreOK == 1) - count_beforeF++; - sumDbefore=sumDbefore+D; + if( k != NRID ) // incorrect format + { + printf("reading strange line= %d\n", k); + continue;//skips this line if not end + } + count_beforeF++; + sumDbefore=sumDbefore+D; - // f1 Josie : filter for DP4 separately for ref and alternative alleles + // f1 Josie : filter for DP4 separately for ref and alternative alleles if ( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA)// && D < 10)// 0 ref are ok { - // compute AAF histo AF = GetAf(DP4); histAF[AF-1]++; // for EACH POSITION, compute actual depths sDP4 after thrRA filtering: - - // average run depth after Q25, sum(DP4) across pos,= sumDP4 - count_afterF++; - sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3];// for one position - sumDP4=sumDP4+sDP4;//for av - sumDP4_2=sumDP4_2+sDP4*sDP4;//for std - - Q30frac=sDP4/D;// how actual quality Depth differs from default - - if( fprintf(filtered_vcfFile,"%d,%d,%d,%d,%d,%d\n",pos, D, DP4[0], DP4[1],DP4[2], DP4[3]) <= 0 ) { - canWriteDistrib = 0; - //"%d,%d,%d,%d,%d,%d", pos, D, DP4[0], DP4[1],DP4[2], DP4[3])) + // average run depth after Q25, sum(DP4) across pos,= sumDP4 + count_afterF++; + sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3];// for one position + sumDP4=sumDP4+sDP4;//for av + sumDP4_2=sumDP4_2+sDP4*sDP4;//for std + + //============== output3 filtered 1 1 vcfq file + if( fprintf(filtered_vcfFile,"%d,%d,%d,%d,%d,%d\n",pos,D, DP4[0], DP4[1],DP4[2], DP4[3]) <= 0 ) + { + canWriteDistrib = 0; } - }// end filter DP4 - - // removing space symbols before carriage return - do { - n = fgetc (extract_vcfFile); - } - while ((char)n != '\n'); + }// end filter f1 DP4 } //END of outer while loop for all vcfq file, distrAF is computed - // check ploidy - ind=GetMax(histAF); - printf("percent of max AAF= %d\n", ind); + fprintf(stderr,"count of good lines total before filter = %d\n", count_beforeF); + + // ----------- check ploidy detection + ind=GetMax(histAF);// percent giving max peak in hist <80% + fprintf(stderr,"percent of max AAF= %d\n", ind); + + ind50=GetMax50(histAF);//percent giving max peak in hist within (40,60)% + fprintf(stderr,"percent of max AAF around 50= %d\n", ind50);// - ind50=GetMax50(histAF); - printf("percent of max AAF around 50= %d\n", ind50); + ploid=ploidy(ind);// automated detection of ploidy: 1 or 2 + fprintf(stderr,"ploidy= %d\n", ploid); - ploid=ploidy(ind); - printf("ploidy= %d\n", ploid); - //====================== output2 write AF + //====================== output2 write AF for(i=0;i<101;i++) { if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { canWriteDistrib = 0; } - }// for i=100 cycle - // compute average depth : default and actual after 1,1 and q25 filtering - // stats after 1,1 filtering + // compute stats: average depth : default and actual after 1,1 and q25 filtering avDP4=ceil(sumDP4/count_afterF);//actual average cov after Filt - //stdDP4=ceil(sqrt(sumDP4_2/count_afterF));// why did I approximate this like that?... stdDP4=ceil(sqrt(sumDP4_2/count_afterF -avDP4*avDP4));// - avDbefore=sumDbefore/count_beforeF; perc_left=(100.0*count_afterF/count_beforeF); - printf("average Depth after filtering= %d\n", avDP4); - printf("std Depth appr= %d\n", stdDP4); - printf("percent data left after q25 & min_depth filtering thrR thrA = %.2f\n", perc_left); - // adjust actual depth to Bad (deep covered) regions and recommend thr RA + fprintf(stderr,"average Depth after filtering= %d\n", avDP4); + fprintf(stderr,"std Depth appr= %d\n", stdDP4); + fprintf(stderr,"percent data left after q25 & min_depth filtering thrR=1 thrA=1 = %.2f\n", perc_left); + + // adjust actual depth to Bad (deep covered) regions and recommend thr RA avDP41=avDP4; - ratio=(sqrt(sumDP4_2/count_afterF))/(sumDP4/count_afterF);//stdDP4/avDP4; - printf("cv actual depth = %.2f\n", ratio); - if ( ratio > 1.5) //bad regions - {avDP41=ceil(0.8*avDP4); - printf("moderately deep covered regions here\n"); - printf("adjusted(avDP4) = %d\n", avDP41); - } - if (ratio > 3) //very bad regions - {avDP41=ceil(0.7*avDP4); - printf("extremely deep covered regions here\n"); - printf("adjusted(avDP4) = %d\n", avDP41); + ratio=(sqrt(sumDP4_2/count_afterF-avDP4*avDP4))/(sumDP4/count_afterF);//stdDP4/avDP4; + fprintf(stderr,"cv actual depth = %.2f\n", ratio); + + if ( ratio > 1.5) //bad regions + { + avDP41=ceil(0.8*avDP4); + fprintf(stderr,"moderately deep covered regions here\n"); + fprintf(stderr,"adjusted(avDP4) = %d\n", avDP41); + } + + if (ratio > 3) //very bad regions + { + avDP41=ceil(0.7*avDP4); + fprintf(stderr,"extremely deep covered regions here\n"); + fprintf(stderr,"adjusted(avDP4) = %d\n", avDP41); } - // recommendation for thrRR AA + // recommendation for thrRR AA, depending on adjasted Depth if ( avDP41 > 0 && avDP41 < 4 ) - { thRR = 100;thAA=100;//stupidly high? + { + thRR = 100;thAA=100;//stupidly high? printf("better not to bother to look for mixture: too low actual coverage=%d\n",avDP4); } - if ( avDP41 >= 4 && avDP41 < 10 ) - { thRR = 1;thAA=1; + if ( avDP41 >= 4 && avDP41 < 10 ) + { + thRR = 1;thAA=1; } - if ( avDP41 >= 10 && avDP41 < 70 ) - { thRR = 2;thAA=2; + if ( avDP41 >= 10 && avDP41 < 70 ) + { + thRR = 2;thAA=2; } - if ( avDP41 >= 70 && avDP41 < 90 ) - { thRR = 3;thAA=2; + if ( avDP41 >= 70 && avDP41 < 90 ) + { + thRR = 3;thAA=2; } - if ( avDP41 >= 90 ) - { thRR = 3;thAA=3; - } - - // fill in the output files + if ( avDP41 >= 90 ) + { + thRR = 3;thAA=3; + } //====================== output1 write thrs, avD, stdD and ploidy fprintf(thrFile,"%d %d %d %d %d\n",thRR,thAA,avDP4,stdDP4,ploid); @@ -268,15 +284,15 @@ int main (int argc, char *argcv[]) { // checking write/read if( firstSixAreOK == 0)// || canWriteFilteredVCF == 0) { - printf ("Error during execution. Details: \n"); - printf ("\tfirstSixAreOK %d\n", firstSixAreOK); - printf ("\tcanWriteDistrib %d\n", canWriteDistrib); - printf ("\tcanWriteFilteredVCF %d\n", canWriteFilteredVCF); - printf ("Execution aborted\n"); - return -1; + fprintf(stderr,"Error during execution. Details: \n"); + fprintf(stderr,"\tfirstSixAreOK %d\n", firstSixAreOK); + fprintf(stderr,"\tcanWriteDistrib %d\n", canWriteDistrib); + fprintf(stderr,"\tcanWriteFilteredVCF %d\n", canWriteFilteredVCF); + fprintf(stderr,"Execution aborted\n"); + exit(EXIT_FAILURE); } - printf(" done get_thr_ploidy.\n"); + fprintf(stderr," done get_thr_ploidy.\n"); return 0; }//main @@ -351,3 +367,24 @@ int GetMax50 (int data[]) } return ind; } +//------------------------- +int let2int(char letter) +{ + + int result; + + //initiate + result=0; + + if (letter=='a' || letter=='A') + result=1; + if (letter=='c' || letter=='C') + result=2; + if (letter=='g' || letter=='G') + result=3; + if (letter=='t' || letter=='T') + result=4; + + return result; + } + From cbeb62fd8a06ea5120f043a53e754803da41bfb7 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Thu, 14 Apr 2016 14:41:13 +0100 Subject: [PATCH 12/17] introduced restrictions of empty/small vcfq file --- src/contamination/get_thr_ploidy.c | 34 ++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/src/contamination/get_thr_ploidy.c b/src/contamination/get_thr_ploidy.c index 7312bf1..98d6406 100755 --- a/src/contamination/get_thr_ploidy.c +++ b/src/contamination/get_thr_ploidy.c @@ -158,7 +158,13 @@ int main (int argc, char *argcv[]) { { histAF[i]=0; } - + // initialise values + thRR=0; + thAA=0; + avDP41=0; + avDP4=0; + stdDP4=0; + ploid=1; //read main vcfq/a file: 6 fields of input file,CSV reading while (fgets(line, line_size, extract_vcfFile)) @@ -201,6 +207,20 @@ int main (int argc, char *argcv[]) { } //END of outer while loop for all vcfq file, distrAF is computed fprintf(stderr,"count of good lines total before filter = %d\n", count_beforeF); + fprintf(stderr,"count of good lines total after filter = %d\n", count_afterF); + + //====================== output2 write AF + for(i=0;i<101;i++) + { + if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { + canWriteDistrib = 0; + } + }// for i=100 cycle + + // -----------------------if large enough vcfq file + if ( count_afterF >200 )// non empty, large enough after filtering vcfq + { + // ----------- check ploidy detection ind=GetMax(histAF);// percent giving max peak in hist <80% @@ -212,13 +232,7 @@ int main (int argc, char *argcv[]) { ploid=ploidy(ind);// automated detection of ploidy: 1 or 2 fprintf(stderr,"ploidy= %d\n", ploid); - //====================== output2 write AF - for(i=0;i<101;i++) - { - if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { - canWriteDistrib = 0; - } - }// for i=100 cycle + // compute stats: average depth : default and actual after 1,1 and q25 filtering @@ -273,8 +287,10 @@ int main (int argc, char *argcv[]) { thRR = 3;thAA=3; } + }// if large enough vcfq file + //====================== output1 write thrs, avD, stdD and ploidy - fprintf(thrFile,"%d %d %d %d %d\n",thRR,thAA,avDP4,stdDP4,ploid); + fprintf(thrFile,"%d %d %d %d %d\n",thRR,thAA,avDP41,stdDP4,ploid); fclose(extract_vcfFile); fclose(thrFile); From 329787db8872078656b288328e1c733b1530241e Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Thu, 14 Apr 2016 17:03:21 +0100 Subject: [PATCH 13/17] added <= for peak detection in get_mixture.c --- src/contamination/get_mixture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index 54c17a3..67bb619 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -473,7 +473,7 @@ void MixModeComplement(int mixc[], int stc[], int data[], int dist) k=0; for (i=n1; i0 ) && ((data[i+1]-data[i]) < 0))// && ( max_peak < data[i]))//here peak in [i] + if ( ((data[i]-data[i-1]) >=0 ) && ((data[i+1]-data[i]) < 0))// && ( max_peak < data[i]))//here peak in [i] { peak[k] = data[i]; From 82cd24daa84491fbfaf9699dfba26069ddd6affa Mon Sep 17 00:00:00 2001 From: Steven Leonard Date: Thu, 21 Apr 2016 12:05:38 +0100 Subject: [PATCH 14/17] added command lines args and generally cleaned up code --- src/contamination/get_mixture.c | 838 ++++++++++++++--------------- src/contamination/get_thr_ploidy.c | 594 ++++++++++---------- 2 files changed, 686 insertions(+), 746 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index 67bb619..c856d5d 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -15,35 +15,6 @@ * Exported functions: * HISTORY: -computes AF (alternative to Ref allele frequency) for each error base call from extracted vcf - -applies minimum depth threshold, both for Ref and Alternative alleles fwd rev (total min depth will be -4*MIN_DEPTH - -input1 thrfile (thRR thAA mu std ploidy ) - 3 2 82 110 2 - -input2: extracted vcf: pos, D , DP4 (DP4 is four fields) - - -output1: mix= -mixture mode1 likelihood1 -12 0.87 -mixture mode2 likelihood2 -12 0.87 -mixture mode3 likelihood3 -12 0.87 - -output2: .distrAF=histogram of AF with significance for each bin -% count -8 0 -9 0 -10 385 -11 780 - - -usage:./get_mixture name.thr extracted.vcf name.mix name.distrAF - */ #include @@ -52,493 +23,500 @@ usage:./get_mixture name.thr extracted.vcf name.mix name.distrAF #include #include #include +#include // ******** predefined user constants, - -#define NRID 6 // Number of variant ids=fields: pos, chromosome, Depth,DP4,(no PV4+maf) in esigtracted_vcf files -#define NPAR 5 // masig Number of PARameters and arguments -#define Nthr 5 // Number of thresholds and muD1 stdD1 12 June + ploidy=1 (haploid) (or 2-diploid) +#define NPAR 4 // Number of arguments +#define NTHR 5 // Number of fields (thR, thA, mu, std and ploidy (1=haploid 2=diploid)) +#define NRID 6 // Number of fields (pos,Depth,DP4) in extracted_vcf files +#define MIN_COUNT_AFTERF 200 // Minimum variant count after filtering +#define DEFAULT_DIST 11 // Default bin width for hist to compute skewness // ******** declarations of functions -int GetAf (int []);// computes AF percentage for a variant position -float GetMu( int data[]); -float GetStd( int data[]); - +void usage(int code); // usage +int GetAf (int []);// computes AF percentage for a variant position +float GetMu( int data[]); +float GetStd( int data[]); void skewnesses (int data[], float sk[], float skc[], int st[], int stc[]); void MixMode(int mix[], int st[], int data[], int dist); void MixModeComplement(int mixc[], int stc[], int data[], int dist); int GetMaxPeakInterval(int data[], int n1, int n2);//------max peak from the interval void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4); -//=====================================================MAIN - -int main (int argc, char *argcv[]) { +//////////////////////////////////////////////////// +// main +//////////////////////////////////////////////////// +int main (int argc, char *argv[]) { - // ---------------------flags - int firstSixAreOK = 1; - int canWriteMix = 1; - int canWriteDistrib = 1; + // verbose mode + int verbose = 0; - FILE *thrFile, *extract_vcfFile, *mixFile, *distribFile ; //file handles + // file names + char *thr_file, *extract_vcf_file, *mix_file, *distrib_file; - int n,i; + // file handles + FILE *thrFile, *extract_vcfFile, *mixFile, *distribFile; - // ---------- params from input1 rams(now four+1 values thrR thrA muD stdD ploidy) - int muD,stdD; - int thR,thA, ploid,plo; + // values in threshold file + int muD, stdD; + int thR, thA, ploid; + // to read vcfa/vcfq + static const int line_size = 8192; // maximum line size + char line[line_size]; - //---------------params from input2 vcfq - int DP4[4];// - int D,pos;// Depth, genome pos of error + // values in vcfq file + int DP4[4]; + int pos, D; - //----------stats to compute - int avDP4;// of actual sDP4 depth after RA & bad regions filtering - int sumDP4=0;//across all pos for mu std - int sumDbefore=0; + int i; - float sDP4; - float filtered_left; + // stats to compute + float sumDP4 ;//across all pos for mu std + int avDP4; // of actual sDP4 depth after RA & bad regions filtering - // ----------------settings for skewnesses and their likelihoods: hardcoded within functions + float filtered_left; - int dist;//distance , bin width for hist to compute skewness + // settings for skewnesses and their likelihoods: hardcoded within functions - //---------------results to compute: - int count_afterF=0;// after filtering with min depths - int count_beforeF=0;// before filtering with min depths + int dist = DEFAULT_DIST; // bin width for hist to compute skewness - int AF; + //---------------results to compute: + int count_beforeF; // variant count before filtering + int count_afterF; // variant count after filtering - //---------------------------arrays - int histAF[100];// to store mafs percentages - int st[4], stc [4];// starts of summing intervals for histo - float sk[3],skc[3];// three skewnesses for mode0,1,2 - int mix[3],mixc[3]; //mixtures three modes and their complements - float Lik[3];// likelihood or confidences three modes + //---------------------------arrays + int histAF[100]; // to store mafs percentages + int st[4], stc [4]; // starts of summing intervals for histo + float sk[3], skc[3]; // three skewnesses for mode0,1,2 + int mix[3], mixc[3]; // mixtures three modes and their complements + float Lik[3]; // likelihood or confidences three modes - //-----------------------------------------open files for read/write + static struct option long_options[] = + { {"verbose", 0, 0, 'v'}, + {"help", 0, 0, 'h'}, + {0, 0, 0, 0} + }; - if(argc < NPAR)//four input_output files are submitted - { - fprintf(stderr, "not enough of parms |input_output files\n"); - fprintf(stderr, "usage:./get_mixture input1.thr input2.vcfq output1.mix output2.distr\n"); - exit(EXIT_FAILURE); + char c; + while ( (c = getopt_long(argc, argv, "r:a:vh?", long_options, 0)) != -1) { + switch (c) { + case 'v': verbose = 1; break; + case 'h': + case '?': usage(0); break; + default: fprintf(stderr, "ERROR: Unknown option %c\n", c); + usage(1); + break; + } } - thrFile=fopen(argcv[1],"r"); - if (thrFile == NULL) { - fprintf(stderr, "cannot open first input _threshold file %s: %s\n", argcv[1], strerror(errno)); - exit(EXIT_FAILURE); + if ( (argc-optind) < NPAR) { + // not enough parameters + usage(-1); + } else { + thr_file = argv[optind+0]; + extract_vcf_file = argv[optind+1]; + mix_file = argv[optind+2]; + distrib_file = argv[optind+3]; } - - extract_vcfFile=fopen(argcv[2],"r"); - if (extract_vcfFile == NULL) { - fprintf(stderr, "cannot open first input _.vcfq file %s: %s\n", argcv[2], strerror(errno)); - exit(EXIT_FAILURE); + // open files for read/write + thrFile = fopen(thr_file,"r"); + if (thrFile == NULL) { + fprintf(stderr, "cannot open threshold_file %s: %s\n", thr_file, strerror(errno)); + exit(EXIT_FAILURE); + } + extract_vcfFile = fopen(extract_vcf_file,"r"); + if (extract_vcfFile == NULL) { + fprintf(stderr, "cannot open filtered_vcf_file %s: %s\n", extract_vcf_file, strerror(errno)); + exit(EXIT_FAILURE); + } + mixFile = fopen(mix_file,"w"); + if (mixFile == NULL) { + fprintf(stderr, "cannot open mixture_file %s: %s\n", mix_file, strerror(errno)); + exit(EXIT_FAILURE); } + distribFile = fopen(distrib_file,"w"); + if (distribFile == NULL) { + fprintf(stderr, "cannot open distribution_file %s: %s\n", distrib_file, strerror(errno)); + exit(EXIT_FAILURE); + } + + fprintf(stderr, "get_mixture\n"); - mixFile=fopen(argcv[3],"w"); - if (mixFile == NULL) { - fprintf(stderr, "cannot open first output mix_conf file %s for writing: %s\n", argcv[3], strerror(errno)); - exit(EXIT_FAILURE); + // initialise AAF histogram + for (i=0;i<100;i++) { + histAF[i] = 0; } - distribFile=fopen(argcv[4],"w"); - if (distribFile == NULL) { - fprintf(stderr, "cannot open second output distAF file %s for writing: %s\n", argcv[4], strerror(errno)); + // read thr file - 5 fields theshold ref, threshold alt, mean depath, stddev depth and ploidy + while (fgets(line, line_size, thrFile)) { + int k = sscanf(line, "%d %d %d %d %d", &thR, &thA, &muD, &stdD, &ploid); + if (k != NTHR) { + // number of fields read not correct + fprintf (stderr, "corrupt threshold file %s\n", line); exit(EXIT_FAILURE); + } } - - - fprintf(stderr, "get_mixture: parameters\n"); - - - // ===SETTINGS parameters for skewness measurements and peak consideretion: if Diploid then more noise around 50% - - plo=0;// initial ploidy - dist=11; - avDP4 = 0;//initial average depth - - // -----------initiate zero vector for histograme and its peaks - for(i=0;i<100;i++)// - { - histAF[i]=0; - } - - //--------------starts for skews and complement skews bins - for(i=0;i<4;i++)// - { - st[i]=0; - stc[i]=0; - } - // ------------initial skwnesses, mix, conf for modes 0,1,2 - for(i=0;i<3;i++)// - { - sk[i]=0.0; - skc[i]=0.0; - mix[i]=0; - mixc[i]=0; - Lik[i]=0.0; - } - - //1. scan-read thr file and assign current precomputed thresholds mu sd ploidy for filtering - while( (n = fscanf(thrFile,"%d %d %d %d %d", &thR, &thA,&muD, &stdD,&ploid)) >= 0) - { - if (n != Nthr) // incorrect format input thr file - { - fprintf (stderr, "corrupted input thrFile format\n"); - exit(EXIT_FAILURE); - } - - plo=ploid; - } - fprintf(stderr, "ploidy= %d\n", plo); - - - //2. scan-read main vcf file:6 fields of input2 file // no PV4: do it to compute AAF=AF - - while( (n = fscanf(extract_vcfFile,"%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3])) >= 0 && firstSixAreOK == 1 && canWriteMix == 1) - { - if( n != NRID ) // incorrect format vcfq file - { - fprintf(stderr,"reading strange line= %d\n", n); - continue;//skips this line if not end - } - count_beforeF++; - sumDbefore=sumDbefore+D; - - // filter for DP4 separately for ref and alternative alleles+ filter abnormal large Depth - if ( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA && D < (muD+2*stdD))// 0 ref alt are ok - { - AF = GetAf(DP4); - histAF[AF-1]++; - - // for EACH POSITION, compute depths after thrRA and stupid Depth filtering: - // average run depth after Q30, sum(DP4) across pos,= sumDP4 - count_afterF++; - sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3]; - sumDP4=sumDP4+sDP4; - }// end filters and AAF computing for one line DP4 - - //? -------------output stupidly deep positions if (D >= (msD[0]+msD[1])) { fill the file - - // removing space symbols before carriage return - do { - n = fgetc (extract_vcfFile); + fprintf(stderr, "ploidy= %d\n", ploid); + + // initialise counts + count_beforeF = 0; + count_afterF = 0; + + // initialise sums + sumDP4 = 0; + + // read vcfq file - 6 fields position, depth and 4xdepth (ref/forward, ref/reverse, alt/forward and alt/reverse) + while (fgets(line, line_size, extract_vcfFile)) { + int k = sscanf(line, "%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3]); + if (k != NRID) { + // number of fields read not correct + fprintf(stderr, "skipping malformed VCF line %s\n", line); + continue; } - while ((char)n != '\n'); + count_beforeF++; - } //END of while loop across vcfq file AF is computed! + // filter for DP4 separately for ref and alternative alleles and abnormaly large Depth + if( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA && D < (muD+2*stdD) ) { + int AF; + float sDP4; + count_afterF++; - fprintf(stderr,"count after filtering = %d\n",count_afterF); + // compute AF and update AAF histo + AF = GetAf(DP4); + histAF[AF-1]++; - if ( count_afterF >200 )// non empty, large enough after filtering vcfq - { - //=======================================analysis of AF distribution to find mixture sample - - // stats after filtering :compute average depth :default and after filtering - - avDP4=ceil(sumDP4/count_afterF);//actual average cov after Filt - filtered_left=(100.0*count_afterF/count_beforeF);// percentage of filteerd out regions ( ? do we need?) - fprintf(stderr,"percentage left after filtering = %.2f\n",filtered_left); - fprintf(stderr,"coverage after filtering = %d\n",avDP4); - - //skewnesses (histAF, sk, skc); - skewnesses (histAF, sk, skc, st,stc); - for (i=0;i<3;i++) - fprintf(stderr,"sk = %.2f\n",sk[i]); - for (i=0;i<3;i++) - fprintf(stderr,"skc = %.2f\n",skc[i]); - - // mixtures per mode; confidences per mixture - - fprintf(stderr,"Results:\n"); - MixMode(mix, st, histAF,dist); - MixModeComplement(mixc, stc, histAF,dist); - Likely (Lik, sk,skc, mixc,avDP4); - - }// if not empty data + // calc sDP4 and update sumDP4 + sDP4 = DP4[0] + DP4[1] + DP4[2] + DP4[3]; + sumDP4 = sumDP4 + sDP4; + } + } -//==========================================================OUTPUTS - //====================== output2 write AF - for(i=0;i<101;i++) - { - if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { - canWriteDistrib = 0; - } + fprintf(stderr, "count of variants before filtering = %d\n", count_beforeF); + fprintf(stderr, "count of variants after filtering = %d\n", count_afterF); - }// for i=100 cycle + // initialise interval sums + for (i=0;i<4;i++) { + st[i] = 0; + stc[i] = 0; + } + // initialise skwnesses, mix, conf for modes 0,1,2 + for (i=0;i<3;i++) { + sk[i] = 0.0; + skc[i] = 0.0; + mix[i] = 0; + mixc[i] = 0; + Lik[i] = 0.0; + } - //=================MAIN output1 mix file + // initialise variables + avDP4 = 0; // average depth + + // do we have enough data ? + if (count_afterF < MIN_COUNT_AFTERF) { + fprintf(stderr, "Not enough variants after filtering, count_afterF < %d\n", MIN_COUNT_AFTERF); + + } else { + // stats after filtering + avDP4 = ceil(sumDP4 / count_afterF); // average depth after filtering + filtered_left = (100.0 * count_afterF / count_beforeF); // percentage left after filtering + fprintf(stderr, "percentage left after filtering = %.2f\n", filtered_left); + fprintf(stderr, "coverage after filtering = %d\n", avDP4); + + // calculate skewness + skewnesses(histAF, sk, skc, st, stc); + + // mixtures per mode; confidences per mixture + MixMode(mix, st, histAF, dist); + MixModeComplement(mixc, stc, histAF, dist); + Likely(Lik, sk, skc, mixc, avDP4); + + if (verbose) { + fprintf(stderr, "Results:\n"); + for (i=0;i<3;i++) { + fprintf(stderr, "sk = %.2f\n", sk[i]); + } + for (i=0;i<3;i++) { + fprintf(stderr, "skc = %.2f\n", skc[i]); + } + for (i=0;i<3;i++) { + fprintf(stderr, "Mode %d: likelihood of mix = %.2f\n", i, Lik[i]); + } + } + } - if (fprintf(mixFile,"mix low freq=%d\nconfidence low freq= %.4f\nmix middle freq= %d\nconfidence middle freq=%.4f\nmix high freq= %d\nconfidence high freq=%.4f\nAvActDepth=%d\nmin_depthR=%d\nmin_depthA=%d\n", mix[0],Lik[0],mix[1],Lik[1],mix[2],Lik[2],avDP4,thR,thA)<= 0 ) - { - canWriteMix = 0; + // write distribution file + for (i=0;i<101;i++) { + if (fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0) { + fprintf(stderr, "error writing distribution_file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + } + + // write mixture file + if (fprintf(mixFile,"mix low freq=%d\nconfidence low freq=%.4f\n", mix[0], Lik[0]) <= 0 ) { + fprintf(stderr, "error writing mixture_file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + if (fprintf(mixFile,"mix middle freq= %d\nconfidence middle freq=%.4f\n",mix[1], Lik[1]) <= 0 ) { + fprintf(stderr, "error writing mixture_file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + if (fprintf(mixFile,"mix high freq= %d\nconfidence high freq=%.4f\n", mix[2], Lik[2]) <= 0 ) { + fprintf(stderr, "error writing mixture_file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + if (fprintf(mixFile,"AvActDepth=%d\nmin_depthR=%d\nmin_depthA=%d\n", avDP4, thR, thA) <= 0 ) { + fprintf(stderr, "error writing mixture_file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } - } + // close files + fclose(thrFile); fclose(distribFile); fclose(extract_vcfFile); fclose(mixFile); - if( firstSixAreOK == 0 || - canWriteMix == 0 || canWriteDistrib ==0) { - printf ("Error during execution. Details: \n"); - printf ("\tfirstSixAreOK %d\n", firstSixAreOK); - printf ("\tcanWriteMix %d\n", canWriteMix); - printf ("\tcanWriteDistrib %d\n", canWriteDistrib); - printf ("Execution aborted\n"); - exit(EXIT_FAILURE); - } - - fprintf(stderr," get mixture is performed.\n"); + fprintf(stderr, "done get mixture.\n"); return 0; -}//main +} -/***************************************************** Functions******************************/ +//////////////////////////////////////////////////// +// usage +//////////////////////////////////////////////////// +void usage(int code) +{ + FILE *usagefp = stderr; + + fprintf(usagefp, "get_mixture\n\n"); + fprintf(usagefp, + "Usage: get_mixture [options] threshold_file filtered_vcf_file mixture_file distribution_file\n" + "\n" " applies minimum depth thresholds for reference and alternative alleles, calculates a histogram of allele frequencies and estimates mixture likelihoods\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " options:\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " -v verbose\n"); + fprintf(usagefp, " default false\n"); + fprintf(usagefp, "\n"); + + exit(code); +} - //////////////////////////////////////////////////// +//////////////////////////////////////////////////// // Calculates AF=alternative allele frequency from data=DP4 counts // input - array (4-vector) of DP4 counts -//output -one float number from (0,1) +// output -index into array of percentages 0,..,100 //////////////////////////////////////////////////// int GetAf (int data[]) { - float Isum,AF1; - int i,AF; + float sum, AF1; + int i, AF; - Isum = data[0]; - for (i=1; i<4; i++)//there are four conts for each base - { - Isum += data[i]; + sum = data[0]; + for (i=1;i<4;i++) { + sum += data[i]; } - AF1=(data[2]+data[3])/Isum; - AF=(long) (100*AF1+0.5); + AF1 = (data[2] + data[3]) / sum; + AF = (int) (100*AF1 + 0.5); + return AF; } + +//////////////////////////////////////////////////// // GetMu.c calculates mu for all non-zero elements of 100-vector +//////////////////////////////////////////////////// float GetMu( int data[]) - { - float mu,Isum; - int i; - int cnz=0; - - Isum = data[0]; - for (i=1; i<101; i++) - { - if (data[i] > 0) - { - cnz++; - Isum += data[i]; - } - } - mu=Isum/cnz; - - return mu; +{ + float mu, sum; + int i; + int cnz = 0; + + sum = data[0]; + for (i=1;i<101;i++) { + if (data[i] > 0) { + cnz++; + sum += data[i]; + } + } + mu = sum / cnz; + + return mu; } -// ========================stand deviation +//////////////////////////////////////////////////// +// standard deviation +//////////////////////////////////////////////////// float GetStd( int data[]) - { - float mu,std,Isum,Isum2; - int i; - int cnz=0; - - Isum = data[0]; - Isum2 = data[0]*data[0]; - for (i=1; i<101; i++) - { - if (data[i] > 0) - { - cnz++; - Isum += data[i]; - Isum2 += data[i]*data[i]; - } - } - mu=Isum/cnz; - std=sqrt(Isum2/cnz-mu*mu); - return std; +{ + float mu, std, sum, sum2; + int i; + int cnz = 0; + + sum = data[0]; + sum2 = data[0] * data[0]; + for (i=1;i<101;i++) { + if (data[i] > 0) { + cnz++; + sum += data[i]; + sum2 += data[i] * data[i]; + } + } + mu = sum / cnz; + std = sqrt( (sum2 / cnz) - (mu * mu) ); + return std; } -//1============================new skewnesses for three modes +//////////////////////////////////////////////////// +// new skewnesses for three modes +//////////////////////////////////////////////////// void skewnesses (int data[], float sk[], float skc[], int st[], int stc[]) { + int i, j, d, n1, n2; + float s[4]; + float sc[4]; + + d = DEFAULT_DIST; + + //1. starts, starts complementary + st[0] = 5; + stc[0] = 51; + for (i=1;i<4;i++) { + st[i] = st[i-1]+d; + stc[i] = stc[i-1]+d; + } - int i,j,d,n1,n2; - float s[4];//sums - float sc[4]; - - d=11;// - - //1. starts, starts complementary - st[0]=5; - stc[0]=51; - for (i=1;i<4;i++) - { - st[i]=st[i-1]+d; - stc[i]=stc[i-1]+d; - //printf(" starts=%d\n",st[i]); - //printf(" starts com=%d\n",stc[i]); - } - - //2. sums between starts: 3 sums and 3 sums complementary - - for(i=0;i < 4;i++) - { - n1=st[i]; - n2=stc[i]; - s[i]=0; - sc[i]=0; - for (j=n1;j0) sk[i]=s[i]/s[i+1]; - if (sc[i]>0) skc[i]=sc[i+1]/sc[i]; - } - + //2. sums between starts: 3 sums and 3 sums complementary + for(i=0;i<4;i++) { + n1 = st[i]; + n2 = stc[i]; + s[i] = 0; + sc[i] = 0; + for (j=n1;j0) { + sk[i]=s[i]/s[i+1]; + } + if (sc[i]>0) { + skc[i]=sc[i+1]/sc[i]; + } + } } -//2============================================ - +//////////////////////////////////////////////////// +// mixture mode +//////////////////////////////////////////////////// void MixMode(int mix[], int st[], int data[], int dist) - { - int i; - int n1,n2; - int m; - - for (i=0;i< 3;i++) - { - n1=st[i]; - n2=n1+dist; - m=GetMaxPeakInterval(data, n1, n2);//2.1 function - mix[i]=m; - printf(" mixes in modes=%d\n",mix[i]); - } - + int i; + int n1, n2; + int m; + + for (i=0;i<3;i++) { + n1 = st[i]; + n2 = n1 + dist; + m = GetMaxPeakInterval(data, n1, n2); + mix[i] = m; + } } -//2.1.1============================================ +//////////////////////////////////////////////////// +// mixture mode complement +//////////////////////////////////////////////////// void MixModeComplement(int mixc[], int stc[], int data[], int dist) +{ + int i; + int n1, n2; + int m; + + for (i=0;i<3;i++) { + n1 = stc[i+1]; + n2 = n1 + dist; + m = GetMaxPeakInterval(data, n1, n2); + mixc[i] = m; + } +} +//////////////////////////////////////////////////// +// max peak from the interval +//////////////////////////////////////////////////// +int GetMaxPeakInterval(int data[], int n1, int n2) { - int i; - int n1,n2; - int m; - - for (i=0;i< 3;i++) - { - n1=stc[i+1]; - n2=n1+dist; - m=GetMaxPeakInterval(data, n1, n2);//2.1 function - mixc[i]=m; - printf(" mixes in modes=%d\n",mixc[i]); - } + // input data = AAF + int max_peak; + int i, mixx, k, j; + int peak[100], perc_peak[100]; + + // initialise mix, in case no peaks exist + mixx = 0.0; + + // find max peak among peaks in the interval only, data is maf(AAF) vector + + // what if no peaks? + peak[0] = 0; + perc_peak[0] = 0; + + // find peaks + k = 0; + for (i=n1;i=0 ) && ((data[i+1]-data[i]) < 0)) { + peak[k] = data[i]; + perc_peak[k] = i; + k++; + } + } + + if (k == 1) { + // one peak + max_peak = peak[0]; + mixx = perc_peak[0]; + } else if (k > 1) { + // more than one peak + max_peak = peak[0]; + for (j=1;j=0 ) && ((data[i+1]-data[i]) < 0))// && ( max_peak < data[i]))//here peak in [i] - { - - peak[k] = data[i]; - perc_peak[k]=i; - //printf("peaks= %d\n", peak[k]); - //printf("peaks percent= %d\n", perc_peak[k]); - k=k+1; - } - } -// found max peak/s if any exist - -//if one peak - if (k==1) - { - max_peak = peak[0]; - mixx=perc_peak[0]; - // printf("Mode i:max peak = %d\n", max_peak); - // printf("Mode i: percentage giving this max = %d\n", mixx); - } - - if (k>1)//more than one peak - { - max_peak = peak[0]; - - for (j=1; j0)-exist any peak <25% - - return mixx; - -}//================================================================== - -//--confidences modes0,1,2 + +//////////////////////////////////////////////////// +// confidences modes 0,1,2 +//////////////////////////////////////////////////// void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4) { - int i; - float l,lc,lp; - - for (i=0;i<3;i++) - { - lp=0.0; - if (mixc[i]>0) - lp=skc[2-i]; - l=sk[i]; - lc=skc[2-i]; - Lik[i]=(1-1/avDP4)*(l+lc+lp)/3; - printf("Mode i: likelihood of mix = %.2f\n", Lik[i]); + int i; + float l, lc, lp; -} + for (i=0;i<3;i++) { + lp = 0.0; + if (mixc[i]>0) { + lp = skc[2-i]; + } + l = sk[i]; + lc = skc[2-i]; + Lik[i] = (1-1/avDP4) * (l+lc+lp) / 3; + } } diff --git a/src/contamination/get_thr_ploidy.c b/src/contamination/get_thr_ploidy.c index 98d6406..9a6ac7d 100755 --- a/src/contamination/get_thr_ploidy.c +++ b/src/contamination/get_thr_ploidy.c @@ -26,381 +26,343 @@ * Exported functions: * HISTORY: -computes minimum depth threshold, both for Ref and Alternative alleles fwd rev (total min depth will be -4*MIN_DEPTH - - -input: MIN_DEPTH_R(integer)=1[default] MIN_DEPTH_A(integer)=1[default] extracted=vcfq, - -output1: -thrR thrA -3 1 -output2: .distAF_1_1 -output3: .vcfqF_1_1 filtered 1,1 - -usage:./get_thrRA MIN_DEPTH_R(integer) MIN_DEPTH_A(integer) name.vcf ra_i_j.thr(Depth) name.distr name.vcfqF_1_1 - */ - #include #include #include #include #include #include - +#include // ******** predefined user constants, - -#define NRID 6 // Number of variant ids=fields: pos,Depth,DP4,(no PV4+maf) in extracted_vcf files -#define NPAR 7 // Number of PARameters and arguments - - -// ********* Global variables min_depths for Ref and Alternative alleles -int thR = 0; -int thA = 0; +#define NPAR 3 // Number of arguments +#define THR 1 // Default reference allele threshold +#define THA 1 // Default alternative allele threshold +#define NRID 6 // Number of fields (pos,Depth,DP4) in extracted_vcf files +#define MIN_COUNT_AFTERF 200 // Minimum variant count after filtering // ******** declarations of functions +void usage(int code); // usage +int GetAf (int []); // computes AF percent for a variant position +int GetMax (int [], int i1, int i2); // percent giving max AAF +int ploidy (int ind); // defines ploidy from results of GetMax ploid=2 if ~50% -int GetAf (int []);// computes AF percent for a variant position -int GetMax (int []);// percent giving max AAF -int ploidy (int ind);// defines ploidy from results of GetMax ploid=2 if ~50% -int GetMax50 (int data[]);// local max around 50% -int let2int(char letter); +//////////////////////////////////////////////////// +// main +//////////////////////////////////////////////////// +int main (int argc, char *argv[]) { + // min_depths for Ref and Alternative alleles + int thR = THR; + int thA = THA; -int main (int argc, char *argcv[]) { + // verbose mode + int verbose = 0; - // -----error flags - int firstSixAreOK = 1; - int canWriteDistrib = 1; - int canWriteFilteredVCF = 1; + // file names + char *extract_vcf_file, *thr_file, *filtered_vcf_file; - FILE *extract_vcfFile, *thrFile, *distribFile, *filtered_vcfFile; //file handles + // file handles + FILE *extract_vcfFile, *thrFile, *filtered_vcfFile; - // ---------------to read vcfa/vcfq - static const int line_size = 8192;//stupidly large + // to read vcfq + static const int line_size = 8192; // maximum line size char line[line_size]; - int k,i,ind,ploid, ind50;// - int count_afterF=0;// after filtering with min depths - int count_beforeF=0;// before filtering with min depths + // values in vcfq file + int DP4[4]; // 4xdepths + int pos, D; // genome pos of error, depth + + int count_beforeF; // variant count before filtering + int count_afterF; // variant count after filtering + + // arrays + int histAF[100]; // maf percentages + + // stats + float sumDP4, sumDP4_2; // sum of depths and sum of square depths used for mu/std calculation + + // to compute for outputs + int ploid; // polidy + int avDP4, stdDP4; // mean and std of depth after filtering + int avDP41; // adjusted mean depth + int thRR, thAA; // estimated thresholds based on actual depths avDP4 and stdDP4 + + int i; // array index + + static struct option long_options[] = + { {"thR", 1, 0, 'r'}, + {"thA", 1, 0, 'a'}, + {"verbose", 0, 0, 'v'}, + {"help", 0, 0, 'h'}, + {0, 0, 0, 0} + }; + + char c; + while ( (c = getopt_long(argc, argv, "r:a:vh?", long_options, 0)) != -1) { + switch (c) { + case 'r': thR = atoi(optarg); break; + case 'a': thA = atoi(optarg); break; + case 'v': verbose = 1; break; + case 'h': + case '?': usage(0); break; + default: fprintf(stderr, "ERROR: Unknown option %c\n", c); + usage(1); + break; + } + } - //----------from vcfa/vcfq input - int DP4[4];// - int pos,D;// Depth, genome pos of error + if ( (argc-optind) < NPAR) { + // not enough parameters + usage(-1); + } else { + extract_vcf_file = argv[optind+0]; + thr_file = argv[optind+1]; + filtered_vcf_file = argv[optind+2]; + } - //--------------arrays - int histAF[100];// to store mafs percentages + // open files for read/write + extract_vcfFile = fopen(extract_vcf_file,"r"); + if (extract_vcfFile == NULL) { + fprintf(stderr, "cannot open input_vcf_file %s: %s\n", extract_vcf_file, strerror(errno)); + exit(EXIT_FAILURE); + } + thrFile = fopen(thr_file,"w"); + if (thrFile == NULL) { + fprintf(stderr, "cannot open threshold_file %s: %s\n", thr_file, strerror(errno)); + exit(EXIT_FAILURE); + } + filtered_vcfFile = fopen(filtered_vcf_file,"w"); + if (filtered_vcfFile == NULL) { + fprintf(stderr, "cannot open filtered_vcf_file %s: %s\n", filtered_vcf_file, strerror(errno)); + exit(EXIT_FAILURE); + } + fprintf(stderr, "get_thr_ploidy\n"); - //-----------------stats - int sumDP4=0,sumDP4_2=0;//across all pos for mu std - int sumDbefore=0; - float sDP4; - float perc_left; + // initialise AAF histogram + for (i=0;i<100;i++) { + histAF[i] = 0; + } + + // initialise counts + count_beforeF = 0; + count_afterF = 0; + + // initialise sums + sumDP4 = 0; + sumDP4_2 = 0; + + // read vcfq file - 6 fields position, depth and 4xdepths (ref/forward, ref/reverse, alt/forward and alt/reverse) + while (fgets(line, line_size, extract_vcfFile)) { + int k = sscanf(line, "%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3]); + if (k != NRID) { + // number of fields read not correct + fprintf(stderr, "skipping malformed VCF line %s\n", line); + } + count_beforeF++; - //-----------to compute fot outputs + // filter for DP4 separately for ref and alternative alleles + if (DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA) { + int AF; + float sDP4; - int AF; - int avDP4,stdDP4,avDP41;// of actual sDP4 depth after 1 1 filtering - int thRR, thAA;// recommended based on actual depth avDP4 and stdDP4 - float ratio; + count_afterF++; + // compute AF and update histo + AF = GetAf(DP4); + histAF[AF-1]++; -//--------------------------opening - if(argc < NPAR)//five input_output files are submitted - { - fprintf(stderr, "not enough of parms |input_output files\n"); - fprintf(stderr, "usage:./get_thr thrR thrA input2 output1 output2 output3\n"); - exit(EXIT_FAILURE); + // calc sDP4 and update sumDP4, sumDP4_2 + sDP4 = DP4[0] + DP4[1] + DP4[2] + DP4[3]; + sumDP4 += sDP4; + sumDP4_2 += sDP4*sDP4; + + // write to filtered vcf file + if (fprintf(filtered_vcfFile,"%d,%d,%d,%d,%d,%d\n",pos,D, DP4[0], DP4[1],DP4[2], DP4[3]) <= 0) { + fprintf(stderr, "error writing filteredVCF file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + } } - if (sscanf(argcv[1],"%d",&thR) == EOF) { - fprintf(stderr, "Failed to convert the first argument %s to integer\n", argcv[1]); - } + fprintf(stderr, "count of variants before filtering = %d\n", count_beforeF); + fprintf(stderr, "count of variants after filtering = %d\n", count_afterF); - if (sscanf(argcv[2],"%d",&thA) == EOF) { - fprintf(stderr, "Failed to convert the second argument %s to integer\n", argcv[2]); - } + // initialise values + avDP4 = 0; // mean depth + stdDP4 = 0; // stddev depth + ploid = 1; // ploidy + avDP41 = 0; // adjustered mean depth + thRR = 0; // estimated reference threshold + thAA = 0; // estimated alternative threshold - extract_vcfFile=fopen(argcv[3],"r"); - if (extract_vcfFile == NULL) { - fprintf(stderr, "cannot open first input _vcf.txt file %s: %s\n", argcv[3], strerror(errno)); - exit(EXIT_FAILURE); - } + // do we have enough data ? + if (count_afterF < MIN_COUNT_AFTERF) { + fprintf(stderr, "Not enough variants after filtering, count_afterF < %d\n", MIN_COUNT_AFTERF); - // -----------------------three outputs - - thrFile=fopen(argcv[4],"w"); - if (thrFile == NULL) { - fprintf(stderr, "cannot open first output file thrD %s: %s\n", argcv[4], strerror(errno)); - exit(EXIT_FAILURE); - } - - distribFile=fopen(argcv[5],"w"); - if (distribFile == NULL) { - fprintf(stderr, "cannot open second output distAF file %s: %s\n", argcv[5], strerror(errno)); - exit(EXIT_FAILURE); - } + } else { + int ind, ind50; + float perc_left; + float ratio; - filtered_vcfFile=fopen(argcv[6],"w"); - if (filtered_vcfFile == NULL) { - fprintf(stderr, "cannot open third output filteredVCF file %s: %s\n", argcv[6], strerror(errno)); - exit(EXIT_FAILURE); - } + // ploidy detection + ind = GetMax(histAF, 1, 80);// percent giving max peak in hist in interval (1,80)% + fprintf(stderr, "percent of max AAF= %d\n", ind); - fprintf(stderr,"get_thr_ploidy\n"); + ind50 = GetMax(histAF, 40, 60);//percent giving max peak in hist in interval (40,60)% + fprintf(stderr, "percent of max AAF around 50= %d\n", ind50);// - // initiate zero vector for histograme - for(i=0;i<100;i++)// - { - histAF[i]=0; - } - // initialise values - thRR=0; - thAA=0; - avDP41=0; - avDP4=0; - stdDP4=0; - ploid=1; - - //read main vcfq/a file: 6 fields of input file,CSV reading - while (fgets(line, line_size, extract_vcfFile)) - { - // read what is in this string=line - k = sscanf(line, "%d,%d,%d,%d,%d,%d", &pos,&D, &DP4[0], &DP4[1], &DP4[2], &DP4[3]); // && firstSixAreOK == 1) - - if( k != NRID ) // incorrect format - { - printf("reading strange line= %d\n", k); - continue;//skips this line if not end - } - - count_beforeF++; - sumDbefore=sumDbefore+D; - - // f1 Josie : filter for DP4 separately for ref and alternative alleles - - if ( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA)// && D < 10)// 0 ref are ok - { - // compute AAF histo - AF = GetAf(DP4); - histAF[AF-1]++; - - // for EACH POSITION, compute actual depths sDP4 after thrRA filtering: - // average run depth after Q25, sum(DP4) across pos,= sumDP4 - count_afterF++; - sDP4=DP4[0]+DP4[1]+DP4[2]+DP4[3];// for one position - sumDP4=sumDP4+sDP4;//for av - sumDP4_2=sumDP4_2+sDP4*sDP4;//for std - - //============== output3 filtered 1 1 vcfq file - if( fprintf(filtered_vcfFile,"%d,%d,%d,%d,%d,%d\n",pos,D, DP4[0], DP4[1],DP4[2], DP4[3]) <= 0 ) - { - canWriteDistrib = 0; - } - - }// end filter f1 DP4 - - } //END of outer while loop for all vcfq file, distrAF is computed - - fprintf(stderr,"count of good lines total before filter = %d\n", count_beforeF); - fprintf(stderr,"count of good lines total after filter = %d\n", count_afterF); - - //====================== output2 write AF - for(i=0;i<101;i++) - { - if( fprintf(distribFile,"%d %d\n",i, histAF[i]) <= 0 ) { - canWriteDistrib = 0; - } - }// for i=100 cycle - - // -----------------------if large enough vcfq file - if ( count_afterF >200 )// non empty, large enough after filtering vcfq - { - - - // ----------- check ploidy detection - ind=GetMax(histAF);// percent giving max peak in hist <80% - fprintf(stderr,"percent of max AAF= %d\n", ind); - - ind50=GetMax50(histAF);//percent giving max peak in hist within (40,60)% - fprintf(stderr,"percent of max AAF around 50= %d\n", ind50);// - - ploid=ploidy(ind);// automated detection of ploidy: 1 or 2 - fprintf(stderr,"ploidy= %d\n", ploid); - - - - // compute stats: average depth : default and actual after 1,1 and q25 filtering - - avDP4=ceil(sumDP4/count_afterF);//actual average cov after Filt - stdDP4=ceil(sqrt(sumDP4_2/count_afterF -avDP4*avDP4));// - perc_left=(100.0*count_afterF/count_beforeF); - - fprintf(stderr,"average Depth after filtering= %d\n", avDP4); - fprintf(stderr,"std Depth appr= %d\n", stdDP4); - fprintf(stderr,"percent data left after q25 & min_depth filtering thrR=1 thrA=1 = %.2f\n", perc_left); - - // adjust actual depth to Bad (deep covered) regions and recommend thr RA - - avDP41=avDP4; - ratio=(sqrt(sumDP4_2/count_afterF-avDP4*avDP4))/(sumDP4/count_afterF);//stdDP4/avDP4; - fprintf(stderr,"cv actual depth = %.2f\n", ratio); - - if ( ratio > 1.5) //bad regions - { - avDP41=ceil(0.8*avDP4); - fprintf(stderr,"moderately deep covered regions here\n"); - fprintf(stderr,"adjusted(avDP4) = %d\n", avDP41); - } - - if (ratio > 3) //very bad regions - { - avDP41=ceil(0.7*avDP4); - fprintf(stderr,"extremely deep covered regions here\n"); - fprintf(stderr,"adjusted(avDP4) = %d\n", avDP41); - } - - // recommendation for thrRR AA, depending on adjasted Depth - if ( avDP41 > 0 && avDP41 < 4 ) - { - thRR = 100;thAA=100;//stupidly high? - printf("better not to bother to look for mixture: too low actual coverage=%d\n",avDP4); - } - if ( avDP41 >= 4 && avDP41 < 10 ) - { - thRR = 1;thAA=1; - } - if ( avDP41 >= 10 && avDP41 < 70 ) - { - thRR = 2;thAA=2; - } - if ( avDP41 >= 70 && avDP41 < 90 ) - { - thRR = 3;thAA=2; - } - if ( avDP41 >= 90 ) - { - thRR = 3;thAA=3; - } - - }// if large enough vcfq file - - //====================== output1 write thrs, avD, stdD and ploidy - fprintf(thrFile,"%d %d %d %d %d\n",thRR,thAA,avDP41,stdDP4,ploid); + ploid = ploidy(ind);// automated detection of ploidy: 1 or 2 + fprintf(stderr, "ploidy= %d\n", ploid); - fclose(extract_vcfFile); - fclose(thrFile); - fclose(distribFile); - fclose(filtered_vcfFile); + // compute stats: average depth : default and actual after 1,1 and q25 filtering + + avDP4 = ceil(sumDP4 / count_afterF); // actual average cov after Filt + stdDP4 = ceil(sqrt((sumDP4_2 / count_afterF) - avDP4 * avDP4));// + perc_left = (100.0 * count_afterF / count_beforeF); + + fprintf(stderr, "average depth after filtering= %d\n", avDP4); + fprintf(stderr, "stdev depth after filtering= %d\n", stdDP4); + fprintf(stderr, "percent data left filtering= %.2f\n", perc_left); + + // adjust depth to account for bad (very deep) regions - // checking write/read - if( firstSixAreOK == 0)// || canWriteFilteredVCF == 0) + avDP41 = avDP4; + ratio = sqrt((sumDP4_2 / count_afterF) - (avDP4 * avDP4)) / (sumDP4 / count_afterF); // stdDP4 / avDP4; + fprintf(stderr, "cv actual depth = %.2f\n", ratio); + + if (ratio > 1.5) { + // bad regions + avDP41 = ceil(0.8*avDP4); + fprintf(stderr, "moderately deep regions detected\n"); + fprintf(stderr, "adjusted average depth = %d\n", avDP41); + } + + if (ratio > 3) { + // very bad regions + avDP41 = ceil(0.7*avDP4); + fprintf(stderr, "extremely deep covered detected\n"); + fprintf(stderr, "adjusted average depth = %d\n", avDP41); + } + + // calculate thresholds using adjusted Depth + if (avDP41 > 0 && avDP41 < 4 ) { - fprintf(stderr,"Error during execution. Details: \n"); - fprintf(stderr,"\tfirstSixAreOK %d\n", firstSixAreOK); - fprintf(stderr,"\tcanWriteDistrib %d\n", canWriteDistrib); - fprintf(stderr,"\tcanWriteFilteredVCF %d\n", canWriteFilteredVCF); - fprintf(stderr,"Execution aborted\n"); - exit(EXIT_FAILURE); + thRR = 100; thAA = 100; // stupidly high ? + fprintf(stderr, "coverage too low, adjusted average depth= %d\n", avDP41); + fprintf(stderr, "theshold set to exclude all variants in mixture calculation\n"); + } + if (avDP41 >= 4 && avDP41 < 10) { + thRR = 1; thAA = 1; + } + if (avDP41 >= 10 && avDP41 < 70) { + thRR = 2; thAA = 2; + } + if (avDP41 >= 70 && avDP41 < 90) { + thRR = 3; thAA = 2; + } + if (avDP41 >= 90) { + thRR = 3; thAA = 3; } + } + + // write threshold file + if (fprintf(thrFile, "%d %d %d %d %d\n", thRR, thAA, avDP41, stdDP4, ploid) <= 0) { + fprintf(stderr, "error writing threshold_file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + + // close files + fclose(extract_vcfFile); + fclose(thrFile); + fclose(filtered_vcfFile); - fprintf(stderr," done get_thr_ploidy.\n"); + fprintf(stderr, "done get_thr_ploidy.\n"); return 0; -}//main +} + +//////////////////////////////////////////////////// +// usage +//////////////////////////////////////////////////// +void usage(int code) +{ + FILE *usagefp = stderr; + + fprintf(usagefp, "get_thr_ploidy\n\n"); + fprintf(usagefp, + "Usage: get_thr_ploidy [options] input_vcf_file threshold_file filtered_vcf_file\n" + "\n" " computes minimum depth thresholds for reference and alternative alleles\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " options:\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " -r minimum reference allele depth\n"); + fprintf(usagefp, " default 1\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " -a minimum reference allele depth\n"); + fprintf(usagefp, " default 1\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " -v verbose\n"); + fprintf(usagefp, " default false\n"); + fprintf(usagefp, "\n"); + + exit(code); +} //////////////////////////////////////////////////// // Calculates AF=alternative allele frequency from data=DP4 counts // input - array (4-vector) of DP4 counts -//output -one float number from (0,1) +// output -index into array of percentages 0,..,100 //////////////////////////////////////////////////// int GetAf (int data[]) { - float Isum,AF1; - int i,AF; + float sum, AF1; + int i, AF; - Isum = data[0]; - for (i=1; i<4; i++)//there are four conts for each base - { - Isum += data[i]; + sum = data[0]; + for (i=1; i<4; i++) { + sum += data[i]; } - AF1=(data[2]+data[3])/Isum; - AF=(long) (100*AF1+0.5); - //(long) (sig+0.5) + AF1 = (data[2] + data[3]) / sum; + AF = (int) (100*AF1 + 0.5); + return AF; } -//--------------fins max value and its bin/index; returns bin; data is histo -int GetMax (int data[]) +//////////////////////////////////////////////////// +// find position of the max value in a sub-interval (i1<=i<=i2) of an array +//////////////////////////////////////////////////// +int GetMax (int data[], int i1, int i2) { - int Imax; - int i,ind; - - Imax = data[0]; - for (i=1; i<80; i++)// for histo: not latest bins (for bad reference ~100) - { - if( Imax < data[i]) - { - Imax = data[i]; - ind=i;// bin/index for max value - } - } - return ind; + int Imax; + int ind, i; + + Imax = data[i1]; + ind = i1; + for (i=i1; i40 && ind < 60) - ploid=2; - - return ploid; -} - -//-----------------------double check local max around 50+-10 + if ( ind >40 && ind < 60) { + ploid = 2; + } -int GetMax50 (int data[]) -{ - int Imax; - int i,ind; - - Imax = data[0]; - for (i=40; i<60; i++)// for histo: not latest bins (for bad reference ~100) - { - if( Imax < data[i]) - { - Imax = data[i]; - ind=i;// bin/index for max value - } - } - return ind; + return ploid; } -//------------------------- -int let2int(char letter) -{ - - int result; - - //initiate - result=0; - - if (letter=='a' || letter=='A') - result=1; - if (letter=='c' || letter=='C') - result=2; - if (letter=='g' || letter=='G') - result=3; - if (letter=='t' || letter=='T') - result=4; - - return result; - } - From 63dcb6c094612f16cbb292f75d61d23852533b06 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Fri, 29 Apr 2016 07:33:23 +0100 Subject: [PATCH 15/17] changed how to compute upper and lower thresholds: now automated --- src/contamination/get_thr_ploidy.c | 449 +++++++++++++++++++++++------ 1 file changed, 361 insertions(+), 88 deletions(-) diff --git a/src/contamination/get_thr_ploidy.c b/src/contamination/get_thr_ploidy.c index 9a6ac7d..f853cea 100755 --- a/src/contamination/get_thr_ploidy.c +++ b/src/contamination/get_thr_ploidy.c @@ -1,9 +1,11 @@ -/* File: get_thr_ploidy_vcfa_ch.c // filters 1 1 vcfq file and computes av std Depth depending on Depth of vcfq file +/* File: get_thr_ploidy.c // filters 1 1 vcfq file and computes av std Depth depending on Depth of vcfq file * Authors: designed by Irina Abnizova (ia1) and edited by Steve Leonard(srl) * Last edited: + 27 April output of the Actual Depth distribution + 14 April- merge with Steve's changes March 2016- prepared to read chr names and letters; skip bad lines @@ -37,7 +39,7 @@ #include // ******** predefined user constants, -#define NPAR 3 // Number of arguments +#define NPAR 4 // Number of arguments #define THR 1 // Default reference allele threshold #define THA 1 // Default alternative allele threshold #define NRID 6 // Number of fields (pos,Depth,DP4) in extracted_vcf files @@ -46,15 +48,22 @@ // ******** declarations of functions void usage(int code); // usage int GetAf (int []); // computes AF percent for a variant position -int GetMax (int [], int i1, int i2); // percent giving max AAF -int ploidy (int ind); // defines ploidy from results of GetMax ploid=2 if ~50% +int GetActualDepth (int []);// computes actual depth for a variant position, sumDP4 +int GetMaxIndex (int [], int i1, int i2); // percent giving max AAF +int ploidy (int ind); // defines ploidy from results of GetMaxIndex ploid=2 if ~50% + +int GetMax (int hist[], int nbins); +int GetMin(int hist[], int nbins); +int EstimateStd (int hist[],int bins[],int nbins, int mu, int height);// returns sd +void GetThresholds(int thr[],int depth_min, int depth_max, int avDP4, int stdDP4, int mode_depth, int sdmo, int bin_width); + //////////////////////////////////////////////////// // main //////////////////////////////////////////////////// int main (int argc, char *argv[]) { - // min_depths for Ref and Alternative alleles + // depth_mins for Ref and Alternative alleles int thR = THR; int thA = THA; @@ -62,10 +71,10 @@ int main (int argc, char *argv[]) { int verbose = 0; // file names - char *extract_vcf_file, *thr_file, *filtered_vcf_file; + char *extract_vcf_file, *thr_file, *filtered_vcf_file, *depth_distr_file; // file handles - FILE *extract_vcfFile, *thrFile, *filtered_vcfFile; + FILE *extract_vcfFile, *thrFile, *filtered_vcfFile, *depth_distrFile; // to read vcfq static const int line_size = 8192; // maximum line size @@ -74,25 +83,34 @@ int main (int argc, char *argv[]) { // values in vcfq file int DP4[4]; // 4xdepths int pos, D; // genome pos of error, depth + int bin_width;// for depth histo int count_beforeF; // variant count before filtering int count_afterF; // variant count after filtering // arrays - int histAF[100]; // maf percentages + int histAF[101]; // maf percentages + int histAAD[101];// distribution of average actual depth DP4 per one percentage of AAF + int bins_depth[101]; + int hist_depth[101]; + // stats float sumDP4, sumDP4_2; // sum of depths and sum of square depths used for mu/std calculation + float sDP4; + // to compute for outputs - int ploid; // polidy + int ploid; // ploidy int avDP4, stdDP4; // mean and std of depth after filtering - int avDP41; // adjusted mean depth - int thRR, thAA; // estimated thresholds based on actual depths avDP4 and stdDP4 + //int thRR, thAA; // estimated thresholds based on actual depths avDP4 and stdDP4 + int depth_max,depth_min; + int thr_low,thr_up; + int i; // array index - static struct option long_options[] = + static struct option long_options[] = { {"thR", 1, 0, 'r'}, {"thA", 1, 0, 'a'}, {"verbose", 0, 0, 'v'}, @@ -121,6 +139,7 @@ int main (int argc, char *argv[]) { extract_vcf_file = argv[optind+0]; thr_file = argv[optind+1]; filtered_vcf_file = argv[optind+2]; + depth_distr_file = argv[optind+3]; } // open files for read/write @@ -140,13 +159,29 @@ int main (int argc, char *argv[]) { exit(EXIT_FAILURE); } + //*depth_distr_file + + depth_distrFile = fopen(depth_distr_file,"w"); + if (depth_distrFile == NULL) { + fprintf(stderr, "cannot open depth_distr_file %s: %s\n", depth_distr_file, strerror(errno)); + exit(EXIT_FAILURE); + } + fprintf(stderr, "get_thr_ploidy\n"); - // initialise AAF histogram - for (i=0;i<100;i++) { + // initialise depth histograms + bin_width=10; + for (i=0;i<101;i++) { + hist_depth[i] = 0.0; + bins_depth[i]=1+i*bin_width; + } + + // initialise AAF and AAD histograma + for (i=0;i<101;i++) { histAF[i] = 0; + histAAD[i]=0; } - + // initialise counts count_beforeF = 0; count_afterF = 0; @@ -154,8 +189,13 @@ int main (int argc, char *argv[]) { // initialise sums sumDP4 = 0; sumDP4_2 = 0; + sDP4=30.0; + + depth_max=sDP4; + depth_min=sDP4; + - // read vcfq file - 6 fields position, depth and 4xdepths (ref/forward, ref/reverse, alt/forward and alt/reverse) + // read vcfq file - 6 fields position, depth and 4xdepths (ref/forward, ref/reverse, alt/forward and alt/reverse) while (fgets(line, line_size, extract_vcfFile)) { int k = sscanf(line, "%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3]); if (k != NRID) { @@ -166,20 +206,35 @@ int main (int argc, char *argv[]) { // filter for DP4 separately for ref and alternative alleles if (DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA) { - int AF; - float sDP4; + int AF,AAD;//altrenative allele frequency, actual average depth per variant + //float sDP4; + count_afterF++; - // compute AF and update histo + // compute AF , AAD and update histos AF = GetAf(DP4); histAF[AF-1]++; + AAD =GetActualDepth(DP4); + histAAD[AF-1]=histAAD[AF-1]+AAD; + // calc sDP4 and update sumDP4, sumDP4_2 sDP4 = DP4[0] + DP4[1] + DP4[2] + DP4[3]; sumDP4 += sDP4; sumDP4_2 += sDP4*sDP4; + if (depth_max < sDP4) + depth_max=sDP4; + + if (depth_min > sDP4) + depth_min=sDP4; + + for (i=0;i<100;i++) { + if ((sDP4 >= bins_depth[i]) && (sDP4 < bins_depth[i+1])) + hist_depth[i]++; + } + // write to filtered vcf file if (fprintf(filtered_vcfFile,"%d,%d,%d,%d,%d,%d\n",pos,D, DP4[0], DP4[1],DP4[2], DP4[3]) <= 0) { fprintf(stderr, "error writing filteredVCF file: %s\n", strerror(errno)); @@ -187,6 +242,23 @@ int main (int argc, char *argv[]) { } } } + fprintf(stderr, "min depth after filtering 1 1= %d\n", depth_min); + fprintf(stderr, "max depth after filtering 1 1= %d\n", depth_max); + + for(i=0;i<101;i++){ + if (histAF[i] > 0) { + histAAD[i]=ceil(histAAD[i]/histAF[i]); + + } + } + // write distribution file + for (i=0;i<100;i++) { + if (fprintf(depth_distrFile,"%d %d\n",bins_depth[i], hist_depth[i]) <= 0) { + fprintf(stderr, "error writing depth distribution_file: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + } + fprintf(stderr, "count of variants before filtering = %d\n", count_beforeF); fprintf(stderr, "count of variants after filtering = %d\n", count_afterF); @@ -195,9 +267,11 @@ int main (int argc, char *argv[]) { avDP4 = 0; // mean depth stdDP4 = 0; // stddev depth ploid = 1; // ploidy - avDP41 = 0; // adjustered mean depth - thRR = 0; // estimated reference threshold - thAA = 0; // estimated alternative threshold + //avDP41 = 0; // adjusted mean depth + //thRR =100; // estimated reference threshold: set up stupidly high in case not enough data + //thAA =100; // estimated alternative threshold + thr_low=100; + thr_up=0; // do we have enough data ? if (count_afterF < MIN_COUNT_AFTERF) { @@ -205,14 +279,37 @@ int main (int argc, char *argv[]) { } else { int ind, ind50; + int mode_depth,ind_mode; + int height,sdmo; float perc_left; - float ratio; + //float ratio; + + int thr[2];//for output low up thr + + //initialise thr up low + for (i=0; i<2;i++){ + thr[i]=0; + } + // ploidy detection - ind = GetMax(histAF, 1, 80);// percent giving max peak in hist in interval (1,80)% + ind = GetMaxIndex(histAF, 1, 80);// percent giving max peak in hist in interval (1,80)% fprintf(stderr, "percent of max AAF= %d\n", ind); - ind50 = GetMax(histAF, 40, 60);//percent giving max peak in hist in interval (40,60)% + + // -----------compute params of main mode: mode and its sdmo + // mode is the bin with its index + height = GetMax(hist_depth,100); + ind_mode = GetMaxIndex(hist_depth, 1, 100); + mode_depth=bins_depth[ind_mode]; + + //nbins=100; + sdmo = EstimateStd(hist_depth,bins_depth,100,mode_depth,height);//for depth hist + + fprintf(stderr, "mode depth= %d\n", mode_depth);// + fprintf(stderr, "std of mode depth= %d\n", sdmo);// + + ind50 = GetMaxIndex(histAF, 40, 60);//percent giving max peak in hist in interval (40,60)% fprintf(stderr, "percent of max AAF around 50= %d\n", ind50);// ploid = ploidy(ind);// automated detection of ploidy: 1 or 2 @@ -224,61 +321,31 @@ int main (int argc, char *argv[]) { stdDP4 = ceil(sqrt((sumDP4_2 / count_afterF) - avDP4 * avDP4));// perc_left = (100.0 * count_afterF / count_beforeF); + fprintf(stderr, "average depth after filtering= %d\n", avDP4); fprintf(stderr, "stdev depth after filtering= %d\n", stdDP4); - fprintf(stderr, "percent data left filtering= %.2f\n", perc_left); - // adjust depth to account for bad (very deep) regions + fprintf(stderr, "percent data left filtering= %.2f\n", perc_left); - avDP41 = avDP4; - ratio = sqrt((sumDP4_2 / count_afterF) - (avDP4 * avDP4)) / (sumDP4 / count_afterF); // stdDP4 / avDP4; - fprintf(stderr, "cv actual depth = %.2f\n", ratio); - if (ratio > 1.5) { - // bad regions - avDP41 = ceil(0.8*avDP4); - fprintf(stderr, "moderately deep regions detected\n"); - fprintf(stderr, "adjusted average depth = %d\n", avDP41); - } - - if (ratio > 3) { - // very bad regions - avDP41 = ceil(0.7*avDP4); - fprintf(stderr, "extremely deep covered detected\n"); - fprintf(stderr, "adjusted average depth = %d\n", avDP41); - } + //------------------------------ automated working out of thresholds: thr_low thr_upper - // calculate thresholds using adjusted Depth - if (avDP41 > 0 && avDP41 < 4 ) - { - thRR = 100; thAA = 100; // stupidly high ? - fprintf(stderr, "coverage too low, adjusted average depth= %d\n", avDP41); - fprintf(stderr, "theshold set to exclude all variants in mixture calculation\n"); - } - if (avDP41 >= 4 && avDP41 < 10) { - thRR = 1; thAA = 1; - } - if (avDP41 >= 10 && avDP41 < 70) { - thRR = 2; thAA = 2; - } - if (avDP41 >= 70 && avDP41 < 90) { - thRR = 3; thAA = 2; - } - if (avDP41 >= 90) { - thRR = 3; thAA = 3; - } - } + GetThresholds(thr,depth_min,depth_max, avDP4, stdDP4, mode_depth, sdmo, bin_width); + thr_low=thr[0]; + thr_up=thr[1]; + }// if enough data // write threshold file - if (fprintf(thrFile, "%d %d %d %d %d\n", thRR, thAA, avDP41, stdDP4, ploid) <= 0) { + if (fprintf(thrFile, "%d %d %d %d %d\n", thr_low, thr_up, avDP4, stdDP4, ploid) <= 0) { fprintf(stderr, "error writing threshold_file: %s\n", strerror(errno)); exit(EXIT_FAILURE); } - + // close files fclose(extract_vcfFile); fclose(thrFile); fclose(filtered_vcfFile); + fclose(depth_distrFile); fprintf(stderr, "done get_thr_ploidy.\n"); return 0; @@ -291,24 +358,24 @@ void usage(int code) { FILE *usagefp = stderr; - fprintf(usagefp, "get_thr_ploidy\n\n"); - fprintf(usagefp, - "Usage: get_thr_ploidy [options] input_vcf_file threshold_file filtered_vcf_file\n" - "\n" " computes minimum depth thresholds for reference and alternative alleles\n"); - fprintf(usagefp, "\n"); - fprintf(usagefp, " options:\n"); - fprintf(usagefp, "\n"); - fprintf(usagefp, " -r minimum reference allele depth\n"); - fprintf(usagefp, " default 1\n"); - fprintf(usagefp, "\n"); - fprintf(usagefp, " -a minimum reference allele depth\n"); - fprintf(usagefp, " default 1\n"); - fprintf(usagefp, "\n"); - fprintf(usagefp, " -v verbose\n"); - fprintf(usagefp, " default false\n"); - fprintf(usagefp, "\n"); - - exit(code); + fprintf(usagefp, "get_thr_ploidy\n\n"); + fprintf(usagefp, + "Usage: get_thr_ploidy [options] input_vcf_file threshold_file filtered_vcf_file\n" + "\n" " computes minimum depth thresholds for reference and alternative alleles\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " options:\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " -r minimum reference allele depth\n"); + fprintf(usagefp, " default 1\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " -a minimum reference allele depth\n"); + fprintf(usagefp, " default 1\n"); + fprintf(usagefp, "\n"); + fprintf(usagefp, " -v verbose\n"); + fprintf(usagefp, " default false\n"); + fprintf(usagefp, "\n"); + + exit(code); } //////////////////////////////////////////////////// @@ -330,20 +397,37 @@ int GetAf (int data[]) return AF; } +//////////////////////////////////////////////////// +// Calculates actual depth from data=DP4 counts +// input - array (4-vector) of DP4 counts +// output -sumDP4 into array of percentages 0,..,100 +//////////////////////////////////////////////////// +int GetActualDepth (int data[]) +{ + float sum; + int i; + + sum = data[0]; + for (i=1; i<4; i++) { + sum += data[i]; + } + + return sum; +} //////////////////////////////////////////////////// // find position of the max value in a sub-interval (i1<=i<=i2) of an array //////////////////////////////////////////////////// -int GetMax (int data[], int i1, int i2) +int GetMaxIndex (int data[], int i1, int i2) { - int Imax; + int data_max; int ind, i; - Imax = data[i1]; + data_max = data[i1]; ind = i1; for (i=i1; i= mu)// to the Right of mode + { + if ((hist[n] > threshold) & (hist[n] >= hist[n+1]))// added monotonity condition 29 Jan + { + ma_bin = bins[n]; + } + else + { + break; + } + } + } + + for (n=nbins-1; n>=0; n--) + { + if (bins[n] <= mu)// to the Left of mode + { + if ((hist[n] > threshold) & (hist[n-1] <= hist[n]))// added monotonity condition 29 Jan + { + mi_bin = bins[n]; + } + else + { + break; + } + } + } + + sd = ceil(0.5 * (ma_bin - mi_bin)); + + if (mu==bins[0]) + sd = (ma_bin - mi_bin); + if (mu==bins[nbins-1]) + sd = (ma_bin - mi_bin); + + + return sd; +} +/////////////////////// +//// --------- maximum value in a vector +///// +int GetMax (int hist[], int nbins) +{ + float Hmax; + int i; + + Hmax = hist[0]; + for (i=0; i hist[i]) + Hmin = hist[i]; + + } + + return Hmin; +} + +/////////////////////// +//// --------- work out threshold up and low from depth_hist and its params: updates thr array +// input int thr[] = thr_up thr_low +///// +void GetThresholds(int thr[],int depth_min, int depth_max, int avDP4, int stdDP4, int mode_depth, int sdmo, int bin_width) +{ + int i; + float cvv; + float thrL,thrU; + int data[2]; + + //initialise thresholds + thrL=depth_min*0.25; + thrU=depth_max; + + // depending on depth distribution: its symmetry, range, variation + cvv=0.0; + if (avDP4 > 0){ + cvv=1.0*stdDP4/avDP4; + } + fprintf(stderr, "CV of depth= %.2f\n", cvv); + + if (cvv>1){ + fprintf(stderr, "some regions are stupidly deep covered\n"); + } + + + //1 assymetric distribution + + //1.1 skewed to the left: (avDP4-mode_depth) > bin_width + + if ((avDP4-mode_depth) > bin_width){ + data[0]=depth_min; + data[1]=mode_depth-sdmo; + thrU=GetMax(data,2); + + data[0]=depth_max; + data[1]=avDP4+3*stdDP4; + thrL=GetMin(data,2); + } + + //1.2 skewed to the right: (avDP4-mode_depth) < - bin_width + + if ((avDP4-mode_depth) < - bin_width){ + data[0]=depth_min; + data[1]=avDP4 - 2*stdDP4; + thrU=GetMax(data,2); + + data[0]=depth_max; + data[1]=mode_depth+sdmo; + thrL=GetMin(data,2); + } + + //1.3 symmetrical: mu+-3 std + + + if ( (avDP4-mode_depth) >= - bin_width && (avDP4-mode_depth) <= bin_width ){ + data[0]=depth_min; + data[1]=avDP4 - 2*stdDP4; + thrU=GetMax(data,2); + + data[0]=depth_max; + data[1]=avDP4 + 2*stdDP4;; + thrL=GetMin(data,2); + } + + + + thr[0]=ceil(thrU*0.25); + thr[1]=ceil(thrL); + + for (i=0;i<2;i++){ + fprintf(stderr, "thr filt low_up = %d\n", thr[i]); + + } + +} + + + + + + + + + + + + + + + + + + + + From 5894e034f8f3782f0f23f29d258bcaa9b5fb1e01 Mon Sep 17 00:00:00 2001 From: Irina Abnizova Date: Fri, 29 Apr 2016 07:37:09 +0100 Subject: [PATCH 16/17] adjusted for read depth bias in likelihoods of mixture; used new thresholds --- src/contamination/get_mixture.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/contamination/get_mixture.c b/src/contamination/get_mixture.c index c856d5d..10dde46 100755 --- a/src/contamination/get_mixture.c +++ b/src/contamination/get_mixture.c @@ -27,7 +27,7 @@ // ******** predefined user constants, #define NPAR 4 // Number of arguments -#define NTHR 5 // Number of fields (thR, thA, mu, std and ploidy (1=haploid 2=diploid)) +#define NTHR 5 // Number of fields (thr_low, thr_up, mu, std and ploidy (1=haploid 2=diploid)) #define NRID 6 // Number of fields (pos,Depth,DP4) in extracted_vcf files #define MIN_COUNT_AFTERF 200 // Minimum variant count after filtering #define DEFAULT_DIST 11 // Default bin width for hist to compute skewness @@ -59,7 +59,7 @@ int main (int argc, char *argv[]) { // values in threshold file int muD, stdD; - int thR, thA, ploid; + int thr_low, thr_up, ploid; // to read vcfa/vcfq static const int line_size = 8192; // maximum line size @@ -86,13 +86,13 @@ int main (int argc, char *argv[]) { int count_afterF; // variant count after filtering //---------------------------arrays - int histAF[100]; // to store mafs percentages + int histAF[101]; // to store mafs percentages int st[4], stc [4]; // starts of summing intervals for histo float sk[3], skc[3]; // three skewnesses for mode0,1,2 int mix[3], mixc[3]; // mixtures three modes and their complements float Lik[3]; // likelihood or confidences three modes - static struct option long_options[] = + static struct option long_options[] = { {"verbose", 0, 0, 'v'}, {"help", 0, 0, 'h'}, {0, 0, 0, 0} @@ -145,13 +145,13 @@ int main (int argc, char *argv[]) { fprintf(stderr, "get_mixture\n"); // initialise AAF histogram - for (i=0;i<100;i++) { + for (i=0;i<101;i++) { histAF[i] = 0; } // read thr file - 5 fields theshold ref, threshold alt, mean depath, stddev depth and ploidy while (fgets(line, line_size, thrFile)) { - int k = sscanf(line, "%d %d %d %d %d", &thR, &thA, &muD, &stdD, &ploid); + int k = sscanf(line, "%d %d %d %d %d", &thr_low, &thr_up, &muD, &stdD, &ploid); if (k != NTHR) { // number of fields read not correct fprintf (stderr, "corrupt threshold file %s\n", line); @@ -167,7 +167,7 @@ int main (int argc, char *argv[]) { // initialise sums sumDP4 = 0; - // read vcfq file - 6 fields position, depth and 4xdepth (ref/forward, ref/reverse, alt/forward and alt/reverse) + // read vcfq file - 6 fields position, depth and 4xdepth (ref/forward, ref/reverse, alt/forward and alt/reverse) while (fgets(line, line_size, extract_vcfFile)) { int k = sscanf(line, "%d,%d,%d,%d,%d,%d", &pos, &D, &DP4[0], &DP4[1], &DP4[2], &DP4[3]); if (k != NRID) { @@ -178,7 +178,7 @@ int main (int argc, char *argv[]) { count_beforeF++; // filter for DP4 separately for ref and alternative alleles and abnormaly large Depth - if( DP4[0] >= thR && DP4[1]>= thR && DP4[2]>= thA && DP4[3]>= thA && D < (muD+2*stdD) ) { + if( DP4[0] >= thr_low && DP4[1]>=thr_low && DP4[2]>= thr_low && DP4[3]>= thr_low && D < (thr_up) ) { int AF; float sDP4; @@ -254,7 +254,7 @@ int main (int argc, char *argv[]) { exit(EXIT_FAILURE); } } - + // write mixture file if (fprintf(mixFile,"mix low freq=%d\nconfidence low freq=%.4f\n", mix[0], Lik[0]) <= 0 ) { fprintf(stderr, "error writing mixture_file: %s\n", strerror(errno)); @@ -268,7 +268,7 @@ int main (int argc, char *argv[]) { fprintf(stderr, "error writing mixture_file: %s\n", strerror(errno)); exit(EXIT_FAILURE); } - if (fprintf(mixFile,"AvActDepth=%d\nmin_depthR=%d\nmin_depthA=%d\n", avDP4, thR, thA) <= 0 ) { + if (fprintf(mixFile,"AvActDepth=%d\nmin_depth=%d\nmax_depth=%d\n", avDP4, thr_low, thr_up) <= 0 ) { fprintf(stderr, "error writing mixture_file: %s\n", strerror(errno)); exit(EXIT_FAILURE); } @@ -369,7 +369,7 @@ float GetStd( int data[]) } //////////////////////////////////////////////////// -// new skewnesses for three modes +// skewnesses for three modes //////////////////////////////////////////////////// void skewnesses (int data[], float sk[], float skc[], int st[], int stc[]) { @@ -413,7 +413,7 @@ void skewnesses (int data[], float sk[], float skc[], int st[], int stc[]) } //////////////////////////////////////////////////// -// mixture mode +// mixture in a mode //////////////////////////////////////////////////// void MixMode(int mix[], int st[], int data[], int dist) { @@ -430,7 +430,7 @@ void MixMode(int mix[], int st[], int data[], int dist) } //////////////////////////////////////////////////// -// mixture mode complement +// mixture in the complementary modes, 100 - alfa //////////////////////////////////////////////////// void MixModeComplement(int mixc[], int stc[], int data[], int dist) { @@ -502,6 +502,7 @@ int GetMaxPeakInterval(int data[], int n1, int n2) //////////////////////////////////////////////////// // confidences modes 0,1,2 +// we assume that the larger is skewness of a mode (pronounced humph), the more likely mixture belongs there //////////////////////////////////////////////////// void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4) { @@ -516,7 +517,9 @@ void Likely (float Lik[], float sk[],float skc[], int mixc[],float avDP4) l = sk[i]; lc = skc[2-i]; Lik[i] = (1-1/avDP4) * (l+lc+lp) / 3; - } + } + Lik[2]=(1-1/avDP4) * (l+lc*0.5+lp*0.5) / 3;// adjusted for majority of reads in this range +; } From 93a0e50a6dffc959b38a6adafb7553fa32598db5 Mon Sep 17 00:00:00 2001 From: Steven Leonard Date: Wed, 30 Nov 2016 15:36:21 +0000 Subject: [PATCH 17/17] work round a bwa bug, exclude properly paired reads if the mate is unmapped --- src/calibration_pu/calibration_pu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibration_pu/calibration_pu.c b/src/calibration_pu/calibration_pu.c index 0458cc0..5cc5753 100644 --- a/src/calibration_pu/calibration_pu.c +++ b/src/calibration_pu/calibration_pu.c @@ -1553,6 +1553,7 @@ SurvTable **makeSurvTable(Settings *s, samfile_t *fp_bam, int *bam_ntiles, size_ if (BAM_FSECONDARY & bam->core.flag) continue; if (BAM_FSUPPLIMENTARY & bam->core.flag) continue; if (BAM_FPAIRED & bam->core.flag) { + if (BAM_FMUNMAP & bam->core.flag) continue; if (0 == (BAM_FPROPER_PAIR & bam->core.flag)) { continue; }