-
Notifications
You must be signed in to change notification settings - Fork 3
/
ngs.h
430 lines (338 loc) · 10.6 KB
/
ngs.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
/***************************************************************************
* Copyright (C) 2009 by BUI Quang Minh *
* minh.bui@univie.ac.at *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
#ifndef NGS_H
#define NGS_H
#include "phylotree.h"
#include "alignmentpairwise.h"
#include "ratemeyerdiscrete.h"
/*
collection of classes for Next-generation sequencing
*/
/**
NGS Pairwise alignment
@author BUI Quang Minh <minh.bui@univie.ac.at>
*/
class NGSAlignment : public AlignmentPairwise
{
public:
/**
constructor
@param filename file in Fritz's format
*/
NGSAlignment(PhyloTree *atree);
NGSAlignment(const char *filename);
/**
constructor
@param nstate number of states
@param ncat number of categories
@param freq pair-state frequencies for all categories
*/
NGSAlignment(int nstate, int ncat, double *freq);
NGSAlignment(int nstate, string &seq1, string &seq2);
virtual char convertState(char state, SeqType seq_type);
/**
read file in Fritz's format
*/
void readFritzFile(const char *filename);
/**
compute empirical state frequencies from the alignment
@param state_freq (OUT) is filled with state frequencies, assuming state_freq was allocated with
at least num_states entries.
*/
virtual void computeStateFreq(double *state_freq);
/**
compute the sum of pair state frequencies over all categories
@param sum_pair_freq (OUT) will be filled in with num_states*num_states entries.
Memory has to be allocated before calling this function.
*/
void computeSumPairFreq (double *sum_pair_freq);
/**
compute empirical rates between state pairs
@param rates (OUT) vector of size num_states*(num_states-1)/2 for the rates
*/
virtual void computeEmpiricalRate (double *rates);
/**
compute the empirical distance for a category, used to initialize rate scaling factor
@param cat specific category, between 0 and ncategory-1
*/
double computeEmpiricalDist(int cat);
/**
negative likelihood function for a category with a rate scaling factor
@param cat specific category, between 0 and ncategory-1
@param value a rate scaling factor
@return negative log-likelihood (for minimization purpose)
*/
double computeFunctionCat(int cat, double value);
/**
negative likelihood and 1st and 2nd derivative function for a category with a rate scaling factor
@param cat specific category, between 0 and ncategory-1
@param value a rate scaling factor
@param df (OUT) 1st derivative
@param ddf (OUT) 2nd derivative
@return negative log-likelihood (for minimization purpose)
*/
double computeFuncDervCat(int cat, double value, double &df, double &ddf);
/**
number of category
*/
int ncategory;
//double *pair_freq;
};
class NGSTree : public PhyloTree {
public:
/**
* Constructor with given alignment
* @param params program parameters
* @param alignment
*/
NGSTree(Params ¶ms, NGSAlignment *alignment);
/**
compute the tree likelihood
@param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
assuming pattern_lh has the size of the number of patterns
@return tree likelihood
*/
virtual double computeLikelihood(double *pattern_lh = NULL);
/**
optimize all branch lengths of the tree
@param iterations number of iterations to loop through all branches
@return the likelihood of the tree
*/
virtual double optimizeAllBranches(int my_iterations = 100, double tolerance = TOL_LIKELIHOOD);
};
class NGSRate : public RateMeyerDiscrete {
public:
/**
@param tree must be NGSTree type
*/
NGSRate(PhyloTree *tree);
/**
get rate category of a specified site-pattern.
@param ptn pattern ID
@return the rate category of the specified site-pattern
*/
virtual int getPtnCat(int ptn) { return 0; }
/**
optimize rates of all site-patterns
compute categorized rates from the "continuous" rate of the original Meyer & von Haeseler model.
The current implementation uses the k-means algorithm with k-means++ package.
*/
virtual double optimizeParameters(double epsilon);
/**
This function is inherited from Optimization class for optimizting site rates
@param value x-value of the function
@return f(value) of function f you want to minimize
*/
virtual double computeFunction(double value);
/**
This function calculate f(value), first derivative f'(value) and 2nd derivative f''(value).
@param value x-value of the function
@param df (OUT) first derivative
@param ddf (OUT) second derivative
@return f(value) of function f you want to minimize
*/
virtual double computeFuncDerv(double value, double &df, double &ddf);
/**
classify rates into categories.
@param tree_lh the current tree log-likelihood
*/
virtual double classifyRates(double tree_lh) { return tree_lh; }
/**
write information
@param out output stream
*/
virtual void writeInfo(ostream &out);
};
class NGSRateCat : public RateMeyerDiscrete {
public:
/**
@param tree must be NGSTree type
*/
NGSRateCat(PhyloTree *tree, int ncat);
/**
optimize rates of all site-patterns
compute categorized rates from the "continuous" rate of the original Meyer & von Haeseler model.
The current implementation uses the k-means algorithm with k-means++ package.
*/
virtual double optimizeParameters(double epsilon);
/**
return the number of dimensions
*/
virtual int getNDim();
/**
the target function which needs to be optimized
@param x the input vector x
@return the function value at x
*/
virtual double targetFunk(double x[]);
/**
write information
@param out output stream
*/
virtual void writeInfo(ostream &out);
/**
proportion of position categories
*/
double *proportion;
/**
sum of pair freq from all positions
*/
double *sum_pair_freq;
protected:
/**
this function is served for the multi-dimension optimization. It should pack the model parameters
into a vector that is index from 1 (NOTE: not from 0)
@param variables (OUT) vector of variables, indexed from 1
*/
virtual void setVariables(double *variables);
/**
this function is served for the multi-dimension optimization. It should assign the model parameters
from a vector of variables that is index from 1 (NOTE: not from 0)
@param variables vector of variables, indexed from 1
*/
virtual void getVariables(double *variables);
};
class NGSTreeCat : public NGSTree {
public:
/**
* Constructor with given alignment
* @param params program parameters
* @param alignment
*/
NGSTreeCat(Params ¶ms, NGSAlignment *alignment);
/**
compute the tree likelihood
@param pattern_lh (OUT) if not NULL, the function will assign pattern log-likelihoods to this vector
assuming pattern_lh has the size of the number of patterns
@return tree likelihood
*/
virtual double computeLikelihood(double *pattern_lh = NULL);
};
class NGSRead : public NGSAlignment {
public:
/**
constructor
*/
NGSRead(PhyloTree *atree);
void init();
//int orig_length;
/**
alignment score
*/
int score; //brauch ich das???
/**
read ID
*/
int id;
//int scaff_id;
/**
matched position in the reference sequence
*/
int match_pos;
/**
TRUE for mapping forward strand, FALSE for backward
*/
bool direction;
/**
name of the reference sequence for which match found
*/
string chr;
/**
mapped portion of reference sequence
*/
string scaff;
/**
mapped portion of read
*/
string read;
/**
number of times that read is mapped (multiple optimal alignment scores)
*/
double times;
/**
TRUE if it is the first match, FALSE otherwise (in case of multiple hits)
*/
bool flag;
/**
alignment identity
*/
float identity;
/**
read name
*/
string name;
double homo_rate;
void computePairFreq();
/**
compute the likelihood for a distance between two sequences. Used for the ML optimization of the distance.
@param value x-value of the function
@return log-likelihood
*/
virtual double computeFunction(double value);
/**
This function calculate f(value), first derivative f'(value) and 2nd derivative f''(value).
used by Newton raphson method to minimize the function.
@param value x-value of the function
@param df (OUT) first derivative
@param ddf (OUT) second derivative
@return f(value) of function f you want to minimize
*/
virtual double computeFuncDerv(double value, double &df, double &ddf);
};
struct ReadInfo {
int id;
float identity;
float distance;
float logl;
float homo_distance;
float homo_logl;
};
class NGSReadSet : public vector<ReadInfo> {
public:
/**
read in file containing mapped reads to the reference
@param filename file name
@param ref_ID reference sequence name to accept reads
@param ident identity threshold to accept reads
@param mismatches number of exact mismatches to accept reads
*/
void parseNextGen(string filename, string ref_ID="total",double ident=0.0,int mismatches=-1);
/**
this function will be called everytime a read is accepted from the parseNextGen()
@param tempread read at current position while parsing
*/
virtual void processReadWhileParsing(NGSRead &tempread);
void writeFreqMatrix(ostream &out);
/**
write information
*/
void writeInfo();
PhyloTree *tree;
double homo_rate;
vector<double*> pair_freq;
vector<double*> state_freq;
bool ngs_ignore_gaps;
};
/**
Main function
@param params input program parameters
*/
void runNGSAnalysis(Params ¶ms);
#endif