-
Notifications
You must be signed in to change notification settings - Fork 0
/
Chapter3.bib
1522 lines (1420 loc) · 125 KB
/
Chapter3.bib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/
%% Created for Michael P. Cummings at 2015-02-03 22:42:20 +0100
%% Saved with string encoding Unicode (UTF-8)
@article{hartigan1979algorithm,
  author    = {Hartigan, John A and Wong, Manchek A},
  title     = {Algorithm {AS} 136: A k-means clustering algorithm},
  journal   = {Applied Statistics},
  volume    = {28},
  number    = {1},
  pages     = {100--108},
  year      = {1979},
  publisher = {JSTOR}
}
@article{zhou2014compression,
  author        = {Zhou, Jiarui and Ji, Zhen and Zhu, Zexuan and He, Shan},
  title         = {Compression of next-generation sequencing quality scores using memetic algorithm},
  journal       = {BMC Bioinformatics},
  volume        = {15},
  number        = {Suppl 15},
  pages         = {S10},
  year          = {2014},
  publisher     = {BioMed Central},
  date-added    = {2015-02-03 20:08:27 +0000},
  date-modified = {2015-02-03 20:08:27 +0000}
}
% The citation key keeps the online-first year (2013) so existing citations still resolve;
% the fields below describe the final printed issue. The former pages value (btt257) was
% the DOI suffix and now lives in the doi field.
@article{janin2013adaptive,
  author        = {Janin, Lilian and Rosone, Giovanna and Cox, Anthony J},
  title         = {Adaptive reference-free compression of sequence quality scores},
  journal       = {Bioinformatics},
  volume        = {30},
  number        = {1},
  pages         = {24--30},
  year          = {2014},
  publisher     = {Oxford Univ Press},
  doi           = {10.1093/bioinformatics/btt257},
  date-added    = {2015-02-03 19:53:44 +0000},
  date-modified = {2015-02-03 19:53:44 +0000}
}
% Names are written "Last, Initials": without the comma BibTeX takes the trailing
% initials ("NA", "JN") as the surname.
@electronic{sickle,
  author        = {Joshi, N. A. and Fass, J. N.},
  title         = {Sickle: A sliding-window, adaptive, quality-based trimming tool for {FastQ} files (Version 1.33)},
  url           = {https://github.com/najoshi/sickle},
  year          = {2013},
  date-added    = {2015-02-03 09:52:02 +0000},
  date-modified = {2015-02-03 10:14:06 +0000}
}
% arXiv preprint: the identifier is stored in the eprint fields rather than in a
% journal field, so eprint-aware styles can format and link it.
% NOTE(review): styles that ignore eprint and url will no longer print the arXiv id - confirm the target style.
@misc{asnani2012lossy,
  author        = {Asnani, Himanshu and Bharadia, Dinesh and Chowdhury, Mainak and Ochoa, Idoia and Sharon, Itai and Weissman, Tsachy},
  title         = {Lossy compression of quality values via rate distortion theory},
  year          = {2012},
  eprint        = {1207.5184},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1207.5184},
  date-added    = {2015-02-02 18:12:19 +0000},
  date-modified = {2015-02-02 18:12:19 +0000}
}
@article{Deorowicz:2013hq,
  author        = {Deorowicz, Sebastian and Grabowski, Szymon},
  title         = {Data compression for sequencing data},
  journal       = {Algorithms for Molecular Biology},
  volume        = {8},
  number        = {1},
  pages         = {25},
  year          = {2013},
  doi           = {10.1186/1748-7188-8-25},
  pmc           = {PMC3868316},
  pmid          = {24252160},
  pst           = {epublish},
  journal-full  = {Algorithms for molecular biology : AMB},
  abstract      = {Post-Sanger sequencing methods produce tons of data, and there is a general agreement that the challenge to store and process them must be addressed with data compression. In this review we first answer the question "why compression" in a quantitative manner. Then we also answer the questions "what" and "how", by sketching the fundamental compression ideas, describing the main sequencing data types and formats, and comparing the specialized compression algorithms and tools. Finally, we go back to the question "why compression" and give other, perhaps surprising answers, demonstrating the pervasiveness of data compression techniques in computational biology.},
  date-added    = {2015-02-02 17:58:05 +0000},
  date-modified = {2015-02-02 17:58:05 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1186/1748-7188-8-25}
}
@article{Koren:2013ye,
  author        = {Koren, Sergey and Harhay, Gregory P and Smith, Timothy P L and Bono, James L and Harhay, Dayna M and McVey, Scott D and Radune, Diana and Bergman, Nicholas H and Phillippy, Adam M},
  title         = {Reducing assembly complexity of microbial genomes with single-molecule sequencing},
  journal       = {Genome Biology},
  volume        = {14},
  number        = {9},
  pages         = {R101},
  year          = {2013},
  doi           = {10.1186/gb-2013-14-9-r101},
  pmc           = {PMC4053942},
  pmid          = {24034426},
  pst           = {ppublish},
  journal-full  = {Genome biology},
  mesh          = {Algorithms; Base Sequence; Contig Mapping; Escherichia coli; Francisella tularensis; Genome Size; Genome, Archaeal; Genome, Bacterial; Genomic Library; Mannheimia haemolytica; Molecular Sequence Data; Salmonella enterica; Sequence Analysis, DNA; Software},
  abstract      = {BACKGROUND: The short reads output by first- and second-generation DNA sequencing instruments cannot completely reconstruct microbial chromosomes. Therefore, most genomes have been left unfinished due to the significant resources required to manually close gaps in draft assemblies. Third-generation, single-molecule sequencing addresses this problem by greatly increasing sequencing read length, which simplifies the assembly problem.
RESULTS: To measure the benefit of single-molecule sequencing on microbial genome assembly, we sequenced and assembled the genomes of six bacteria and analyzed the repeat complexity of 2,267 complete bacteria and archaea. Our results indicate that the majority of known bacterial and archaeal genomes can be assembled without gaps, at finished-grade quality, using a single PacBio RS sequencing library. These single-library assemblies are also more accurate than typical short-read assemblies and hybrid assemblies of short and long reads.
CONCLUSIONS: Automated assembly of long, single-molecule sequencing data reduces the cost of microbial finishing to $1,000 for most genomes, and future advances in this technology are expected to drive the cost lower. This is expected to increase the number of completed genomes, improve the quality of microbial genome databases, and enable high-fidelity, population-scale studies of pan-genomes and chromosomal organization.},
  date-added    = {2015-02-02 13:28:10 +0000},
  date-modified = {2015-02-02 13:28:10 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1186/gb-2013-14-9-r101}
}
@article{Ferrarini:2013vf,
  author        = {Ferrarini, Marco and Moretto, Marco and Ward, Judson A and {\v S}urbanovski, Nada and Stevanovi{\'c}, Vladimir and Giongo, Lara and Viola, Roberto and Cavalieri, Duccio and Velasco, Riccardo and Cestaro, Alessandro and Sargent, Daniel J},
  title         = {An evaluation of the {PacBio RS} platform for sequencing and de novo assembly of a chloroplast genome},
  journal       = {BMC Genomics},
  volume        = {14},
  pages         = {670},
  year          = {2013},
  doi           = {10.1186/1471-2164-14-670},
  pmc           = {PMC3853357},
  pmid          = {24083400},
  pst           = {epublish},
  journal-full  = {BMC genomics},
  mesh          = {Base Composition; Base Sequence; Databases, Genetic; Genome, Chloroplast; Potentilla; Sequence Analysis, DNA; Software},
  abstract      = {BACKGROUND: Second generation sequencing has permitted detailed sequence characterisation at the whole genome level of a growing number of non-model organisms, but the data produced have short read-lengths and biased genome coverage leading to fragmented genome assemblies. The PacBio RS long-read sequencing platform offers the promise of increased read length and unbiased genome coverage and thus the potential to produce genome sequence data of a finished quality containing fewer gaps and longer contigs. However, these advantages come at a much greater cost per nucleotide and with a perceived increase in error-rate. In this investigation, we evaluated the performance of the PacBio RS sequencing platform through the sequencing and de novo assembly of the Potentilla micrantha chloroplast genome.
RESULTS: Following error-correction, a total of 28,638 PacBio RS reads were recovered with a mean read length of 1,902 bp totalling 54,492,250 nucleotides and representing an average depth of coverage of 320× the chloroplast genome. The dataset covered the entire 154,959 bp of the chloroplast genome in a single contig (100% coverage) compared to seven contigs (90.59% coverage) recovered from an Illumina data, and revealed no bias in coverage of GC rich regions. Post-assembly the data were largely concordant with the Illumina data generated and allowed 187 ambiguities in the Illumina data to be resolved. The additional read length also permitted small differences in the two inverted repeat regions to be assigned unambiguously.
CONCLUSIONS: This is the first report to our knowledge of a chloroplast genome assembled de novo using PacBio sequence data. The PacBio RS data generated here were assembled into a single large contig spanning the P. micrantha chloroplast genome, with a higher degree of accuracy than an Illumina dataset generated at a much greater depth of coverage, due to longer read lengths and lower GC bias in the data. The results we present suggest PacBio data will be of immense utility for the development of genome sequence assemblies containing fewer unresolved gaps and ambiguities and a significantly smaller number of contigs than could be produced using short-read sequence data alone.},
  date-added    = {2015-02-02 13:26:39 +0000},
  date-modified = {2015-02-02 13:30:22 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1186/1471-2164-14-670}
}
@article{pedregosa2011scikit,
  author        = {Pedregosa, Fabian and Varoquaux, Ga{\"e}l and Gramfort, Alexandre and Michel, Vincent and Thirion, Bertrand and Grisel, Olivier and Blondel, Mathieu and Prettenhofer, Peter and Weiss, Ron and Dubourg, Vincent and others},
  title         = {Scikit-learn: Machine Learning in {Python}},
  journal       = {Journal of Machine Learning Research},
  volume        = {12},
  pages         = {2825--2830},
  year          = {2011},
  publisher     = {JMLR.org},
  date-added    = {2015-02-02 13:10:41 +0000},
  date-modified = {2015-02-02 13:10:41 +0000}
}
% Single-author paper. Only the volume is given: standard styles reject an
% inproceedings entry that sets both volume and number.
@inproceedings{macqueen1967some,
  author        = {MacQueen, James},
  title         = {Some methods for classification and analysis of multivariate observations},
  booktitle     = {Proceedings of the Fifth {Berkeley} Symposium on Mathematical Statistics and Probability},
  volume        = {1},
  pages         = {281--297},
  year          = {1967},
  publisher     = {University of California Press},
  address       = {Berkeley, CA},
  date-added    = {2015-02-02 13:08:18 +0000},
  date-modified = {2015-02-02 13:08:18 +0000}
}
@article{mcgill1978variations,
  author        = {McGill, Robert and Tukey, John W and Larsen, Wayne A},
  title         = {Variations of box plots},
  journal       = {The American Statistician},
  volume        = {32},
  number        = {1},
  pages         = {12--16},
  year          = {1978},
  publisher     = {Taylor \& Francis Group},
  date-added    = {2015-02-02 13:03:37 +0000},
  date-modified = {2015-02-02 13:03:37 +0000}
}
@article{Kozanitis:2011kl,
  author        = {Kozanitis, Christos and Saunders, Chris and Kruglyak, Semyon and Bafna, Vineet and Varghese, George},
  title         = {Compressing genomic sequence fragments using {SlimGene}},
  journal       = {Journal of Computational Biology},
  volume        = {18},
  number        = {3},
  pages         = {401--413},
  month         = mar,
  year          = {2011},
  doi           = {10.1089/cmb.2010.0253},
  pmc           = {PMC3123913},
  pmid          = {21385043},
  pst           = {ppublish},
  journal-full  = {Journal of computational biology : a journal of computational molecular cell biology},
  mesh          = {Algorithms; Data Compression; Genome, Human; Genomics; Humans; Polymorphism, Single Nucleotide; Sequence Analysis, DNA},
  abstract      = {With the advent of next generation sequencing technologies, the cost of sequencing whole genomes is poised to go below $1000 per human individual in a few years. As more and more genomes are sequenced, analysis methods are undergoing rapid development, making it tempting to store sequencing data for long periods of time so that the data can be re-analyzed with the latest techniques. The challenging open research problems, huge influx of data, and rapidly improving analysis techniques have created the need to store and transfer very large volumes of data. Compression can be achieved at many levels, including trace level (compressing image data), sequence level (compressing a genomic sequence), and fragment-level (compressing a set of short, redundant fragment reads, along with quality-values on the base-calls). We focus on fragment-level compression, which is the pressing need today. Our article makes two contributions, implemented in a tool, SlimGene. First, we introduce a set of domain specific loss-less compression schemes that achieve over 40× compression of fragments, outperforming bzip2 by over 6×. Including quality values, we show a 5× compression using less running time than bzip2. Second, given the discrepancy between the compression factor obtained with and without quality values, we initiate the study of using "lossy" quality values. Specifically, we show that a lossy quality value quantization results in 14× compression but has minimal impact on downstream applications like SNP calling that use the quality values. Discrepancies between SNP calls made between the lossy and loss-less versions of the data are limited to low coverage areas where even the SNP calls made by the loss-less version are marginal.},
  date-added    = {2015-02-02 12:39:19 +0000},
  date-modified = {2015-02-02 12:39:37 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1089/cmb.2010.0253}
}
@article{McKenna:2010bh,
  author        = {McKenna, Aaron and Hanna, Matthew and Banks, Eric and Sivachenko, Andrey and Cibulskis, Kristian and Kernytsky, Andrew and Garimella, Kiran and Altshuler, David and Gabriel, Stacey and Daly, Mark and DePristo, Mark A},
  title         = {The {Genome Analysis Toolkit}: a {MapReduce} framework for analyzing next-generation {DNA} sequencing data},
  journal       = {Genome Research},
  volume        = {20},
  number        = {9},
  pages         = {1297--1303},
  month         = sep,
  year          = {2010},
  doi           = {10.1101/gr.107524.110},
  pmc           = {PMC2928508},
  pmid          = {20644199},
  pst           = {ppublish},
  journal-full  = {Genome research},
  mesh          = {Base Sequence; Genome; Genomics; Sequence Analysis, DNA; Software},
  abstract      = {Next-generation DNA sequencing (NGS) projects, such as the 1000 Genomes Project, are already revolutionizing our understanding of genetic variation among individuals. However, the massive data sets generated by NGS--the 1000 Genome pilot alone includes nearly five terabases--make writing feature-rich, efficient, and robust analysis tools difficult for even computationally sophisticated individuals. Indeed, many professionals are limited in the scope and the ease with which they can answer scientific questions by the complexity of accessing and manipulating the data produced by these machines. Here, we discuss our Genome Analysis Toolkit (GATK), a structured programming framework designed to ease the development of efficient and robust analysis tools for next-generation DNA sequencers using the functional programming philosophy of MapReduce. The GATK provides a small but rich set of data access patterns that encompass the majority of analysis tool needs. Separating specific analysis calculations from common data management infrastructure enables us to optimize the GATK framework for correctness, stability, and CPU and memory efficiency and to enable distributed and shared memory parallelization. We highlight the capabilities of the GATK by describing the implementation and application of robust, scale-tolerant tools like coverage calculators and single nucleotide polymorphism (SNP) calling. We conclude that the GATK programming framework enables developers and analysts to quickly and easily write efficient and robust NGS tools, many of which have already been incorporated into large-scale sequencing projects like the 1000 Genomes Project and The Cancer Genome Atlas.},
  date-added    = {2015-02-02 12:29:22 +0000},
  date-modified = {2015-02-02 13:34:27 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1101/gr.107524.110}
}
@article{Pathak:2014zl,
  author        = {Pathak, Sudipta and Rajasekaran, Sanguthevar},
  title         = {{LFQC}: a lossless compression algorithm for {FASTQ} files},
  journal       = {Bioinformatics},
  month         = oct,
  year          = {2014},
  doi           = {10.1093/bioinformatics/btu701},
  pmid          = {25344499},
  pst           = {aheadofprint},
  journal-full  = {Bioinformatics (Oxford, England)},
  abstract      = {MOTIVATION: Next-generation sequencing (NGS) technologies have revolutionized genomic research by reducing the cost of whole-genome sequencing. One of the biggest challenges posed by modern sequencing technology is economic storage of NGS data. Storing raw data is infeasible because of its enormous size and high redundancy. In this article, we address the problem of storage and transmission of large Fastq files using innovative compression techniques.
RESULTS: We introduce a new lossless non-reference-based fastq compression algorithm named lossless FastQ compressor. We have compared our algorithm with other state of the art big data compression algorithms namely gzip, bzip2, fastqz, fqzcomp, G-SQZ, SCALCE, Quip, DSRC, DSRC-LZ etc. This comparison reveals that our algorithm achieves better compression ratios. The improvement obtained is up to 225%. For example, on one of the datasets (SRR065390_1), the average improvement (over all the algorithms compared) is 74.62%. Availability and implementation: The implementations are freely available for non-commercial purposes. They can be downloaded from http://engr.uconn.edu/∼rajasek/FastqPrograms.zip.
CONTACT: rajasek@engr.uconn.edu.},
  date-added    = {2015-02-02 11:52:40 +0000},
  date-modified = {2015-02-02 11:52:40 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1093/bioinformatics/btu701}
}
@article{Canovas:2014fr,
  author        = {C{\'a}novas, Rodrigo and Moffat, Alistair and Turpin, Andrew},
  title         = {Lossy compression of quality scores in genomic data},
  journal       = {Bioinformatics},
  volume        = {30},
  number        = {15},
  pages         = {2130--2136},
  month         = aug,
  year          = {2014},
  doi           = {10.1093/bioinformatics/btu183},
  pmid          = {24728856},
  pst           = {ppublish},
  journal-full  = {Bioinformatics (Oxford, England)},
  mesh          = {Algorithms; Base Sequence; Data Compression; Genome; Genomics; High-Throughput Nucleotide Sequencing; Polymorphism, Single Nucleotide; Quality Control},
  abstract      = {MOTIVATION: Next-generation sequencing technologies are revolutionizing medicine. Data from sequencing technologies are typically represented as a string of bases, an associated sequence of per-base quality scores and other metadata, and in aggregate can require a large amount of space. The quality scores show how accurate the bases are with respect to the sequencing process, that is, how confident the sequencer is of having called them correctly, and are the largest component in datasets in which they are retained. Previous research has examined how to store sequences of bases effectively; here we add to that knowledge by examining methods for compressing quality scores. The quality values originate in a continuous domain, and so if a fidelity criterion is introduced, it is possible to introduce flexibility in the way these values are represented, allowing lossy compression over the quality score data.
RESULTS: We present existing compression options for quality score data, and then introduce two new lossy techniques. Experiments measuring the trade-off between compression ratio and information loss are reported, including quantifying the effect of lossy representations on a downstream application that carries out single nucleotide polymorphism and insert/deletion detection. The new methods are demonstrably superior to other techniques when assessed against the spectrum of possible trade-offs between storage required and fidelity of representation.
AVAILABILITY AND IMPLEMENTATION: An implementation of the methods described here is available at https://github.com/rcanovas/libCSAM.
CONTACT: rcanovas@student.unimelb.edu.au
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  date-added    = {2015-02-02 11:50:05 +0000},
  date-modified = {2015-02-02 11:50:05 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1093/bioinformatics/btu183}
}
@article{Ochoa:2013rt,
  author        = {Ochoa, Idoia and Asnani, Himanshu and Bharadia, Dinesh and Chowdhury, Mainak and Weissman, Tsachy and Yona, Golan},
  title         = {{QualComp}: a new lossy compressor for quality scores based on rate distortion theory},
  journal       = {BMC Bioinformatics},
  volume        = {14},
  pages         = {187},
  year          = {2013},
  doi           = {10.1186/1471-2105-14-187},
  pmc           = {PMC3698011},
  pmid          = {23758828},
  pst           = {epublish},
  journal-full  = {BMC bioinformatics},
  mesh          = {Algorithms; Animals; Data Compression; Genome; Genomics; High-Throughput Nucleotide Sequencing; Mice; Polymorphism, Single Nucleotide; Software},
  abstract      = {BACKGROUND: Next Generation Sequencing technologies have revolutionized many fields in biology by reducing the time and cost required for sequencing. As a result, large amounts of sequencing data are being generated. A typical sequencing data file may occupy tens or even hundreds of gigabytes of disk space, prohibitively large for many users. This data consists of both the nucleotide sequences and per-base quality scores that indicate the level of confidence in the readout of these sequences. Quality scores account for about half of the required disk space in the commonly used FASTQ format (before compression), and therefore the compression of the quality scores can significantly reduce storage requirements and speed up analysis and transmission of sequencing data.
RESULTS: In this paper, we present a new scheme for the lossy compression of the quality scores, to address the problem of storage. Our framework allows the user to specify the rate (bits per quality score) prior to compression, independent of the data to be compressed. Our algorithm can work at any rate, unlike other lossy compression algorithms. We envisage our algorithm as being part of a more general compression scheme that works with the entire FASTQ file. Numerical experiments show that we can achieve a better mean squared error (MSE) for small rates (bits per quality score) than other lossy compression schemes. For the organism PhiX, whose assembled genome is known and assumed to be correct, we show that it is possible to achieve a significant reduction in size with little compromise in performance on downstream applications (e.g., alignment).
CONCLUSIONS: QualComp is an open source software package, written in C and freely available for download at https://sourceforge.net/projects/qualcomp.},
  date-added    = {2015-02-02 11:49:42 +0000},
  date-modified = {2015-02-02 13:35:38 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1186/1471-2105-14-187}
}
@article{Wan:2012kq,
  author        = {Wan, Raymond and Anh, Vo Ngoc and Asai, Kiyoshi},
  title         = {Transformations for the compression of {FASTQ} quality scores of next-generation sequencing data},
  journal       = {Bioinformatics},
  volume        = {28},
  number        = {5},
  pages         = {628--635},
  month         = mar,
  year          = {2012},
  doi           = {10.1093/bioinformatics/btr689},
  pmid          = {22171329},
  pst           = {ppublish},
  journal-full  = {Bioinformatics (Oxford, England)},
  mesh          = {Data Compression; Sequence Analysis, DNA; Software},
  abstract      = {MOTIVATION: The growth of next-generation sequencing means that more effective and efficient archiving methods are needed to store the generated data for public dissemination and in anticipation of more mature analytical methods later. This article examines methods for compressing the quality score component of the data to partly address this problem.
RESULTS: We compare several compression policies for quality scores, in terms of both compression effectiveness and overall efficiency. The policies employ lossy and lossless transformations with one of several coding schemes. Experiments show that both lossy and lossless transformations are useful, and that simple coding methods, which consume less computing resources, are highly competitive, especially when random access to reads is needed.
AVAILABILITY AND IMPLEMENTATION: Our C++ implementation, released under the Lesser General Public License, is available for download at http://www.cb.k.u-tokyo.ac.jp/asailab/members/rwan.
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  date-added    = {2015-02-02 11:44:29 +0000},
  date-modified = {2015-02-03 20:18:20 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1093/bioinformatics/btr689}
}
% Web page with a corporate author (double-braced so it is kept as one name).
% NOTE(review): urldate mirrors date-added; adjust if the page was consulted on another day.
@misc{webp,
  author        = {{Google}},
  title         = {Documentation for {WebP}},
  howpublished  = {https://developers.google.com/speed/webp/},
  url           = {https://developers.google.com/speed/webp/},
  urldate       = {2014-10-14},
  description   = {Documentation for {WebP}},
  date-added    = {2014-10-14 13:13:26 +0000},
  date-modified = {2014-10-14 13:21:51 +0000},
  bdsk-url-1    = {https://developers.google.com/speed/webp/}
}
@article{Giancarlo:2014rw,
  author        = {Giancarlo, Raffaele and Rombo, Simona E. and Utro, Filippo},
  title         = {Compressive biological sequence analysis and archival in the era of high-throughput sequencing technologies},
  journal       = {Briefings in Bioinformatics},
  volume        = {15},
  number        = {3},
  pages         = {390--406},
  month         = may,
  year          = {2014},
  doi           = {10.1093/bib/bbt088},
  pmid          = {24347576},
  pst           = {ppublish},
  journal-full  = {Briefings in bioinformatics},
  keywords      = {analysis of large biological sequence collections; compressive sequence analysis; data compression in bioinformatics; data compression of large sequence collections; storage and management of HTS data; succinct data structures for bioinformatics},
  abstract      = {High-throughput sequencing technologies produce large collections of data, mainly DNA sequences with additional information, requiring the design of efficient and effective methodologies for both their compression and storage. In this context, we first provide a classification of the main techniques that have been proposed, according to three specific research directions that have emerged from the literature and, for each, we provide an overview of the current techniques. Finally, to make this review useful to researchers and technicians applying the existing software and tools, we include a synopsis of the main characteristics of the described approaches, including details on their implementation and availability. Performance of the various methods is also highlighted, although the state of the art does not lend itself to a consistent and coherent comparison among all the methods presented here.},
  date-added    = {2014-10-09 15:45:39 +0000},
  date-modified = {2014-10-11 17:37:09 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1093/bib/bbt088}
}
% NOTE(review): given names for Zhu, Ji and He match their spelling in zhou2014compression;
% those for Zhang and Yang should be confirmed against the published article.
@article{Zhu:2013qr,
  author        = {Zhu, Zexuan and Zhang, Yongpeng and Ji, Zhen and He, Shan and Yang, Xiao},
  title         = {High-throughput {DNA} sequence data compression},
  journal       = {Briefings in Bioinformatics},
  month         = dec,
  year          = {2013},
  doi           = {10.1093/bib/bbt087},
  pmid          = {24300111},
  pst           = {aheadofprint},
  journal-full  = {Briefings in bioinformatics},
  keywords      = {compression; next-generation sequencing; reference-based compression; reference-free compression},
  abstract      = {The exponential growth of high-throughput DNA sequence data has posed great challenges to genomic data storage, retrieval and transmission. Compression is a critical tool to address these challenges, where many methods have been developed to reduce the storage size of the genomes and sequencing data (reads, quality scores and metadata). However, genomic data are being generated faster than they could be meaningfully analyzed, leaving a large scope for developing novel compression algorithms that could directly facilitate data analysis beyond data transfer and storage. In this article, we categorize and provide a comprehensive review of the existing compression methods specialized for genomic data and present experimental results on compression ratio, memory usage, time for compression and decompression. We further present the remaining challenges and potential directions for future research.},
  date-added    = {2014-10-09 15:45:03 +0000},
  date-modified = {2014-10-11 17:38:02 +0000},
  bdsk-url-1    = {http://dx.doi.org/10.1093/bib/bbt087}
}
% Corporate author, double-braced so it is kept as one name. The citation key is
% left unchanged so existing citations still resolve.
@electronic{wc3-png:qy,
  author        = {{W3C}},
  title         = {Portable Network Graphics ({PNG}) Specification (Second Edition)},
  url           = {http://www.w3.org/TR/PNG/},
  year          = {2003},
  date-added    = {2014-09-27 20:42:43 +0000},
  date-modified = {2014-09-27 20:43:54 +0000},
  bdsk-url-1    = {http://www.w3.org/TR/PNG/}
}
@article{Minoche:2011km,
  author = {Minoche, Andr{\'e} E and Dohm, Juliane C and Himmelbauer, Heinz},
  title = {Evaluation of genomic high-throughput sequencing data generated on {Illumina HiSeq} and {Genome Analyzer} systems},
  journal = {Genome Biology},
  journal-full = {Genome Biology},
  volume = {12},
  number = {11},
  pages = {R112},
  year = {2011},
  doi = {10.1186/gb-2011-12-11-r112},
  pmid = {22067484},
  pmc = {PMC3334598},
  pst = {epublish},
  mesh = {Arabidopsis; Artifacts; Automation, Laboratory; Bacteriophage phi X 174; Base Composition; Base Sequence; Beta vulgaris; Genomics; High-Throughput Nucleotide Sequencing; Molecular Sequence Data; Mutagenesis, Insertional; Polymorphism, Genetic; Reproducibility of Results; Sensitivity and Specificity; Sequence Analysis, DNA; Sequence Deletion},
  abstract = {BACKGROUND: The generation and analysis of high-throughput sequencing data are becoming a major component of many studies in molecular biology and medical research. Illumina's Genome Analyzer (GA) and HiSeq instruments are currently the most widely used sequencing devices. Here, we comprehensively evaluate properties of genomic HiSeq and GAIIx data derived from two plant genomes and one virus, with read lengths of 95 to 150 bases.
RESULTS: We provide quantifications and evidence for GC bias, error rates, error sequence context, effects of quality filtering, and the reliability of quality values. By combining different filtering criteria we reduced error rates 7-fold at the expense of discarding 12.5% of alignable bases. While overall error rates are low in HiSeq data we observed regions of accumulated wrong base calls. Only 3% of all error positions accounted for 24.7% of all substitution errors. Analyzing the forward and reverse strands separately revealed error rates of up to 18.7%. Insertions and deletions occurred at very low rates on average but increased to up to 2% in homopolymers. A positive correlation between read coverage and GC content was found depending on the GC content range.
CONCLUSIONS: The errors and biases we report have implications for the use and the interpretation of Illumina sequencing data. GAIIx and HiSeq data sets show slightly different error profiles. Quality filtering is essential to minimize downstream analysis artifacts. Supporting previous recommendations, the strand-specificity provides a criterion to distinguish sequencing errors from low abundance polymorphisms.},
  date-added = {2014-09-27 14:59:19 +0000},
  date-modified = {2014-09-27 15:00:13 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1186/gb-2011-12-11-r112},
}
@article{Carneiro:2012xw,
  author = {Carneiro, M. O. and Russ, C. and Ross, M. G. and Gabriel, S. B. and Nusbaum, C. and DePristo, M. A.},
  title = {{Pacific Biosciences} sequencing technology for genotyping and variation discovery in human data},
  journal = {BMC Genomics},
  journal-full = {BMC genomics},
  volume = {13},
  pages = {375},
  year = {2012},
  doi = {10.1186/1471-2164-13-375},
  pmid = {22863213},
  pmc = {PMC3443046},
  pst = {epublish},
  mesh = {Genetic Variation; Genome, Human; Genotype; Humans; Polymorphism, Single Nucleotide; Sequence Analysis, DNA; Software; User-Computer Interface},
  abstract = {BACKGROUND: Pacific Biosciences technology provides a fundamentally new data type that provides the potential to overcome some limitations of current next generation sequencing platforms by providing significantly longer reads, single molecule sequencing, low composition bias and an error profile that is orthogonal to other platforms. With these potential advantages in mind, we here evaluate the utility of the Pacific Biosciences RS platform for human medical amplicon resequencing projects.
RESULTS: We evaluated the Pacific Biosciences technology for SNP discovery in medical resequencing projects using the Genome Analysis Toolkit, observing high sensitivity and specificity for calling differences in amplicons containing known true or false SNPs. We assessed data quality: most errors were indels (~14%) with few apparent miscalls (~1%). In this work, we define a custom data processing pipeline for Pacific Biosciences data for human data analysis.
CONCLUSION: Critically, the error properties were largely free of the context-specific effects that affect other sequencing technologies. These data show excellent utility for follow-up validation and extension studies in human data and medical genetics projects, but can be extended to other organisms with a reference genome.},
  date-added = {2014-09-27 14:01:15 +0000},
  date-modified = {2014-09-27 14:02:23 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1186/1471-2164-13-375},
}
@inproceedings{DBLP:conf/icip/Queiroz02b,
  author = {de Queiroz, R. L.},
  title = {Improved transforms for the compression of color and multispectral images},
  booktitle = {Proceedings of the International Conference on Image Processing ({ICIP} 2002)},
  volume = {2},
  pages = {381--384},
  year = {2002},
  doi = {10.1109/ICIP.2002.1039967},
  url = {http://dx.doi.org/10.1109/ICIP.2002.1039967},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/icip/Queiroz02b},
  timestamp = {Fri, 26 Sep 2014 22:20:58 +0200},
  date-added = {2014-09-26 20:21:17 +0000},
  date-modified = {2014-09-26 20:21:43 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1109/ICIP.2002.1039967},
}
@electronic{png:yb,
  title = {{Portable Network Graphics (PNG) Specification and Extensions}},
  url = {http://www.libpng.org/pub/png/spec/},
  date-added = {2014-09-26 16:40:39 +0000},
  date-modified = {2014-09-26 16:41:38 +0000},
  bdsk-url-1 = {http://www.libpng.org/pub/png/spec/},
}
@book{png-book,
  author = {Roelofs, Greg},
  title = {{PNG} --- The Definitive Guide: Creating and Programming Portable Network Graphics},
  publisher = {O'Reilly},
  year = 1999,
  pages = {I--XIX, 1--321},
  isbn = {978-1-56592-542-7},
  keywords = {dblp},
  biburl = {http://www.bibsonomy.org/bibtex/2a9f1eaa558e79224b93774fcf9c8e2ac/dblp},
  interhash = {9dfe1dae8aabff346ce36dc20c96637f},
  intrahash = {a9f1eaa558e79224b93774fcf9c8e2ac},
  added-at = {2011-04-20T00:00:00.000+0200},
  timestamp = {2011-04-20T00:00:00.000+0200},
  date-added = {2014-09-26 16:32:16 +0000},
  date-modified = {2014-09-26 16:37:44 +0000},
}
@electronic{mng-vlc:jo,
  title = {{MNG-VLC (Multiple-image Network Graphics--Very Low Complexity) Format Version 1.0}},
  url = {http://www.libpng.org/pub/mng/spec/mng-vlc.html},
  date-added = {2014-09-26 16:16:38 +0000},
  date-modified = {2014-09-26 16:17:31 +0000},
  bdsk-url-1 = {http://www.libpng.org/pub/mng/spec/mng-vlc.html},
}
@electronic{mng-lc:bv,
  title = {{MNG-LC (Multiple-image Network Graphics--Low Complexity) Format Version 1.0}},
  url = {http://www.libpng.org/pub/mng/spec/mng-lc.html},
  date-added = {2014-09-26 16:13:16 +0000},
  date-modified = {2014-09-26 16:18:30 +0000},
  bdsk-url-1 = {http://www.libpng.org/pub/mng/spec/mng-lc.html},
}
@electronic{mng:hb,
  title = {{MNG (Multiple-image Network Graphics) Format Version 1.0}},
  url = {http://www.libpng.org/pub/mng/spec/},
  lastchecked = {26 September 2014},
  date-added = {2014-09-26 15:54:19 +0000},
  date-modified = {2014-09-26 16:15:22 +0000},
  bdsk-url-1 = {http://www.libpng.org/pub/mng/spec/},
}
@electronic{libpng:ph,
  title = {libpng},
  url = {http://www.libpng.org/pub/png/libpng.html},
  lastchecked = {26 September 2014},
  date-added = {2014-09-26 15:01:44 +0000},
  date-modified = {2014-09-26 15:50:51 +0000},
  bdsk-url-1 = {http://www.libpng.org/pub/png/libpng.html},
}
@article{Ziv77auniversal,
  author = {Ziv, Jacob and Lempel, Abraham},
  title = {A universal algorithm for sequential data compression},
  journal = {IEEE Transactions on Information Theory},
  volume = {23},
  number = {3},
  pages = {337--343},
  year = {1977},
  date-added = {2014-09-25 12:19:25 +0000},
  date-modified = {2014-09-25 12:20:50 +0000},
}
@webpage{zopfli:kx,
  author = {Alakuijala, Jyrki and Vandevenne, Lode},
  title = {Zopfli Compression Algorithm},
  url = {https://code.google.com/p/zopfli/},
  lastchecked = {24 September 2014},
  keywords = {PNG compression},
  robots = {NOARCHIVE},
  date-added = {2014-09-24 21:59:04 +0000},
  date-modified = {2014-09-24 22:02:10 +0000},
  bdsk-url-1 = {https://code.google.com/p/zopfli/},
}
@inproceedings{DBLP:conf/recomb/YuYB14,
  author = {Yu, Y William and Y{\"{o}}r{\"{u}}koglu, Deniz and Berger, Bonnie},
  title = {Traversing the {$k$}-mer Landscape of {NGS} Read Datasets for Quality Score Sparsification},
  booktitle = {Research in Computational Molecular Biology - 18th Annual International Conference, {RECOMB} 2014, Pittsburgh, PA, USA, April 2-5, 2014, Proceedings},
  pages = {385--399},
  year = {2014},
  doi = {10.1007/978-3-319-05269-4_31},
  crossref = {DBLP:conf/recomb/2014},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/recomb/YuYB14},
  timestamp = {Wed, 24 Sep 2014 16:04:29 +0200},
  date-added = {2014-09-24 14:04:37 +0000},
  date-modified = {2015-02-03 20:31:56 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-319-05269-4_31},
}
@proceedings{DBLP:conf/recomb/2014,
  editor = {Sharan, Roded},
  title = {Research in Computational Molecular Biology - 18th Annual International Conference, {RECOMB} 2014, Pittsburgh, PA, USA, April 2-5, 2014, Proceedings},
  series = {Lecture Notes in Computer Science},
  volume = {8394},
  publisher = {Springer},
  year = {2014},
  isbn = {978-3-319-05268-7},
  doi = {10.1007/978-3-319-05269-4},
  bibsource = {dblp computer science bibliography, http://dblp.org},
  biburl = {http://dblp.uni-trier.de/rec/bib/conf/recomb/2014},
  timestamp = {Wed, 24 Sep 2014 16:04:29 +0200},
  date-added = {2014-09-24 14:04:37 +0000},
  date-modified = {2014-09-24 14:17:12 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1007/978-3-319-05269-4},
}
@misc{beck2001agile,
  author = {Beck, Kent and Beedle, Mike and van Bennekum, Arie and Cockburn, Alistair and Cunningham, Ward and Fowler, Martin and Grenning, James and Highsmith, Jim and Hunt, Andrew and Jeffries, Ron and others},
  title = {Manifesto for Agile Software Development},
  booktitle = {Manifesto for Agile Software Development},
  year = 2001,
  url = {http://www.agilemanifesto.org/},
  description = {Manifesto for Agile Software Development},
  keywords = {imported},
  biburl = {http://www.bibsonomy.org/bibtex/28954248a545d88dd2c0e688d1c7e2f9d/juve},
  interhash = {098cc7e185f10c3da390459a01e0d535},
  intrahash = {8954248a545d88dd2c0e688d1c7e2f9d},
  added-at = {2007-12-30T11:39:05.000+0100},
  timestamp = {2007-12-30T11:39:05.000+0100},
  date-added = {2014-09-24 12:44:59 +0000},
  date-modified = {2014-09-24 12:46:27 +0000},
  bdsk-url-1 = {http://www.agilemanifesto.org/},
}
@article{Earl:2011fv,
  author = {Earl, D. and Bradnam, K. and St John, J. and Darling, A. and Lin, D. and Fass, J. and Yu, H. O. K. and Buffalo, V. and Zerbino, D. R. and Diekhans, M. and Nguyen, N. and Ariyaratne, P. N. and Sung, W.-K. and Ning, Z. and Haimel, M. and Simpson, J. T. and Fonseca, N. A. and Birol, {\.I}. and Docking, T. R. and Ho, I. Y. and Rokhsar, D. S. and Chikhi, R. and Lavenier, D. and Chapuis, G. and Naquin, D. and Maillet, N. and Schatz, M. C. and Kelley, D. R. and Phillippy, A. M. and Koren, S. and Yang, S.-P. and Wu, W. and Chou, W.-C. and Srivastava, A. and Shaw, T. I. and Ruby, J. G. and Skewes-Cox, P. and Betegon, M. and Dimon, M. T. and Solovyev, V. and Seledtsov, I. and Kosarev, P. and Vorobyev, D. and Ramirez-Gonzalez, R. and Leggett, R. and MacLean, D. and Xia, F. and Luo, R. and Li, Z. and Xie, Y. and Liu, B. and Gnerre, S. and MacCallum, I. and Przybylski, D. and Ribeiro, F. J. and Yin, S. and Sharpe, T. and Hall, G. and Kersey, P. J. and Durbin, R. and Jackman, S. D. and Chapman, J. A. and Huang, X. and DeRisi, J. L. and Caccamo, M. and Li, Y. and Jaffe, D. B. and Green, R. E. and Haussler, D. and Korf, I. and Paten, B.},
  title = {Assemblathon 1: a competitive assessment of de novo short read assembly methods},
  journal = {Genome Res},
  journal-full = {Genome research},
  volume = {21},
  number = {12},
  pages = {2224--2241},
  month = dec,
  year = {2011},
  doi = {10.1101/gr.126599.111},
  pmid = {21926179},
  pmc = {PMC3227110},
  pst = {ppublish},
  mesh = {Genome; Genomics; Sequence Analysis, DNA},
  abstract = {Low-cost short read sequencing technology has revolutionized genomics, though it is only just becoming practical for the high-quality de novo assembly of a novel large genome. We describe the Assemblathon 1 competition, which aimed to comprehensively assess the state of the art in de novo assembly methods when applied to current sequencing technologies. In a collaborative effort, teams were asked to assemble a simulated Illumina HiSeq data set of an unknown, simulated diploid genome. A total of 41 assemblies from 17 different groups were received. Novel haplotype aware assessments of coverage, contiguity, structure, base calling, and copy number were made. We establish that within this benchmark: (1) It is possible to assemble the genome to a high level of coverage and accuracy, and that (2) large differences exist between the assemblies, suggesting room for further improvements in current methods. The simulated benchmark, including the correct answer, the assemblies, and the code that was used to evaluate the assemblies is now public and freely available from http://www.assemblathon.org/.},
  date-added = {2014-09-23 20:38:14 +0000},
  date-modified = {2014-09-24 11:57:06 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1101/gr.126599.111},
}
@article{Schmieder:2011gd,
  author = {Schmieder, Robert and Edwards, Robert},
  title = {Quality control and preprocessing of metagenomic datasets},
  journal = {Bioinformatics},
  journal-full = {Bioinformatics (Oxford, England)},
  volume = {27},
  number = {6},
  pages = {863--864},
  month = mar,
  year = {2011},
  doi = {10.1093/bioinformatics/btr026},
  pmid = {21278185},
  pmc = {PMC3051327},
  pst = {ppublish},
  mesh = {Computer Graphics; Information Storage and Retrieval; Internet; Metagenomics; Programming Languages; Quality Control; Sequence Analysis, DNA; Software},
  abstract = {SUMMARY: Here, we present PRINSEQ for easy and rapid quality control and data preprocessing of genomic and metagenomic datasets. Summary statistics of FASTA (and QUAL) or FASTQ files are generated in tabular and graphical form and sequences can be filtered, reformatted and trimmed by a variety of options to improve downstream analysis.
AVAILABILITY AND IMPLEMENTATION: This open-source application was implemented in Perl and can be used as a stand alone version or accessed online through a user-friendly web interface. The source code, user help and additional information are available at http://prinseq.sourceforge.net/.},
  date-added = {2014-09-23 20:32:49 +0000},
  date-modified = {2014-09-23 20:57:32 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/btr026},
}
@article{EJ200,
  author = {Martin, Marcel},
  title = {Cutadapt removes adapter sequences from high-throughput sequencing reads},
  journal = {EMBnet.journal},
  volume = {17},
  number = {1},
  pages = {10--12},
  year = {2011},
  issn = {2226-6089},
  doi = {10.14806/ej.17.1.200},
  url = {http://journal.embnet.org/index.php/embnetjournal/article/view/200},
  keywords = {next generation sequencing; small RNA; microRNA; adapter removal},
  abstract = {When small RNA is sequenced on current sequencing machines, the resulting reads are usually longer than the RNA and therefore contain parts of the 3' adapter. That adapter must be found and removed error-tolerantly from each read before read mapping. Previous solutions are either hard to use or do not offer required features, in particular support for color space data. As an easy to use alternative, we developed the command-line tool cutadapt, which supports 454, Illumina and SOLiD (color space) data, offers two adapter trimming algorithms, and has other useful features. Cutadapt, including its MIT-licensed source code, is available for download at http://code.google.com/p/cutadapt/},
  date-added = {2014-09-23 20:30:35 +0000},
  date-modified = {2014-09-23 20:30:35 +0000},
  bdsk-url-1 = {http://journal.embnet.org/index.php/embnetjournal/article/view/200},
}
@electronic{citeulike:11583827,
  author = {Andrews, Simon},
  title = {{FastQC}: A Quality Control Tool for High Throughput Sequence Data},
  url = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
  keywords = {bioinformatics, ngs, qc},
  citeulike-article-id = {11583827},
  citeulike-linkout-0 = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
  posted-at = {2012-10-30 18:10:53},
  priority = {2},
  date-added = {2014-09-23 20:28:40 +0000},
  date-modified = {2014-09-23 20:28:40 +0000},
  bdsk-url-1 = {http://www.bioinformatics.babraham.ac.uk/projects/fastqc/},
}
@book{hastie_09_elements-of.statistical-learning,
  author = {Hastie, Trevor and Tibshirani, Robert and Friedman, Jerome},
  title = {The Elements of Statistical Learning: Data Mining, Inference, and Prediction},
  edition = 2,
  publisher = {Springer},
  year = 2009,
  url = {http://www-stat.stanford.edu/~tibs/ElemStatLearn/},
  keywords = {inference mathmatics dataanalysis method clutering statistics},
  file = {:Books\\HastieTibshiraniFriedman-09-Elements-of-Statistical-Learning-2nd-edition\\hastie_09_elements-of.statistical-learning.pdf:PDF},
  biburl = {http://www.bibsonomy.org/bibtex/200d858c0bd2826d4eb5f39450192d1f5/ukoethe},
  interhash = {52d1772f39be836e3b298d37b8c0cfa1},
  intrahash = {00d858c0bd2826d4eb5f39450192d1f5},
  added-at = {2010-06-03T15:15:09.000+0200},
  timestamp = {2010-06-03T15:15:09.000+0200},
  date-added = {2014-09-23 20:07:42 +0000},
  date-modified = {2014-09-23 20:10:46 +0000},
  bdsk-url-1 = {http://www-stat.stanford.edu/~tibs/ElemStatLearn/},
}
@misc{seq-squeeze,
  author = {{Pistoia Alliance}},
  title = {{The Pistoia Alliance Sequence Squeeze Competition}},
  year = 2013,
  note = {http://www.sequencesqueeze.org},
  urldate = {31 May 2013},
  date-added = {2013-05-31 17:12:30 +0000},
  date-modified = {2013-05-31 17:18:52 +0000},
}
@article{Nagarajan:2011fk,
  author = {Nagarajan, Niranjan and Kingsford, Carl},
  title = {{GiRaF}: robust, computational identification of influenza reassortments via graph mining},
  journal = {Nucleic Acids Research},
  journal-full = {Nucleic acids research},
  volume = {39},
  number = {6},
  pages = {e34},
  month = mar,
  year = {2011},
  doi = {10.1093/nar/gkq1232},
  pmid = {21177643},
  pmc = {PMC3064795},
  pst = {ppublish},
  mesh = {Algorithms; Computational Biology; Data Mining; Influenza A Virus, H1N1 Subtype; Influenza A Virus, H3N2 Subtype; Influenza A virus; Phylogeny; Reassortant Viruses},
  abstract = {Reassortments in the influenza virus--a process where strains exchange genetic segments--have been implicated in two out of three pandemics of the 20th century as well as the 2009 H1N1 outbreak. While advances in sequencing have led to an explosion in the number of whole-genome sequences that are available, an understanding of the rate and distribution of reassortments and their role in viral evolution is still lacking. An important factor in this is the paucity of automated tools for confident identification of reassortments from sequence data due to the challenges of analyzing large, uncertain viral phylogenies. We describe here a novel computational method, called GiRaF (Graph-incompatibility-based Reassortment Finder), that robustly identifies reassortments in a fully automated fashion while accounting for uncertainties in the inferred phylogenies. The algorithms behind GiRaF search large collections of Markov chain Monte Carlo (MCMC)-sampled trees for groups of incompatible splits using a fast biclique enumeration algorithm coupled with several statistical tests to identify sets of taxa with differential phylogenetic placement. GiRaF correctly finds known reassortments in human, avian, and swine influenza populations, including the evolutionary events that led to the recent 'swine flu' outbreak. GiRaF also identifies several previously unreported reassortments via whole-genome studies to catalog events in H5N1 and swine influenza isolates.},
  date-added = {2013-05-29 16:13:42 +0000},
  date-modified = {2013-05-29 16:14:08 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1093/nar/gkq1232},
}
@inproceedings{Traina00slim-trees:high,
  author = {Traina, C and Traina, A and Seeger, B and Faloutsos, C},
  title = {Slim-trees: High performance metric trees minimizing overlap between nodes},
  booktitle = {7th International Conference on Extending Database Technology (EDBT 2000)},
  pages = {51--65},
  publisher = {Springer-Verlag},
  year = {2000},
  date-added = {2013-05-28 16:42:02 +0000},
  date-modified = {2013-05-28 16:44:19 +0000},
}
@article{Roberts:2013ly,
  author = {Roberts, A and Pachter, L},
  title = {Streaming fragment assignment for real-time analysis of sequencing experiments},
  journal = {Nat Methods},
  journal-full = {Nature methods},
  volume = {10},
  number = {1},
  pages = {71--73},
  month = jan,
  year = {2013},
  doi = {10.1038/nmeth.2251},
  pmid = {23160280},
  pst = {ppublish},
  mesh = {Algorithms; Chromatin Immunoprecipitation; Computational Biology; Gene Expression Profiling; High-Throughput Nucleotide Sequencing; Humans; Oligonucleotide Array Sequence Analysis; RNA; Sequence Analysis, DNA; Sequence Analysis, RNA; Software},
  abstract = {We present eXpress, a software package for efficient probabilistic assignment of ambiguously mapping sequenced fragments. eXpress uses a streaming algorithm with linear run time and constant memory use. It can determine abundances of sequenced molecules in real time and can be applied to ChIP-seq, metagenomics and other large-scale sequencing data. We demonstrate its use on RNA-seq data and show that eXpress achieves greater efficiency than other quantification methods.},
  date-added = {2013-05-27 19:31:41 +0000},
  date-modified = {2013-05-27 19:32:00 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1038/nmeth.2251},
}
@article{21816040,
  author = {Li, B and Dewey, C},
  title = {{RSEM}: accurate transcript quantification from {RNA-Seq} data with or without a reference genome},
  journal = {BMC Bioinformatics},
  volume = {12},
  number = {1},
  pages = {323},
  year = {2011},
  issn = {1471-2105},
  doi = {10.1186/1471-2105-12-323},
  url = {http://www.biomedcentral.com/1471-2105/12/323},
  pmid = {21816040},
  abstract = {BACKGROUND: RNA-Seq is revolutionizing the way transcript abundances are measured. A key challenge in transcript quantification from RNA-Seq data is the handling of reads that map to multiple genes or isoforms. This issue is particularly important for quantification with de novo transcriptome assemblies in the absence of sequenced genomes, as it is difficult to determine which transcripts are isoforms of the same gene. A second significant issue is the design of RNA-Seq experiments, in terms of the number of reads, read length, and whether reads come from one or both ends of cDNA fragments.
RESULTS: We present RSEM, an user-friendly software package for quantifying gene and isoform abundances from single-end or paired-end RNA-Seq data. RSEM outputs abundance estimates, 95% credibility intervals, and visualization files and can also simulate RNA-Seq data. In contrast to other existing tools, the software does not require a reference genome. Thus, in combination with a de novo transcriptome assembler, RSEM enables accurate transcript quantification for species without sequenced genomes. On simulated and real data sets, RSEM has superior or comparable performance to quantification methods that rely on a reference genome. Taking advantage of RSEM's ability to effectively use ambiguously-mapping reads, we show that accurate gene-level abundance estimates are best obtained with large numbers of short single-end reads. On the other hand, estimates of the relative frequencies of isoforms within single genes may be improved through the use of paired-end reads, depending on the number of possible splice forms for each gene.
CONCLUSIONS: RSEM is an accurate and user-friendly software tool for quantifying transcript abundances from RNA-Seq data. As it does not rely on the existence of a reference genome, it is particularly useful for quantification with de novo transcriptome assemblies. In addition, RSEM has enabled valuable guidance for cost-efficient design of quantification experiments with RNA-Seq, which is currently relatively expensive.},
  date-added = {2013-05-27 19:26:28 +0000},
  date-modified = {2013-05-27 19:26:56 +0000},
  bdsk-url-1 = {http://www.biomedcentral.com/1471-2105/12/323},
  bdsk-url-2 = {http://dx.doi.org/10.1186/1471-2105-12-323},
}
@article{NCBI-Resource-Coordinators:2013zr,
  author = {{NCBI Resource Coordinators}},
  title = {Database resources of the {National Center for Biotechnology Information}},
  journal = {Nucleic Acids Research},
  journal-full = {Nucleic acids research},
  volume = {41},
  number = {Database issue},
  pages = {D8--D20},
  month = jan,
  year = {2013},
  doi = {10.1093/nar/gks1189},
  pmid = {23193264},
  pmc = {PMC3531099},
  pst = {ppublish},
  mesh = {Animals; Databases, Chemical; Databases, Genetic; Databases, Nucleic Acid; Databases, Protein; Disease; Gene Expression; Genetic Testing; Genomics; Humans; Internet; National Library of Medicine (U.S.); Protein Structure, Tertiary; PubMed; Registries; Sequence Alignment; United States},
  abstract = {In addition to maintaining the GenBank{\textregistered} nucleic acid sequence database, the National Center for Biotechnology Information (NCBI, http://www.ncbi.nlm.nih.gov) provides analysis and retrieval resources for the data in GenBank and other biological data made available through the NCBI web site. NCBI resources include Entrez, the Entrez Programming Utilities, MyNCBI, PubMed, PubMed Central, Gene, the NCBI Taxonomy Browser, BLAST, BLAST Link (BLink), Primer-BLAST, COBALT, Splign, RefSeq, UniGene, HomoloGene, ProtEST, dbMHC, dbSNP, dbVar, Epigenomics, the Genetic Testing Registry, Genome and related tools, the Map Viewer, Model Maker, Evidence Viewer, Trace Archive, Sequence Read Archive, BioProject, BioSample, Retroviral Genotyping Tools, HIV-1/Human Protein Interaction Database, Gene Expression Omnibus, Probe, Online Mendelian Inheritance in Animals, the Molecular Modeling Database, the Conserved Domain Database, the Conserved Domain Architecture Retrieval Tool, Biosystems, Protein Clusters and the PubChem suite of small molecule databases. Augmenting many of the web applications are custom implementations of the BLAST program optimized to search specialized data sets. All of these resources can be accessed through the NCBI home page.},
  date-added = {2013-05-27 18:03:26 +0000},
  date-modified = {2013-05-27 18:03:53 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1093/nar/gks1189},
}
@article{Benson01012013,
  author = {Benson, D. A. and Cavanaugh, M. and Clark, K. and Karsch-Mizrachi, I. and Lipman, D. J. and Ostell, J. and Sayers, E. W.},
  title = {{GenBank}},
  journal = {Nucleic Acids Research},
  volume = {41},
  number = {D1},
  pages = {D36--D42},
  year = {2013},
  doi = {10.1093/nar/gks1195},
  url = {http://nar.oxfordjournals.org/content/41/D1/D36.abstract},
  eprint = {http://nar.oxfordjournals.org/content/41/D1/D36.full.pdf+html},
  abstract = {GenBank{\textregistered} (http://www.ncbi.nlm.nih.gov) is a comprehensive database that contains publicly available nucleotide sequences for almost 260 000 formally described species. These sequences are obtained primarily through submissions from individual laboratories and batch submissions from large-scale sequencing projects, including whole-genome shotgun (WGS) and environmental sampling projects. Most submissions are made using the web-based BankIt or standalone Sequin programs, and GenBank staff assigns accession numbers upon data receipt. Daily data exchange with the European Nucleotide Archive (ENA) and the DNA Data Bank of Japan (DDBJ) ensures worldwide coverage. GenBank is accessible through the NCBI Entrez retrieval system, which integrates data from the major DNA and protein sequence databases along with taxonomy, genome, mapping, protein structure and domain information, and the biomedical journal literature via PubMed. BLAST provides sequence similarity searches of GenBank and other sequence databases. Complete bimonthly releases and daily updates of the GenBank database are available by FTP. To access GenBank and its related retrieval and analysis services, begin at the NCBI home page: www.ncbi.nlm.nih.gov.},
  date-added = {2013-05-27 17:56:11 +0000},
  date-modified = {2013-05-27 17:57:40 +0000},
  bdsk-url-1 = {http://nar.oxfordjournals.org/content/41/D1/D36.abstract},
  bdsk-url-2 = {http://dx.doi.org/10.1093/nar/gks1195},
}
@article{Hach:2012ys,
  author = {Hach, Faraz and Numanagi{\'c}, Ibrahim and Alkan, Can and Sahinalp, S Cenk},
  title = {{SCALCE}: boosting sequence compression algorithms using locally consistent encoding},
  journal = {Bioinformatics},
  journal-full = {Bioinformatics (Oxford, England)},
  volume = {28},
  number = {23},
  pages = {3051--3057},
  month = dec,
  year = {2012},
  doi = {10.1093/bioinformatics/bts593},
  pmid = {23047557},
  pmc = {PMC3509486},
  pst = {ppublish},
  abstract = {MOTIVATION: The high throughput sequencing (HTS) platforms generate unprecedented amounts of data that introduce challenges for the computational infrastructure. Data management, storage and analysis have become major logistical obstacles for those adopting the new platforms. The requirement for large investment for this purpose almost signalled the end of the Sequence Read Archive hosted at the National Center for Biotechnology Information (NCBI), which holds most of the sequence data generated world wide. Currently, most HTS data are compressed through general purpose algorithms such as gzip. These algorithms are not designed for compressing data generated by the HTS platforms; for example, they do not take advantage of the specific nature of genomic sequence data, that is, limited alphabet size and high similarity among reads. Fast and efficient compression algorithms designed specifically for HTS data should be able to address some of the issues in data management, storage and communication. Such algorithms would also help with analysis provided they offer additional capabilities such as random access to any read and indexing for efficient sequence similarity search. Here we present SCALCE, a 'boosting' scheme based on Locally Consistent Parsing technique, which reorganizes the reads in a way that results in a higher compression speed and compression rate, independent of the compression algorithm in use and without using a reference genome.
RESULTS: Our tests indicate that SCALCE can improve the compression rate achieved through gzip by a factor of 4.19-when the goal is to compress the reads alone. In fact, on SCALCE reordered reads, gzip running time can improve by a factor of 15.06 on a standard PC with a single core and 6 GB memory. Interestingly even the running time of SCALCE + gzip improves that of gzip alone by a factor of 2.09. When compared with the recently published BEETL, which aims to sort the (inverted) reads in lexicographic order for improving bzip2, SCALCE + gzip provides up to 2.01 times better compression while improving the running time by a factor of 5.17. SCALCE also provides the option to compress the quality scores as well as the read names, in addition to the reads themselves. This is achieved by compressing the quality scores through order-3 Arithmetic Coding (AC) and the read names through gzip through the reordering SCALCE provides on the reads. This way, in comparison with gzip compression of the unordered FASTQ files (including reads, read names and quality scores), SCALCE (together with gzip and arithmetic encoding) can provide up to 3.34 improvement in the compression rate and 1.26 improvement in running time.
AVAILABILITY: Our algorithm, SCALCE (Sequence Compression Algorithm using Locally Consistent Encoding), is implemented in C++ with both gzip and bzip2 compression options. It also supports multithreading when gzip option is selected, and the pigz binary is available. It is available at http://scalce.sourceforge.net.
CONTACT: fhach@cs.sfu.ca or cenk@cs.sfu.ca
SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},
  date-added = {2013-05-27 17:39:21 +0000},
  date-modified = {2013-05-27 17:39:57 +0000},
  bdsk-url-1 = {http://dx.doi.org/10.1093/bioinformatics/bts593},
}
@article{Ayres:2012vn,
Abstract = {Phylogenetic inference is fundamental to our understanding of most aspects of the origin and evolution of life, and in recent years, there has been a concentration of interest in statistical approaches such as Bayesian inference and maximum likelihood estimation. Yet, for large data sets and realistic or interesting models of evolution, these approaches remain computationally demanding. High-throughput sequencing can yield data for thousands of taxa, but scaling to such problems using serial computing often necessitates the use of nonstatistical or approximate approaches. The recent emergence of graphics processing units (GPUs) provides an opportunity to leverage their excellent floating-point computational performance to accelerate statistical phylogenetic inference. A specialized library for phylogenetic calculation would allow existing software packages to make more effective use of available computer hardware, including GPUs. Adoption of a common library would also make it easier for other emerging computing architectures, such as field programmable gate arrays, to be used in the future. We present BEAGLE, an application programming interface (API) and library for high-performance statistical phylogenetic inference. The API provides a uniform interface for performing phylogenetic likelihood calculations on a variety of compute hardware platforms. The library includes a set of efficient implementations and can currently exploit hardware including GPUs using NVIDIA CUDA, central processing units (CPUs) with Streaming SIMD Extensions and related processor supplementary instruction sets, and multicore CPUs via OpenMP. To demonstrate the advantages of a common API, we have incorporated the library into several popular phylogenetic software packages. The BEAGLE library is free open source software licensed under the Lesser GPL and available from http://beagle-lib.googlecode.com. An example client program is available as public domain software.},
Author = {Ayres, DL and Darling, A and Zwickl, DJ and Beerli, P and Holder, MT and Lewis, PO and Huelsenbeck, JP and Ronquist, F and Swofford, DL and Cummings, MP and Rambaut, A and Suchard, MA},
Date-Added = {2013-05-27 15:26:59 +0000},
Date-Modified = {2013-05-27 15:28:18 +0000},
Doi = {10.1093/sysbio/syr100},
Journal = {Syst Biol},
Journal-Full = {Systematic biology},
Mesh = {Algorithms; Computational Biology; Computing Methodologies; Evolution, Molecular; Genome; Phylogeny; Software},
Month = jan,
Number = {1},
Pages = {170--173},
Pmc = {PMC3243739},
Pmid = {21963610},
Pst = {ppublish},
Title = {{BEAGLE}: an application programming interface and high-performance computing library for statistical phylogenetics},
Volume = {61},
Year = {2012},
Bdsk-Url-1 = {http://dx.doi.org/10.1093/sysbio/syr100}}
@article{Breiman:2001kx,
Author = {Breiman, L},
Journal = {Machine Learning},
Number = {1},
Pages = {5--32},
Publisher = {Kluwer Academic Publishers},
Title = {Random Forests},
Volume = {45},
Year = {2001}}
@article{Hansen:2010uq,
Abstract = {Generation of cDNA using random hexamer priming induces biases in the nucleotide composition at the beginning of transcriptome sequencing reads from the Illumina Genome Analyzer. The bias is independent of organism and laboratory and impacts the uniformity of the reads along the transcriptome. We provide a read count reweighting scheme, based on the nucleotide frequencies of the reads, that mitigates the impact of the bias.},
Author = {Hansen, KD and Brenner, SE and Dudoit, S},
Date-Added = {2013-05-25 16:13:34 +0000},
Date-Modified = {2013-05-25 16:14:03 +0000},
Doi = {10.1093/nar/gkq224},
Journal = {Nucleic Acids Res},
Journal-Full = {Nucleic acids research},
Mesh = {DNA Primers; Gene Expression Profiling; Nucleotides; Sequence Analysis, DNA},
Month = jul,
Number = {12},
Pages = {e131},
Pmc = {PMC2896536},
Pmid = {20395217},
Pst = {ppublish},
Title = {Biases in {Illumina} transcriptome sequencing caused by random hexamer priming},
Volume = {38},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gkq224}}
@article{Gibbs:2004fk,
Abstract = {The laboratory rat (Rattus norvegicus) is an indispensable tool in experimental medicine and drug development, having made inestimable contributions to human health. We report here the genome sequence of the Brown Norway (BN) rat strain. The sequence represents a high-quality 'draft' covering over 90\% of the genome. The BN rat sequence is the third complete mammalian genome to be deciphered, and three-way comparisons with the human and mouse genomes resolve details of mammalian evolution. This first comprehensive analysis includes genes and proteins and their relation to human disease, repeated sequences, comparative genome-wide studies of mammalian orthologous chromosomal regions and rearrangement breakpoints, reconstruction of ancestral karyotypes and the events leading to existing species, rates of variation, and lineage-specific and lineage-independent evolutionary events such as expansion of gene families, orthology relations and protein evolution.},
Author = {Gibbs, RA and Weinstock, GM and Metzker, ML and Muzny, DM and Sodergren, EJ and Scherer, S and Scott, G and Steffen, D and Worley, KC and Burch, PE and Okwuonu, G and Hines, S and Lewis, L and DeRamo, C and Delgado, O and Dugan-Rocha, S and Miner, G and Morgan, M and Hawes, A and Gill, R and Celera and Holt, RA and Adams, MD and Amanatides, PG and Baden-Tillson, H and Barnstead, M and Chin, S and Evans, CA and Ferriera, S and Fosler, C and Glodek, A and Gu, Z and Jennings, D and Kraft, CL and Nguyen, T and Pfannkoch, CM and Sitter, C and Sutton, GG and Venter, J and Woodage, T and Smith, D and Lee, H-M and Gustafson, E and Cahill, P and Kana, A and Doucette-Stamm, L and Weinstock, K and Fechtel, K and Weiss, RB and Dunn, DM and Green, ED and Blakesley, RW and Bouffard, GG and De Jong, PJ and Osoegawa, K and Zhu, B and Marra, M and Schein, J and Bosdet, I and Fjell, C and Jones, S and Krzywinski, M and Mathewson, C and Siddiqui, A and Wye, N and McPherson, J and Zhao, S and Fraser, CM and Shetty, J and Shatsman, S and Geer, K and Chen, Y and Abramzon, S and Nierman, WC and Havlak, PH and Chen, R and Durbin, KJ and Egan, A and Ren, Y and Song, X-Z and Li, B and Liu, Y and Qin, X and Cawley, S and Worley, KC and Cooney, A J and D'Souza, LM and Martin, K and Wu, JQ and Gonzalez-Garay, ML and Jackson, AR and Kalafus, KJ and McLeod, MP and Milosavljevic, A and Virk, D and Volkov, A and Wheeler, DA and Zhang, Z and Bailey, JA and Eichler, EE and Tuzun, E and Birney, E and Mongin, E and Ureta-Vidal, A and Woodwark, C and Zdobnov, E and Bork, P and Suyama, M and Torrents, D and Alexandersson, M and Trask, BJ and Young, JM and Huang, H and Wang, H and Xing, H and Daniels, S and Gietzen, D and Schmidt, J and Stevens, K and Vitt, U and Wingrove, J and Camara, F and Mar Alb{\`a}, M and Abril, JF and Guigo, R and Smit, A and Dubchak, I and Rubin, EM and Couronne, O and Poliakov, A and H{\"u}bner, N and Ganten, D and Goesele, C and Hummel, O and Kreitler, T and Lee, Y-A
and Monti, J and Schulz, H and Zimdahl, H and Himmelbauer, H and Lehrach, H and Jacob, HJ and Bromberg, S and Gullings-Handley, J and Jensen-Seaman, MI and Kwitek, AE and Lazar, Jf and Pasko, D and Tonellato, PJ and Twigger, S and Ponting, CP and Duarte, JM and Rice, S and Goodstadt, L and Beatson, SA and Emes, RD and Winter, EE and Webber, C and Brandt, P and Nyakatura, G and Adetobi, M and Chiaromonte, F and Elnitski, L and Eswara, P and Hardison, RC and Hou, M and Kolbe, D and Makova, K and Miller, W and Nekrutenko, A and Riemer, C and Schwartz, S and Taylor, J and Yang, S and Zhang, Y and Lindpaintner, K and Andrews, TD and Caccamo, M and Clamp, M and Clarke, L and Curwen, V and Durbin, R and Eyras, E and Searle, SM and Cooper, GM and Batzoglou, S and Brudno, M and Sidow, A and Stone, EA and Venter, JC and Payseur, BA and Bourque, G and L{\'o}pez-Ot{\'i}n, C and Puente, XS and Chakrabarti, K and Chatterji, S and Dewey, C and Pachter, L and Bray, N and Yap, VB and Caspi, A and Tesler, G and Pevzner, PA and Haussler, D and Roskin, KM and Baertsch, R and Clawson, H and Furey, TS and Hinrichs, AS and Karolchik, D and Kent, WJ and Rosenbloom, KR and Trumbower, H and Weirauch, M and Cooper, DN and Stenson, PD and Ma, B and Brent, M and Arumugam, M and Shteynberg, D and Copley, RR and Taylor, MS and Riethman, H and Mudunuri, U and Peterson, J and Guyer, M and Felsenfeld, A and Old, S and Mockrin, S and Collins, F and {Rat Genome Sequencing Project Consortium}},
Date-Added = {2013-05-25 14:55:41 +0000},
Date-Modified = {2013-05-25 15:24:15 +0000},
Doi = {10.1038/nature02426},
Journal = {Nature},
Journal-Full = {Nature},
Mesh = {Animals; Base Composition; Centromere; Chromosomes, Mammalian; CpG Islands; DNA Transposable Elements; DNA, Mitochondrial; Evolution, Molecular; Gene Duplication; Genome; Genomics; Humans; Introns; Male; Mice; Models, Molecular; Mutagenesis; Polymorphism, Single Nucleotide; RNA Splice Sites; RNA, Untranslated; Rats; Rats, Inbred BN; Regulatory Sequences, Nucleic Acid; Retroelements; Sequence Analysis, DNA; Telomere},
Month = apr,
Number = {6982},
Pages = {493--521},
Pmid = {15057822},
Pst = {ppublish},
Title = {Genome sequence of the {Brown Norway} rat yields insights into mammalian evolution},
Volume = {428},
Year = {2004},
Bdsk-Url-1 = {http://dx.doi.org/10.1038/nature02426}}
@article{Havlak01042004,
Abstract = {Atlas is a suite of programs developed for assembly of genomes by a ``combined approach'' that uses DNA sequence reads from both BACs and whole-genome shotgun (WGS) libraries. The BAC clones afford advantages of localized assembly with reduced computational load, and provide a robust method for dealing with repeated sequences. Inclusion of WGS sequences facilitates use of different clone insert sizes and reduces data production costs. A core function of Atlas software is recruitment of WGS sequences into appropriate BACs based on sequence overlaps. Because construction of consensus sequences is from local assembly of these reads, only small (<0.1\%) units of the genome are assembled at a time. Once assembled, each BAC is used to derive a genomic layout. This ``sequence-based'' growth of the genome map has greater precision than with non-sequence-based methods. Use of BACs allows correction of artifacts due to repeats at each stage of the process. This is aided by ancillary data such as BAC fingerprint, other genomic maps, and syntenic relations with other genomes. Atlas was used to assemble a draft DNA sequence of the rat genome; its major components including overlapper and split-scaffold are also being used in pure WGS projects.},
Author = {Havlak, P and Chen, R and Durbin, KJ and Egan, A and Ren, Y and Song, X-Z and Weinstock, GM and Gibbs, RA},
Date-Added = {2013-05-25 14:49:11 +0000},
Date-Modified = {2013-05-25 14:53:24 +0000},
Doi = {10.1101/gr.2264004},
Eprint = {http://genome.cshlp.org/content/14/4/721.full.pdf+html},
Journal = {Genome Research},
Number = {4},
Pages = {721--732},
Title = {The {Atlas} Genome Assembly System},
Url = {http://genome.cshlp.org/content/14/4/721.abstract},
Volume = {14},
Year = {2004},
Bdsk-Url-1 = {http://genome.cshlp.org/content/14/4/721.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1101/gr.2264004}}
@article{VidalRuiz1986145,
Author = {Vidal Ruiz, E},
Date-Added = {2013-05-25 14:02:54 +0000},
Date-Modified = {2013-05-25 14:04:13 +0000},
Doi = {10.1016/0167-8655(86)90013-9},
Issn = {0167-8655},
Journal = {Pattern Recognition Letters},
Keywords = {pattern recognition},
Number = {3},
Pages = {145--157},
Title = {An algorithm for finding nearest neighbours in (approximately) constant average time},
Url = {http://www.sciencedirect.com/science/article/pii/0167865586900139},
Volume = {4},
Year = {1986},
Bdsk-Url-1 = {http://www.sciencedirect.com/science/article/pii/0167865586900139},
Bdsk-Url-2 = {http://dx.doi.org/10.1016/0167-8655(86)90013-9}}
@inproceedings{DBLP:conf/spire/NavarroPC02,
Abstract = {A t-spanner, a subgraph that approximates graph distances within a precision factor t, is a well known concept in graph theory. In this paper we use it in a novel way, namely as a data structure for searching metric spaces. The key idea is to consider the t-spanner as an approximation of the complete graph of distances among the objects, and use it as a compact device to simulate the large matrix of distances required by successful search algorithms like AESA [Vidal 1986]. The t-spanner provides a time-space tradeoff where full AESA is just one extreme. We show that the resulting algorithm is competitive against current approaches, e.g., 1.5 times the time cost of AESA using only 3.21\% of its space requirement, in a metric space of strings; and 1.09 times the time cost of AESA using only 3.83 \% of its space requirement, in a metric space of documents. We also show that t-spanners provide better space-time tradeoffs than classical alternatives such as pivot-based indexes. Furthermore, we show that the concept of t-spanners has potential for large improvements.},
Author = {Navarro, G and Paredes, R and Ch{\'a}vez, E},
Bibsource = {DBLP, http://dblp.uni-trier.de},
Booktitle = {SPIRE},
Crossref = {DBLP:conf/spire/2002},
Date-Added = {2013-05-25 13:49:35 +0000},
Date-Modified = {2013-05-25 14:20:23 +0000},
Doi = {10.1007/3-540-45735-6_26},
Pages = {298--309},
Title = {{$t$}-{Spanners} as a Data Structure for Metric Space Searching},
Year = {2002},
Bdsk-Url-1 = {http://dx.doi.org/10.1007/3-540-45735-6_26}}
@proceedings{DBLP:conf/spire/2002,
  Editor        = {Laender, AHF and Oliveira, AL},
  Title         = {String Processing and Information Retrieval, 9th International Symposium, SPIRE 2002, Lisbon, Portugal, September 11-13, 2002, Proceedings},
  Booktitle     = {SPIRE},
  Series        = {Lecture Notes in Computer Science},
  Volume        = {2476},
  Publisher     = {Springer},
  Year          = {2002},
  Isbn          = {3-540-44158-1},
  Bibsource     = {DBLP, http://dblp.uni-trier.de},
  Date-Added    = {2013-05-25 13:49:35 +0000},
  Date-Modified = {2013-05-25 13:53:10 +0000}}
@article{Tatusov24101997,
Abstract = {In order to extract the maximum amount of information from the rapidly accumulating genome sequences, all conserved genes need to be classified according to their homologous relationships. Comparison of proteins encoded in seven complete genomes from five major phylogenetic lineages and elucidation of consistent patterns of sequence similarities allowed the delineation of 720 clusters of orthologous groups (COGs). Each COG consists of individual orthologous proteins or orthologous sets of paralogs from at least three lineages. Orthologs typically have the same function, allowing transfer of functional information from one member to an entire COG. This relation automatically yields a number of functional predictions for poorly characterized genomes. The COGs comprise a framework for functional and evolutionary genome analysis.},
Author = {Tatusov, RL and Koonin, EV and Lipman, DJ},
Date-Added = {2013-05-24 14:18:02 +0000},
Date-Modified = {2013-05-25 13:54:44 +0000},
Doi = {10.1126/science.278.5338.631},
Eprint = {http://www.sciencemag.org/content/278/5338/631.full.pdf},
Journal = {Science},
Number = {5338},
Pages = {631--637},
Title = {A Genomic Perspective on Protein Families},
Url = {http://www.sciencemag.org/content/278/5338/631.abstract},
Volume = {278},
Year = {1997},
Bdsk-Url-1 = {http://www.sciencemag.org/content/278/5338/631.abstract},
Bdsk-Url-2 = {http://dx.doi.org/10.1126/science.278.5338.631}}
@article{Ostlund:2010ys,
Abstract = {The InParanoid project gathers proteomes of completely sequenced eukaryotic species plus Escherichia coli and calculates pairwise ortholog relationships among them. The new release 7.0 of the database has grown by an order of magnitude over the previous version and now includes 100 species and their collective 1.3 million proteins organized into 42.7 million pairwise ortholog groups. The InParanoid algorithm itself has been revised and is now both more specific and sensitive. Based on results from our recent benchmarking of low-complexity filters in homology assignment, a two-pass BLAST approach was developed that makes use of high-precision compositional score matrix adjustment, but avoids the alignment truncation that sometimes follows. We have also updated the InParanoid web site (http://InParanoid.sbc.su.se). Several features have been added, the response times have been improved and the site now sports a new, clearer look. As the number of ortholog databases has grown, it has become difficult to compare among these resources due to a lack of standardized source data and incompatible representations of ortholog relationships. To facilitate data exchange and comparisons among ortholog databases, we have developed and are making available two XML schemas: SeqXML for the input sequences and OrthoXML for the output ortholog clusters.},
Author = {{\"O}stlund, G and Schmitt, T and Forslund, K and K{\"o}stler, T and Messina, DN and Roopra, S and Frings, O and Sonnhammer, ELL},
Date-Added = {2013-05-24 14:13:19 +0000},
Date-Modified = {2013-05-25 13:54:25 +0000},
Doi = {10.1093/nar/gkp931},
Journal = {Nucleic Acids Res},
Journal-Full = {Nucleic acids research},
Mesh = {Algorithms; Animals; Cluster Analysis; Computational Biology; Databases, Genetic; Databases, Nucleic Acid; Escherichia coli; Eukaryotic Cells; Genome, Bacterial; Humans; Information Storage and Retrieval; Internet; Protein Structure, Tertiary; Proteins; Proteomics; Software},
Month = jan,
Number = {Database issue},
Pages = {D196--D203},
Pmc = {PMC2808972},
Pmid = {19892828},
Pst = {ppublish},
Title = {{InParanoid} 7: new algorithms and tools for eukaryotic orthology analysis},
Volume = {38},
Year = {2010},
Bdsk-Url-1 = {http://dx.doi.org/10.1093/nar/gkp931}}
@article{DBLP:journals/tit/ZivL78,
Author = {Ziv, Jacob and Lempel, Abraham},
Bibsource = {DBLP, http://dblp.uni-trier.de},
Date-Added = {2013-05-23 17:22:22 +0000},
Date-Modified = {2013-05-25 17:04:21 +0000},
Journal = {IEEE Transactions on Information Theory},
Number = {5},
Pages = {530--536},
Title = {Compression of Individual Sequences via Variable-Rate Coding},
Volume = {24},
Year = {1978}}
@inproceedings{berger2,
  Title     = {Compressive Genomics for Protein Databases},
  Author    = {Daniels, N and Gallant, A and Peng, J and Cowen, L and Baym, M and Berger, B},
  Booktitle = {Proceedings of the International Symposium on Intelligent Systems for Molecular Biology},
  Year      = {2013}}
@article{Smith:1981uq,
Author = {Smith, TF and Waterman, MS},
Date-Added = {2013-05-23 16:31:35 +0000},
Date-Modified = {2013-05-25 13:53:42 +0000},
Journal = {J Mol Biol},
Journal-Full = {Journal of molecular biology},
Mesh = {Base Sequence; Models, Chemical},
Month = mar,
Number = {1},
Pages = {195--197},
Pmid = {7265238},
Pst = {ppublish},
Title = {Identification of common molecular subsequences},
Volume = {147},
Year = {1981}}
@article{Altschul:1990fk,
Abstract = {A new approach to rapid sequence comparison, basic local alignment search tool (BLAST), directly approximates alignments that optimize a measure of local similarity, the maximal segment pair (MSP) score. Recent mathematical results on the stochastic properties of MSP scores allow an analysis of the performance of this method as well as the statistical significance of alignments it generates. The basic algorithm is simple and robust; it can be implemented in a number of ways and applied in a variety of contexts including straightforward DNA and protein sequence database searches, motif searches, gene identification searches, and in the analysis of multiple regions of similarity in long DNA sequences. In addition to its flexibility and tractability to mathematical analysis, BLAST is an order of magnitude faster than existing sequence comparison tools of comparable sensitivity.},
Author = {Altschul, SF and Gish, W and Miller, W and Myers, EW and Lipman, DJ},
Date-Added = {2013-05-23 16:27:13 +0000},
Date-Modified = {2013-05-25 13:55:54 +0000},
Doi = {10.1016/S0022-2836(05)80360-2},
Journal = {J Mol Biol},
Journal-Full = {Journal of molecular biology},
Mesh = {Algorithms; Amino Acid Sequence; Base Sequence; Databases, Factual; Mutation; Sensitivity and Specificity; Sequence Homology, Nucleic Acid; Software},
Month = oct,
Number = {3},
Pages = {403--410},
Pmid = {2231712},
Pst = {ppublish},
Title = {Basic local alignment search tool},
Volume = {215},
Year = {1990},
Bdsk-Url-1 = {http://dx.doi.org/10.1016/S0022-2836(05)80360-2}}
@article{Gnerre:2011kx,
Abstract = {Massively parallel DNA sequencing technologies are revolutionizing genomics by making it possible to generate billions of relatively short (similar to 100-base) sequence reads at very low cost. Whereas such data can be readily used for a wide range of biomedical applications, it has proven difficult to use them to generate high-quality de novo genome assemblies of large, repeat-rich vertebrate genomes. To date, the genome assemblies generated from such data have fallen far short of those obtained with the older (but much more expensive) capillary-based sequencing approach. Here, we report the development of an algorithm for genome assembly, ALLPATHS-LG, and its application to massively parallel DNA sequence data from the human and mouse genomes, generated on the Illumina platform. The resulting draft genome assemblies have good accuracy, short-range contiguity, long-range connectivity, and coverage of the genome. In particular, the base accuracy is high (>= 99.95\%) and the scaffold sizes (N50 size = 11.5 Mb for human and 7.2 Mb for mouse) approach those obtained with capillary-based sequencing. The combination of improved sequencing technology and improved computational methods should now make it possible to increase dramatically the de novo sequencing of large genomes. The ALLPATHS-LG program is available at http://www.broadinstitute.org/science/programs/genome-biology/crd.},
Author = {Gnerre, S and MacCallum, I and Przybylski, D and Ribeiro, FJ and Burton, JN and Walker, BJ and Sharpe, T and Hall, G and Shea, TP and Sykes, S and Berlin, AM and Aird, D and Costello, M and Daza, R and Williams, L and Nicol, R and Gnirke, A and Nusbaum, C and Lander, ES and Jaffe, DB},
Date-Added = {2013-05-19 15:00:51 +0000},
Date-Modified = {2013-05-19 15:04:19 +0000},
Doi = {10.1073/pnas.1017351108},
Isi = {000286594800058},
Isi-Recid = {194444511},
Isi-Ref-Recids = {185031160 192666104 136600289 194056605 173883755 162176337 181836664 54070268 104563109 184456089 178848194 157247471 186083492 186083480 147665707 185031159 176303533 188715254 121272145 186721557 171604615 180943931 162219845 127311022 162176339},
Iso-Source-Abbreviation = {P Natl Acad Sci Usa},
Journal = {Proc Natl Acad Sci USA},
Pages = {1513--1518},
Times-Cited = {91},
Title = {High-quality draft assemblies of mammalian genomes from massively parallel sequence data},
Volume = {108},
Year = {2011},
Bdsk-Url-1 = {http://ws.isiknowledge.com/cps/openurl/service?url_ver=Z39.88-2004&rft_id=info:ut/000286594800058},
Bdsk-Url-2 = {http://dx.doi.org/10.1073/pnas.1017351108}}
@article{Bryant:2009uq,
  Author        = {Bryant, Douglas W and Wong, Weng-Keen and Mockler, Todd C},
  Title         = {{QSRA}: a quality-value guided de novo short read assembler},
  Journal       = {BMC Bioinformatics},
  Journal-Full  = {BMC bioinformatics},
  Volume        = {10},
  Pages         = {69},
  Year          = {2009},
  Doi           = {10.1186/1471-2105-10-69},
  Pmc           = {PMC2653489},
  Pmid          = {19239711},
  Pst           = {epublish},
  Mesh          = {Algorithms; Computational Biology; Programming Languages; Sequence Analysis, DNA},
  Abstract      = {BACKGROUND: New rapid high-throughput sequencing technologies have sparked the creation of a new class of assembler. Since all high-throughput sequencing platforms incorporate errors in their output, short-read assemblers must be designed to account for this error while utilizing all available data.
RESULTS: We have designed and implemented an assembler, Quality-value guided Short Read Assembler, created to take advantage of quality-value scores as a further method of dealing with error. Compared to previous published algorithms, our assembler shows significant improvements not only in speed but also in output quality.
CONCLUSION: QSRA generally produced the highest genomic coverage, while being faster than VCAKE. QSRA is extremely competitive in its longest contig and N50/N80 contig lengths, producing results of similar quality to those of EDENA and VELVET. QSRA provides a step closer to the goal of de novo assembly of complex genomes, improving upon the original VCAKE algorithm by not only drastically reducing runtimes but also increasing the viability of the assembly algorithm through further error handling capabilities.},
  Date-Added    = {2013-05-19 14:24:05 +0000},
  Date-Modified = {2013-05-19 14:26:27 +0000},
  Bdsk-Url-1    = {http://dx.doi.org/10.1186/1471-2105-10-69}}
@article{Bonfield:2013fk,
Abstract = {Storage and transmission of the data produced by modern DNA sequencing instruments has become a major concern, which prompted the Pistoia Alliance to pose the SequenceSqueeze contest for compression of FASTQ files. We present several compression entries from the competition, Fastqz and Samcomp/Fqzcomp, including the winning entry. These are compared against existing algorithms for both reference based compression (CRAM, Goby) and non-reference based compression (DSRC, BAM) and other recently published competition entries (Quip, SCALCE). The tools are shown to be the new Pareto frontier for FASTQ compression, offering state of the art ratios at affordable CPU costs. All programs are freely available on SourceForge. Fastqz: https://sourceforge.net/projects/fastqz/, fqzcomp: https://sourceforge.net/projects/fqzcomp/, and samcomp: https://sourceforge.net/projects/samcomp/.},
Author = {Bonfield, JK and Mahoney, MV},
Date-Added = {2013-05-18 15:32:37 +0000},
Date-Modified = {2013-05-19 14:27:09 +0000},
Doi = {10.1371/journal.pone.0059190},
Journal = {PLoS One},
Journal-Full = {PloS one},
Number = {3},
Pages = {e59190},
Pmc = {PMC3606433},
Pmid = {23533605},
Pst = {ppublish},
Title = {Compression of {FASTQ} and {SAM} format sequencing data},
Volume = {8},
Year = {2013},