diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ddff083b..fd106171 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -79,7 +79,8 @@ repos: docs/slides/datageneration/index\.qmd| docs/slides/foundations/index\.qmd| docs/exercises/data_filtering/index\.qmd| - docs/slides/population_structure/index\.qmd + docs/slides/population_structure/index\.qmd| + docs/slides/simulation/index\.qmd )$ minimum_pre_commit_version: "2.13.0" - id: parsable-R diff --git a/docs/_quarto.yml b/docs/_quarto.yml index 56030df0..02a8f7e5 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -10,6 +10,7 @@ project: render: - "*.qmd" - "!lectures/*.qmd" + - "*.ipynb" website: title: "Population genomics in practice" diff --git a/docs/assets/bibliography.bib b/docs/assets/bibliography.bib index b197faf6..a0811f67 100644 --- a/docs/assets/bibliography.bib +++ b/docs/assets/bibliography.bib @@ -537,6 +537,24 @@ @article{hou_BalancingEfficientAnalysis_2021 langid = {english}, keywords = {Data processing,Genome informatics,Software} } +@incollection{hubisz_InferenceAncestralRecombination_2020, + title = {Inference of {{Ancestral Recombination Graphs Using ARGweaver}}}, + booktitle = {Statistical {{Population Genomics}}}, + author = {Hubisz, Melissa and Siepel, Adam}, + editor = {Dutheil, Julien Y.}, + year = {2020}, + series = {Methods in {{Molecular Biology}}}, + pages = {231--266}, + publisher = {{Springer US}}, + address = {{New York, NY}}, + doi = {10.1007/978-1-0716-0199-0_10}, + url = {https://doi.org/10.1007/978-1-0716-0199-0_10}, + urldate = {2021-06-03}, + abstract = {This chapter describes the usage of the program ARGweaver, which estimates the ancestral recombination graph for as many as about 100 genome sequences. The ancestral recombination graph is a detailed description of the coalescence and recombination events that define the relationships among the sampled sequences. This rich description is useful for a wide variety of population genetic analyses. We describe the preparation of data and major considerations for running ARGweaver, as well as the interpretation of results. We then demonstrate an analysis using the DARC (Duffy) gene as an example, and show how ARGweaver can be used to detect signatures of natural selection and Neandertal introgression, as well as to estimate the dates of mutation events. This chapter provides sufficient detail to get a new user up and running with this complex but powerful analysis tool.}, + isbn = {978-1-07-160199-0}, + langid = {english}, + keywords = {Ancestral recombination graph,Local ancestry,Markov chain Monte Carlo,Sequentially Markov coalescent}, +} @article{hurst_GeneticsUnderstandingSelection_2009, title = {Genetics and the Understanding of Selection}, author = {Hurst, Laurence D.}, @@ -555,6 +573,7 @@ @article{hurst_GeneticsUnderstandingSelection_2009 langid = {english}, keywords = {Journal club EBC}, } + @article{hurst_GeneticsUnderstandingSelection_2009, title = {Genetics and the Understanding of Selection}, author = {Hurst, Laurence D.}, @@ -590,6 +609,42 @@ @article{johri_RecommendationsImprovingStatistical_2022 langid = {english}, keywords = {Deletion mutation,Evolutionary processes,Genome analysis,Genomics,Population genetics,Population size,Statistical data,Statistical distributions}, } +@article{kelleher_EfficientPedigreeRecording_2018, + title = {Efficient Pedigree Recording for Fast Population Genetics Simulation}, + author = {Kelleher, Jerome and Thornton, Kevin R. and Ashander, Jaime and Ralph, Peter L.}, + year = {2018}, + month = nov, + journal = {PLOS Computational Biology}, + volume = {14}, + number = {11}, + pages = {e1006581}, + issn = {1553-7358}, + doi = {10.1371/journal.pcbi.1006581}, + url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1006581}, + urldate = {2019-11-03}, + abstract = {In this paper we describe how to efficiently record the entire genetic history of a population in forwards-time, individual-based population genetics simulations with arbitrary breeding models, population structure and demography. This approach dramatically reduces the computational burden of tracking individual genomes by allowing us to simulate only those loci that may affect reproduction (those having non-neutral variants). The genetic history of the population is recorded as a succinct tree sequence as introduced in the software package msprime, on which neutral mutations can be quickly placed afterwards. Recording the results of each breeding event requires storage that grows linearly with time, but there is a great deal of redundancy in this information. We solve this storage problem by providing an algorithm to quickly `simplify' a tree sequence by removing this irrelevant history for a given set of genomes. By periodically simplifying the history with respect to the extant population, we show that the total storage space required is modest and overall large efficiency gains can be made over classical forward-time simulations. We implement a general-purpose framework for recording and simplifying genealogical data, which can be used to make simulations of any population model more efficient. We modify two popular forwards-time simulation frameworks to use this new approach and observe efficiency gains in large, whole-genome simulations of one to two orders of magnitude. In addition to speed, our method for recording pedigrees has several advantages: (1) All marginal genealogies of the simulated individuals are recorded, rather than just genotypes. (2) A population of N individuals with M polymorphic sites can be stored in O(N log N + M) space, making it feasible to store a simulation's entire final generation as well as its history. (3) A simulation can easily be initialized with a more efficient coalescent simulation of deep history. The software for recording and processing tree sequences is named tskit.}, + langid = {english}, + keywords = {Algorithms,Genome analysis,Natural selection,Phylogenetic analysis,Population genetics,Population size,Simulation and modeling,Trees}, +} + +@article{kelleher_InferringWholegenomeHistories_2019, + title = {Inferring Whole-Genome Histories in Large Population Datasets}, + author = {Kelleher, Jerome and Wong, Yan and Wohns, Anthony W. and Fadil, Chaimaa and Albers, Patrick K. and McVean, Gil}, + year = {2019}, + month = sep, + journal = {Nature Genetics}, + volume = {51}, + number = {9}, + pages = {1330--1338}, + issn = {1546-1718}, + doi = {10.1038/s41588-019-0483-y}, + url = {https://www.nature.com/articles/s41588-019-0483-y}, + urldate = {2019-12-19}, + abstract = {A new method for inferring genealogical histories from large-scale genetic data is used to characterize population structure using data from the 1000 Genomes Project, the UK Biobank and the Simons Genome Diversity Project.}, + copyright = {2019 The Author(s), under exclusive licence to Springer Nature America, Inc.}, + langid = {english}, +} + @book{kimura_1983, title = {The Neutral Theory of Molecular Evolution}, @@ -599,6 +654,26 @@ @book{kimura_1983 address = {{Cambridge}}, doi = {10.1017/CBO9780511623486} } + +@article{kimura_ProteinPolymorphismPhase_1971, + title = {Protein {{Polymorphism}} as a {{Phase}} of {{Molecular Evolution}}}, + author = {Kimura, Motoo and Ohta, Tomoko}, + year = {1971}, + month = feb, + journal = {Nature}, + volume = {229}, + number = {5285}, + pages = {467--469}, + issn = {1476-4687}, + doi = {10.1038/229467a0}, + url = {https://www.nature.com/articles/229467a0}, + urldate = {2018-10-24}, + abstract = {It is proposed that random genetic drift of neutral mutations in finite populations can account for observed protein polymorphisms.}, + copyright = {1971 Nature Publishing Group}, + langid = {english}, + keywords = {Journal club EBC,Population Genetics Gillespie book}, +} + @article{korunes_PixyUnbiasedEstimation_2021, title = {Pixy: {{Unbiased}} Estimation of Nucleotide Diversity and Divergence in the Presence of Missing Data}, shorttitle = {Pixy}, @@ -616,8 +691,6 @@ @article{korunes_PixyUnbiasedEstimation_2021 langid = {english}, keywords = {bioinfomatics/phyloinfomatics,genomics/proteomics,molecular evolution,population genetics – empirical,software}, } - - @article{kreitman_NucleotidePolymorphismAlcohol_1983, title = {Nucleotide Polymorphism at the Alcohol Dehydrogenase Locus of {{Drosophila}} Melanogaster}, author = {Kreitman, Martin}, @@ -635,7 +708,6 @@ @article{kreitman_NucleotidePolymorphismAlcohol_1983 langid = {english}, keywords = {Population Genetics Gillespie book}, } - @article{kumar_PredictingEffectsCoding_2009, title = {Predicting the Effects of Coding Non-Synonymous Variants on Protein Function Using the {{SIFT}} Algorithm}, author = {Kumar, Prateek and Henikoff, Steven and Ng, Pauline C.}, @@ -706,7 +778,6 @@ @article{li_BetterUnderstandingArtifacts_2014 urldate = {2022-12-15}, abstract = {Motivation: Whole-genome high-coverage sequencing has been widely used for personal and cancer genomics as well as in various research areas. However, in the lack of an unbiased whole-genome truth set, the global error rate of variant calls and the leading causal artifacts still remain unclear even given the great efforts in the evaluation of variant calling methods. Results: We made 10 single nucleotide polymorphism and INDEL call sets with two read mappers and five variant callers, both on a haploid human genome and a diploid genome at a similar coverage. By investigating false heterozygous calls in the haploid genome, we identified the erroneous realignment in low-complexity regions and the incomplete reference genome with respect to the sample as the two major sources of errors, which press for continued improvements in these two areas. We estimated that the error rate of raw genotype calls is as high as 1 in 10\textendash 15 kb, but the error rate of post-filtered calls is reduced to 1 in 100\textendash 200 kb without significant compromise on the sensitivity. Availability and implementation: BWA-MEM alignment and raw variant calls are available at http://bit.ly/1g8XqRt scripts and miscellaneous data at https://github.com/lh3/varcmp . Contact:hengli@broadinstitute.orgSupplementary information:Supplementary data are available at Bioinformatics online.}, } - @article{li_InferenceHumanPopulation_2011, title = {Inference of Human Population History from Individual Whole-Genome Sequences}, author = {Li, Heng and Durbin, Richard}, @@ -890,6 +961,22 @@ @article{nielsen_MolecularSignaturesNatural_2005 pmid = {16285858}, keywords = {nmo journal club} } +@incollection{ohta_MolecularEvolutionNearly_2013, + title = {Molecular {{Evolution}}: {{Nearly Neutral Theory}}}, + shorttitle = {Molecular {{Evolution}}}, + booktitle = {{{eLS}}}, + author = {Ohta, Tomoko}, + year = {2013}, + publisher = {{American Cancer Society}}, + doi = {10.1002/9780470015902.a0001801.pub4}, + url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/9780470015902.a0001801.pub4}, + urldate = {2019-02-20}, + abstract = {Nearly neutral theory is an extension of the neutral theory and contends that the borderline mutations, whose effects lie between the selected and the neutral classes, are important at the molecular level. Under the strict neutral theory, the evolutionary rate is equal to the neutral mutation rate. Under the near-neutrality, the situation is not so simple and the most significant difference between the neutral and the nearly neutral theories is that the latter predicts a negative correlation between evolutionary rate and species population size. The nearly neutral theory also predicts abundant rare alleles in the population as compared with strict neutrality. Genome-wide data on protein evolution are mostly in accord with the nearly neutral theory. Genetic regulatory systems are highly complex. The near-neutrality concept may be extended to the evolution of such systems, where epigenetics and robustness are important for gene expression and many mutations are weakly selected. Key Concepts: The emphasis of significance of weak selection in evolution distinguishes the nearly neutral theory from the neutral theory. The nearly neutral theory contends that the interplay of drift and weak selection is important and predicts that evolution is more rapid in small populations than in large populations. Many observed patterns of protein evolution by measuring synonymous and nonsynonymous divergences are in accord with the nearly neutral theory. Observed molecular polymorphisms within a population often show abundance of rare alleles, in accord with the prediction of the nearly neutral theory. Numerous complex systems work together in living cells such as those in chromatin modelling/remodelling and in various signalling pathways. Interplay of drift and weak selection is important for evolution of such complex systems.}, + copyright = {eLS \textcopyright{} 2013, John Wiley \& Sons, Ltd. www.els.net}, + isbn = {978-0-470-01590-2}, + langid = {english}, + keywords = {drift,effectiveness of selection,interplay of drift and selection for evolution of complex systems,nmo journal club,slightly deleterious mutations}, +} @article{patterson_PopulationStructureEigenanalysis_2006, title = {Population {{Structure}} and {{Eigenanalysis}}}, author = {Patterson, Nick and Price, Alkes L. and Reich, David}, @@ -922,6 +1009,7 @@ @article{pedersen_MosdepthQuickCoverage_2018 urldate = {2022-08-26}, abstract = {Mosdepth is a new command-line tool for rapidly calculating genome-wide sequencing coverage. It measures depth from BAM or CRAM files at either each nucleotide position in a genome or for sets of genomic regions. Genomic regions may be specified as either a BED file to evaluate coverage across capture regions, or as a fixed-size window as required for copy-number calling. Mosdepth uses a simple algorithm that is computationally efficient and enables it to quickly produce coverage summaries. We demonstrate that mosdepth is faster than existing tools and provides flexibility in the types of coverage profiles produced.mosdepth is available from https://github.com/brentp/mosdepth under the MIT license.Supplementary data are available at Bioinformatics online.} } + @article{peter_AdmixturePopulationStructure_2016, title = {Admixture, {{Population Structure}}, and {{F-Statistics}}}, author = {Peter, Benjamin M.}, @@ -993,7 +1081,6 @@ @article{schlotterer_SequencingPoolsIndividuals_2014 keywords = {C_Cornwallis_1502,Data Mining,Genome; Human,Genotype,High-Throughput Nucleotide Sequencing,Humans,Metagenomics,Phenotype,Polymorphism; Genetic,Sequence Analysis; DNA,Software}, annotation = {00126} } - @article{shen_SeqKitCrossPlatformUltrafast_2016, title = {{{SeqKit}}: {{A Cross-Platform}} and {{Ultrafast Toolkit}} for {{FASTA}}/{{Q File Manipulation}}}, shorttitle = {{{SeqKit}}}, @@ -1049,6 +1136,7 @@ @misc{webstermatthew_GeneticVariationMountain_2022 urldate = {2023-05-24}, abstract = {The abundance of many species of bumblebees is declining and climate change is a major contributing factor. The distributions of bumblebees are shifting northwards to track suitable climates across continents. Some of the species that are most vulnerable inhabit the Arctic and have no possibility to move further north. An important indicator of the viability of animal populations is genetic diversity, which can be tracked over time by genetic monitoring to identify populations that are declining or at risk. Species of arctic bumblebees found in the Swedish mountains are considered high priority for inclusion in genetic monitoring due to the threats they face from climate change. In this report we use whole-genome sequencing to assess genetic variation in seven species of bumblebee that inhabit the Swedish mountains. Six of the species are restricted to arctic and mountain environments, whereas one is widespread throughout Europe. In total, we sequenced 333 samples of these species from across their ranges in Sweden. Estimates of effective population size (NE) predicted from levels of genetic variation vary between \textasciitilde 55,000 for the species with the most restricted high alpine distributions, to 200,000 for the more common mountain species, and 220,000 for the widespread species. Population fragmentation is generally very low or undetectable in Swedish mountains, suggesting an absence of barriers to gene flow between sub-populations. None of the species appear to be at immediate risk of negative genetic effects due to high levels of genetic drift. Reconstruction of historical fluctuations in NE indicate that the arctic specialist species Bombus hyperboreus has already experienced substantial population declines since the last ice age and we detected one highly inbred sample of this species, suggesting it may be particularly vulnerable. The ranges of all of the mountain bumblebee species are predicted to shrink drastically due to the effects of climate change. Future genetic monitoring would be a useful way to detect whether populations have been adversely affected by declines or fragmentation that could threaten their viability. The data presented here serves as an important first step towards future genetic monitoring programs for these species.}, } + @misc{wetterstrandka_DNASequencingCosts_, title = {{{DNA Sequencing Costs}}: {{Data}} from the {{NHGRI Genome Sequencing Program}} ({{GSP}})}, shorttitle = {{{DNA Sequencing Costs}}}, @@ -1091,38 +1179,3 @@ @article{wisely_GeneticDiversityFitness_2002 urldate = {2023-08-15}, abstract = {The black-footed ferret (Mustela nigripes) is an endangered North American carnivore that underwent a well-documented population bottleneck in the mid-1980s. To better understand the effects of a bottleneck on a free-ranging carnivore population, we used 24 microsatellite loci to compare genetic diversity before versus during the bottleneck, and compare the last wild population to two historical populations. We also compared genetic diversity in black-footed ferrets to that of two sibling species, the steppe polecat (Mustela eversmanni) and the European polecat (Mustela putorius). Black-footed ferrets during the bottleneck had less genetic diversity than steppe polecats. The three black-footed ferret populations were well differentiated (FST = 0.57 {$\pm$} 0.15; mean {$\pm$} SE). We attributed the decrease in genetic diversity in black-footed ferrets to localized extinction of these genetically distinct subpopulations and to the bottleneck in the surviving subpopulation. Although genetic diversity decreased, female fecundity and juvenile survival were not affected by the population bottleneck.}, } -@article{kimura_ProteinPolymorphismPhase_1971, - title = {Protein {{Polymorphism}} as a {{Phase}} of {{Molecular Evolution}}}, - author = {Kimura, Motoo and Ohta, Tomoko}, - year = {1971}, - month = feb, - journal = {Nature}, - volume = {229}, - number = {5285}, - pages = {467--469}, - issn = {1476-4687}, - doi = {10.1038/229467a0}, - url = {https://www.nature.com/articles/229467a0}, - urldate = {2018-10-24}, - abstract = {It is proposed that random genetic drift of neutral mutations in finite populations can account for observed protein polymorphisms.}, - copyright = {1971 Nature Publishing Group}, - langid = {english}, - keywords = {Journal club EBC,Population Genetics Gillespie book}, -} - -@incollection{ohta_MolecularEvolutionNearly_2013, - title = {Molecular {{Evolution}}: {{Nearly Neutral Theory}}}, - shorttitle = {Molecular {{Evolution}}}, - booktitle = {{{eLS}}}, - author = {Ohta, Tomoko}, - year = {2013}, - publisher = {{American Cancer Society}}, - doi = {10.1002/9780470015902.a0001801.pub4}, - url = {https://onlinelibrary.wiley.com/doi/abs/10.1002/9780470015902.a0001801.pub4}, - urldate = {2019-02-20}, - abstract = {Nearly neutral theory is an extension of the neutral theory and contends that the borderline mutations, whose effects lie between the selected and the neutral classes, are important at the molecular level. Under the strict neutral theory, the evolutionary rate is equal to the neutral mutation rate. Under the near-neutrality, the situation is not so simple and the most significant difference between the neutral and the nearly neutral theories is that the latter predicts a negative correlation between evolutionary rate and species population size. The nearly neutral theory also predicts abundant rare alleles in the population as compared with strict neutrality. Genome-wide data on protein evolution are mostly in accord with the nearly neutral theory. Genetic regulatory systems are highly complex. The near-neutrality concept may be extended to the evolution of such systems, where epigenetics and robustness are important for gene expression and many mutations are weakly selected. Key Concepts: The emphasis of significance of weak selection in evolution distinguishes the nearly neutral theory from the neutral theory. The nearly neutral theory contends that the interplay of drift and weak selection is important and predicts that evolution is more rapid in small populations than in large populations. Many observed patterns of protein evolution by measuring synonymous and nonsynonymous divergences are in accord with the nearly neutral theory. Observed molecular polymorphisms within a population often show abundance of rare alleles, in accord with the prediction of the nearly neutral theory. Numerous complex systems work together in living cells such as those in chromatin modelling/remodelling and in various signalling pathways. Interplay of drift and weak selection is important for evolution of such complex systems.}, - copyright = {eLS \textcopyright{} 2013, John Wiley \& Sons, Ltd. www.els.net}, - isbn = {978-0-470-01590-2}, - langid = {english}, - keywords = {drift,effectiveness of selection,interplay of drift and selection for evolution of complex systems,nmo journal club,slightly deleterious mutations}, -} diff --git a/docs/exercises/index.qmd b/docs/exercises/index.qmd index 3e49ccd0..fa4b8fde 100644 --- a/docs/exercises/index.qmd +++ b/docs/exercises/index.qmd @@ -5,7 +5,10 @@ description: "List of all exercises" listing: type: grid grid-columns: 4 - fields: [image, title, author] + fields: [image, title, subtitle, author] + sort: false + contents: + - test/test.ipynb date: "" toc: false sidebar: false diff --git a/docs/schedule.qmd b/docs/schedule.qmd index 4ed69275..72765cc4 100644 --- a/docs/schedule.qmd +++ b/docs/schedule.qmd @@ -24,56 +24,56 @@ format: ::: {.contents-table .column-body-outset} -| Topic | Time | Author | Links | -|------------------------------------------|-------------|------------|-----------------------------------------| -| [**6-Nov-2023 (Mon)**]{.highlight} | | | | -| Coffee and welcome | 08:30-09:00 | | | -| Introduction to PGIP | 09:00-09:30 | JH, NO, PU | | -| Population genomics in practice | 09:30-10:00 | PU | [Slides](slides/pgip/index.html) | -| Coffee break | 10:00-10:30 | | | -| Population genetics foundations | 10:30-12:00 | PU | [Slides](slides/foundations/index.html) | -| Lunch | 12:00-13:00 | | | -| Introduction to simulation | 13:00-15:00 | PU | Slides, Exercise | -| Coffee break | 15:00-15:30 | | | -| Introduction to simulation | 15:30-17:00 | PU | Slides, Exercise | -| [**7-Nov-2023 (Tue)**]{.highlight} | | | | -| Recap day 1 | 09:00-09:30 | PU | | -| Variant calling and filtering | 09:30-10:00 | PU | Slides, Exercise | -| Coffee break | 10:00-10:30 | | | -| Variant calling and filtering, continued | 10:30-12:00 | PU | Slides, Exercise | -| Lunch | 12:00-13:00 | | | -| Genetic diversity | 13:00-15:00 | PU | Slides, Exercise | -| Coffee break | 15:00-15:30 | | | -| TBA | 15:30-17:00 | | | -| [**8-Nov-2023 (Wed)**]{.highlight} | | | | -| Recap day 2 | 09:00-09:30 | PU | | -| PCA | 09:30-10:00 | | | -| Coffee break | 10:00-10:30 | | | -| Fst | 10:30-12:00 | | | -| Lunch | 12:00-13:00 | | | -| Admixture | 13:00-15:00 | | | -| Coffee break | 15:00-15:30 | | | -| f-statistics | 15:30-17:00 | | | -| | | | | -| [**9-Nov-2023 (Thu)**]{.highlight} | | | | -| Recap day 3 | 09:00-09:30 | | | -| Demography | 09:30-10:00 | | | -| Coffee break | 10:00-10:30 | | | -| Demography and psmc | 10:30-12:00 | | | -| Lunch | 12:00-13:00 | | | -| Selection | 13:00-15:00 | | | -| Coffee break | 15:00-15:30 | | | -| Selection | 15:30-17:00 | | | -| | | | | -| [**10-Nov-2023 (Fri)**]{.highlight} | | | | -| Recap | 09:00-10:00 | JH, NO, PU | | -| Coffee break | 10:00-10:30 | | | -| Guest lecture | 10:30-11:30 | | | -| Recap continued | 11:30-12:00 | JH, NO, PU | | -| Lunch | 12:00-13:00 | | | -| Guest lecture | 13:00-14:00 | | | -| Recap continued | 14:00-15:00 | JH, NO, PU | | -| Project discussion | 15:00-17:00 | | | +| Topic | Time | Author | Links | +|------------------------------------------|-------------|------------|----------------------------------------------------------------------------------------------------------| +| [**6-Nov-2023 (Mon)**]{.highlight} | | | | +| Coffee and welcome | 08:30-09:00 | | | +| Introduction to PGIP | 09:00-09:30 | JH, NO, PU | | +| Population genomics in practice | 09:30-10:00 | PU | [Slides](slides/pgip/index.html) | +| Coffee break | 10:00-10:30 | | | +| Population genetics foundations | 10:30-12:00 | PU | [Slides](slides/foundations/index.html) | +| Lunch | 12:00-13:00 | | | +| Introduction to simulation | 13:00-15:00 | PU | [Slides](slides/simulation/index.html), [Exercise](https://percyfal.github.io/pgip-jlite/lab/index.html) | +| Coffee break | 15:00-15:30 | | | +| Introduction to simulation | 15:30-17:00 | PU | [Slides](slides/simulation/index.html), [Exercise](https://percyfal.github.io/pgip-jlite/lab/index.html) | +| [**7-Nov-2023 (Tue)**]{.highlight} | | | | +| Recap day 1 | 09:00-09:30 | PU | | +| Variant calling and filtering | 09:30-10:00 | PU | Slides, Exercise | +| Coffee break | 10:00-10:30 | | | +| Variant calling and filtering, continued | 10:30-12:00 | PU | Slides, Exercise | +| Lunch | 12:00-13:00 | | | +| Genetic diversity | 13:00-15:00 | PU | Slides, Exercise | +| Coffee break | 15:00-15:30 | | | +| TBA | 15:30-17:00 | | | +| [**8-Nov-2023 (Wed)**]{.highlight} | | | | +| Recap day 2 | 09:00-09:30 | PU | | +| PCA | 09:30-10:00 | | | +| Coffee break | 10:00-10:30 | | | +| Fst | 10:30-12:00 | | | +| Lunch | 12:00-13:00 | | | +| Admixture | 13:00-15:00 | | | +| Coffee break | 15:00-15:30 | | | +| f-statistics | 15:30-17:00 | | | +| | | | | +| [**9-Nov-2023 (Thu)**]{.highlight} | | | | +| Recap day 3 | 09:00-09:30 | | | +| Demography | 09:30-10:00 | | | +| Coffee break | 10:00-10:30 | | | +| Demography and psmc | 10:30-12:00 | | | +| Lunch | 12:00-13:00 | | | +| Selection | 13:00-15:00 | | | +| Coffee break | 15:00-15:30 | | | +| Selection | 15:30-17:00 | | | +| | | | | +| [**10-Nov-2023 (Fri)**]{.highlight} | | | | +| Recap | 09:00-10:00 | JH, NO, PU | | +| Coffee break | 10:00-10:30 | | | +| Guest lecture | 10:30-11:30 | | | +| Recap continued | 11:30-12:00 | JH, NO, PU | | +| Lunch | 12:00-13:00 | | | +| Guest lecture | 13:00-14:00 | | | +| Recap continued | 14:00-15:00 | JH, NO, PU | | +| Project discussion | 15:00-17:00 | | | ::: diff --git a/docs/slides/simulation/assets/images/ferretti-table1.jpg b/docs/slides/simulation/assets/images/ferretti-table1.jpg new file mode 100644 index 00000000..9b3b29fe Binary files /dev/null and b/docs/slides/simulation/assets/images/ferretti-table1.jpg differ diff --git a/docs/slides/simulation/assets/images/hejase-2020-fig2.jpg b/docs/slides/simulation/assets/images/hejase-2020-fig2.jpg new file mode 100644 index 00000000..a72fb810 Binary files /dev/null and b/docs/slides/simulation/assets/images/hejase-2020-fig2.jpg differ diff --git a/docs/slides/simulation/assets/images/kelleher-nature-2019-fig1c.png b/docs/slides/simulation/assets/images/kelleher-nature-2019-fig1c.png new file mode 100644 index 00000000..f6e44c12 Binary files /dev/null and b/docs/slides/simulation/assets/images/kelleher-nature-2019-fig1c.png differ diff --git a/docs/slides/simulation/assets/images/kelleher-plos-2018-fig3.png b/docs/slides/simulation/assets/images/kelleher-plos-2018-fig3.png new file mode 100644 index 00000000..de79eed0 Binary files /dev/null and b/docs/slides/simulation/assets/images/kelleher-plos-2018-fig3.png differ diff --git a/docs/slides/simulation/assets/images/nielsen-fig2-2005.jpeg b/docs/slides/simulation/assets/images/nielsen-fig2-2005.jpeg new file mode 100644 index 00000000..98c054ae Binary files /dev/null and b/docs/slides/simulation/assets/images/nielsen-fig2-2005.jpeg differ diff --git a/docs/slides/simulation/index.qmd b/docs/slides/simulation/index.qmd index 1223492e..8c407391 100644 --- a/docs/slides/simulation/index.qmd +++ b/docs/slides/simulation/index.qmd @@ -26,42 +26,116 @@ library(ggraph) library(tidygraph) ``` -::: {.notes } +## The Wright-Fisher model and simulations -The goal of simulation primer must be to convey intuition about -evolutionary processes: +:::: {.columns} -- how genealogies change under different scenarios -- how they connect with summary statistics (e.g., Tajima's) -- duality tree-based stats - summary stats (Ralph/Kelleher) -- how they can be used to calculate outlier stats +::: {.column width="50%"} -A compelling argument for the necessity of simulations is given by -[@gillespie_PopulationGeneticsConcise_2004], who observes that hardly -any analytical solutions exist for multi-locus models and that -therefore simulation is the only resort to understand allele frequency -dynamics. +```{r } +#| label: fig-wf-model-reproductive-success-1 +#| echo: false +#| eval: true +#| fig-cap: Wright-Fisher model for 50 generations, 30 individuals +#| out-width: 80% +set.seed(132) +g <- wright_fisher_pop(n = 30, generations = 50) +x_range <- range(vertex_attr(g, "x")) +y_range <- range(vertex_attr(g, "y")) +x1 <- x_range[1] - 1 +x_range <- c(x1 - 4, x_range[2]) +g %>% + mutate( + allele = + ifelse(degree(., mode = "out") > 0, + "b", "a" + ) + ) %>% + ggplot_wf(., fill = allele) + + geom_segment( + aes( + x = x1, y = y_range[1], + xend = x1, yend = y_range[2] + ), + arrow = arrow( + length = unit(0.5, "cm"), + type = "closed" + ) + ) + + annotate("text", + label = "time", + x = x1 - 1, y = mean(y_range), + size = 10, angle = 90 + ) + + annotate("text", + label = "past", + x = x1 - .5, y = y_range[1], + size = 10, hjust = 1, + ) + + annotate("text", + label = "present", + x = x1 - .5, y = y_range[2], + size = 10, hjust = 1, + ) + + expand_limits(x = x_range, y = y_range) +``` + +::: + +::: {.column width="50%"} -Simulated data are often used to validate new methods. +### Recap + +Model of a population describing **genealogies** under the following assumptions + +- discrete and non-overlapping generations +- haploid individuals or two subpopulations (males and females) +- constant population size +- all individuals are equally fit +- population has no geographical or social structure +- no recombination + +::: {.fragment} + +**Forward** simulation ::: -# The coalescent +::: + +::: + +:::: + +::: {.notes } + +[p. 15 @hein2005gene] shows the fraction of genes without descendants. +Focus on the number of descendants for *one* gene *i*, which is X ~ +Bin(2N, 1/2N). Since E(X)=1, for large 2N, X is almos Po(1), such that +P(no descendants) = P(X=0) = e^{-1} + +Low reproductive success: forward simulations costly! + +::: -## Wright-Fisher model again +## The Wright-Fisher model and simulations :::: {.columns} ::: {.column width="50%"} ```{r } -#| label: fig-wf-model-reproductive-success +#| label: fig-wf-model-reproductive-success-2 #| echo: false #| eval: true #| fig-cap: Wright-Fisher model for 50 generations, 30 individuals #| out-width: 80% set.seed(132) g <- wright_fisher_pop(n = 30, generations = 50) +x_range <- range(vertex_attr(g, "x")) +y_range <- range(vertex_attr(g, "y")) +x1 <- x_range[1] - 1 +x_range <- c(x1 - 4, x_range[2]) g %>% mutate( allele = @@ -69,22 +143,47 @@ g %>% "b", "a" ) ) %>% - ggplot_wf(., fill = allele) + ggplot_wf(., fill = allele) + + geom_segment( + aes( + x = x1, y = y_range[1], + xend = x1, yend = y_range[2] + ), + arrow = arrow( + length = unit(0.5, "cm"), + type = "closed" + ) + ) + + annotate("text", + label = "time", + x = x1 - 1, y = mean(y_range), + size = 10, angle = 90 + ) + + annotate("text", + label = "past", + x = x1 - .5, y = y_range[1], + size = 10, hjust = 1, + ) + + annotate("text", + label = "present", + x = x1 - .5, y = y_range[2], + size = 10, hjust = 1, + ) + + expand_limits(x = x_range, y = y_range) ``` ::: ::: {.column width="50%"} -::: {.fragment} - ```{r } #| label: fig-wf-model-reproductive-success-graph #| echo: false #| eval: true #| cache: false -#| fig-cap: Reproductive success in percent per generation. -#| out-width: 60% +#| fig-cap: +#| Reproductive success in percent per generation. +#| out-width: 55% gdf <- g %>% mutate(allele = ifelse(degree(., mode = "out") > 0, "b", "a")) %>% as.data.frame() @@ -94,13 +193,15 @@ x <- tapply(gdf$allele, gdf$y, function(x) { xmean <- sprintf("%.1f%%", mean(x) * 100) df <- data.frame(x = seq_along(x), y = x) ggplot(subset(df, x < 50), aes(x = x, y = y * 100)) + - geom_line() + + geom_line(linewidth=1.2) + xlab("Generation") + - ylab("Reproductive success (%)") + ylab("Reproductive success (%)") + + theme(text=element_text(size=36)) ``` -Mean reproductive success = `r sprintf("%.1f%%", mean(x) * 100)`. Can show for -large populations P(no descendants)=$1 - e^{-1} \approx 0.632$ +Mean reproductive success = `r sprintf("%.1f%%", mean(x) * 100)`. Can +show for large populations P(no descendants)=$1 - e^{-1} \approx +0.632$ ::: @@ -110,27 +211,335 @@ large populations P(no descendants)=$1 - e^{-1} \approx 0.632$ ::: {.notes } -Repeat WF model. Point out that it can be used for *forward* -simulation. Show how many individuals are lost each generation (limit -e-1): lose lots of compute resources on the way to extant sample. - [p. 15 @hein2005gene] shows the fraction of genes without descendants. Focus on the number of descendants for *one* gene *i*, which is X ~ Bin(2N, 1/2N). Since E(X)=1, for large 2N, X is almos Po(1), such that P(no descendants) = P(X=0) = e^{-1} +Low reproductive success: forward simulations costly! + +::: + +## Forward and backward simulation + +```{r } +#| label: wright-fisher-model-graph +#| echo: false +#| eval: true +#| cache: false +set.seed(2023) +wf <- wright_fisher_pop(n = 10, generations = 16) +``` + +:::: {.columns} + +::: {.column width="50%"} + +```{r} +#| label: fig-wf-model-genealogy-forward +#| echo: false +#| eval: true +#| fig-cap: +#| Forward simulation. +#| out-width: 60% +obj <- wf +x_range <- range(vertex_attr(obj, "x")) +y_range <- range(vertex_attr(obj, "y")) +x1 <- x_range[1] - 1 +x_range <- c(x1 - 3, x_range[2]) +i <- V(obj)[unlist(ego(obj, order = 16, nodes = c(152, 155, 158), mode = "in"))] +E(obj)$color <- "lightgray" +E(obj)$width <- .1 +E(obj)[.to(i)]$width <- .2 +E(obj)[.to(i)]$color <- "black" +ggraph(obj, layout = cbind(V(obj)$x, V(obj)$y)) + + geom_edge_link(aes(color = color, width = width)) + + scale_edge_color_manual(values = c( + "black" = "black", + "lightgray" = "black" + )) + + geom_node_point(fill = "black", color = "black", shape = 21, size = 3) + + theme(legend.position = "none", axis.text.x = element_blank()) + + scale_y_reverse() + scale_edge_width(range = c(.8, 3)) + + geom_segment( + aes( + x = x1, y = y_range[1], + xend = x1, yend = y_range[2] + ), + arrow = arrow( + length = unit(0.5, "cm"), + type = "closed" + ) + ) + + annotate("text", + label = "time", + x = x1 - 1, y = mean(y_range), + size = 10, angle = 90 + ) + + annotate("text", + label = "past", + x = x1 - .5, y = y_range[1], + size = 10, hjust = 1, + ) + + annotate("text", + label = "present", + x = x1 - .5, y = y_range[2], + size = 10, hjust = 1, + ) + + expand_limits(x = x_range, y = y_range) +``` + +::: + +::: {.column width="50%"} + +```{r} +#| label: fig-wf-model-genealogy-backward +#| echo: false +#| eval: true +#| fig-cap: +#| Backward simulation. +#| out-width: 60% +obj <- wf +x_range <- range(vertex_attr(obj, "x")) +y_range <- range(vertex_attr(obj, "y")) +x1 <- x_range[1] - 1 +x_range <- c(x1 - 3, x_range[2]) +i <- V(obj)[unlist(ego(obj, order = 16, nodes = c(152, 155, 158), mode = "in"))] +E(obj)$color <- "lightgray" +E(obj)$width <- .1 +E(obj)[.to(i)]$width <- .2 +E(obj)[.to(i)]$color <- "black" +V(obj)[i]$allele <- "A" +ggraph(obj, layout = cbind(V(obj)$x, V(obj)$y)) + + geom_edge_link(aes(color = color, width = width)) + + scale_edge_color_manual(values = c( + "black" = "black", + "lightgray" = "lightgray" + )) + + geom_node_point(ggplot2::aes(fill = allele), color = "black", shape = 21, size = 3) + + scale_fill_manual(values=list(a="white", A="black")) + + theme(legend.position = "none", axis.text.x = element_blank()) + + scale_y_reverse() + scale_edge_width(range = c(.8, 3)) + + geom_segment( + aes( + x = x1, y = y_range[2], + xend = x1, yend = y_range[1] + ), + arrow = arrow( + length = unit(0.5, "cm"), + type = "closed" + ) + ) + + annotate("text", + label = "time", + x = x1 - 1, y = mean(y_range), + size = 10, angle = 90 + ) + + annotate("text", + label = "past", + x = x1 - .5, y = y_range[1], + size = 10, hjust = 1, + ) + + annotate("text", + label = "present", + x = x1 - .5, y = y_range[2], + size = 10, hjust = 1, + ) + + expand_limits(x = x_range, y = y_range) +``` + +::: + +:::: + +Simulated nodes are filled with black. Genealogy of interest is +highlighted in thick black lines. + +## Why do we want simulations anyway + +::: {.incremental} + +- No analytical solutions for linked selection and the like + $\rightarrow$ must use simulations +- Use to gain understanding and improve interpretation of mutational + processes +- Generate neutral null distributions to compare with observed data +- Generate training data for machine learning + +::: + +# The coalescent + +## Coalescent simulations + +:::: {.columns} + +::: {.column width="50%"} + +The coalescent simulates the genealogy of a **sample** of individuals +on which mutations are "sprinkled" according to a Poisson process. + +1. Simulate ancestry (genealogy) + +::: + +::: {.column width="50%"} + +```{r} +#| label: wf-model-genealogy-backward-1 +#| echo: false +#| eval: true +#| out-width: 75% +obj <- wf +x_range <- range(vertex_attr(obj, "x")) +y_range <- range(vertex_attr(obj, "y")) +y_range <- c(y_range[1], y_range[2] + 1) +sample_nodes <- c(152, 155, 158) +i <- V(obj)[unlist(ego(obj, order = 16, nodes = sample_nodes, mode = "in"))] +V(obj)$sample <- "" +V(obj)[sample_nodes]$sample <- c(1,2,3) +E(obj)$color <- "lightgray" +E(obj)$width <- .1 +E(obj)[.to(i)]$width <- .2 +E(obj)[.to(i)]$color <- "black" +V(obj)[i]$allele <- "A" +ggraph(obj, layout = cbind(V(obj)$x, V(obj)$y)) + + geom_edge_link(aes(color = color, width = width)) + + scale_edge_color_manual(values = c( + "black" = "black", + "lightgray" = "lightgray" + )) + + geom_node_point(ggplot2::aes(fill = allele), color = "black", shape = 21, size = 3) + + scale_fill_manual(values=list(a="white", A="black")) + + theme(legend.position = "none", axis.text.x = element_blank()) + + scale_y_reverse() + scale_edge_width(range = c(.8, 3)) + + geom_node_text(aes(label = sample), size=10, vjust=1.5) + + expand_limits(x = x_range, y = y_range) +``` + +::: + +:::: + +## Coalescent simulations + +:::: {.columns} + +::: {.column width="50%"} + +The coalescent simulates the genealogy of a **sample** of individuals +on which mutations are "sprinkled" according to a Poisson process. + +1. Simulate ancestry (genealogy) +2. Simulate mutations + +::: + +::: {.column width="50%"} + +```{r} +#| label: wf-model-genealogy-backward-mutations +#| echo: false +#| eval: true +#| out-width: 75% +obj <- wf +x_range <- range(vertex_attr(obj, "x")) +y_range <- range(vertex_attr(obj, "y")) +y_range <- c(y_range[1], y_range[2] + 1) +sample_nodes <- c(152, 155, 158) +i <- V(obj)[unlist(ego(obj, order = 16, nodes = sample_nodes, mode = "in"))] +E(obj)$color <- "lightgray" +E(obj)$width <- .1 +E(obj)$mut <- "" +E(obj)[.to(i)]$width <- .2 +E(obj)[.to(i)]$color <- "black" +V(obj)[i]$allele <- "A" +V(obj)$sample <- "" +V(obj)[sample_nodes]$sample <- c(1,2,3) +set.seed(23) +E(obj)[sample(E(obj)[.to(i)], 5)]$mut <- "*" +ggraph(obj, layout = cbind(V(obj)$x, V(obj)$y)) + + geom_edge_link(aes(color = color, width = width, label=mut), + angle_calc="along", label_size=18) + + scale_edge_color_manual(values = c( + "black" = "black", + "lightgray" = "lightgray" + )) + + geom_node_point(ggplot2::aes(fill = allele), color = "black", shape = 21, size = 3) + + scale_fill_manual(values=list(a="white", A="black")) + + theme(legend.position = "none", axis.text.x = element_blank()) + + scale_y_reverse() + scale_edge_width(range = c(.8, 3)) + + geom_node_text(aes(label = sample), size=10, vjust=1.5) + + expand_limits(x = x_range, y = y_range) +``` + +::: + +:::: + +## Coalescent simulations + +:::: {.columns} + +::: {.column width="50%"} + +The coalescent simulates the genealogy of a **sample** of individuals +on which mutations are "sprinkled" according to a Poisson process. + +1. Simulate ancestry (genealogy) +2. Simulate mutations + +Question: how many mutations are common to all samples? How many +mutations does sample 1 have? Sample 2? + +Assuming the ancestral state is denoted `0` (prior to the *first* +generation) and the derived state `1`, what are the sequences of the +samples? + ::: -## The coalescent +::: {.column width="50%"} -Introduce concept. Describe algorithm (show code - include as -exercise?). Genealogy and mutations: +```{r} +#| label: wf-model-genealogy-backward-mutations-1 +#| echo: false +#| eval: true +#| out-width: 75% +obj <- wf +x_range <- range(vertex_attr(obj, "x")) +y_range <- range(vertex_attr(obj, "y")) +y_range <- c(y_range[1], y_range[2] + 1) +sample_nodes <- c(152, 155, 158) +i <- V(obj)[unlist(ego(obj, order = 16, nodes = sample_nodes, mode = "in"))] +E(obj)$color <- "lightgray" +E(obj)$width <- .1 +E(obj)$mut <- "" +E(obj)[.to(i)]$width <- .2 +E(obj)[.to(i)]$color <- "black" +V(obj)[i]$allele <- "A" +V(obj)$sample <- "" +V(obj)[sample_nodes]$sample <- c(1,2,3) +set.seed(23) +E(obj)[sample(E(obj)[.to(i)], 5)]$mut <- "*" +ggraph(obj, layout = cbind(V(obj)$x, V(obj)$y)) + + geom_edge_link(aes(color = color, width = width, label=mut), + angle_calc="along", label_size=18) + + scale_edge_color_manual(values = c( + "black" = "black", + "lightgray" = "lightgray" + )) + + geom_node_point(ggplot2::aes(fill = allele), color = "black", shape = 21, size = 3) + + scale_fill_manual(values=list(a="white", A="black")) + + theme(legend.position = "none", axis.text.x = element_blank()) + + scale_y_reverse() + scale_edge_width(range = c(.8, 3)) + + geom_node_text(aes(label = sample), size=10, vjust=1.5) + + expand_limits(x = x_range, y = y_range) +``` -- sim_ancestry -- sim_mutations +::: -Note that #mutations propto branch length -> don't actually even need -to know the mutations (e.g. fastsimcoal) +:::: ## Simulating genealogies [@hahn_MolecularPopulationGenetics_2019, p. 115] @@ -1342,6 +1751,21 @@ node at ($(t2r) !.5! (t1r)$) (T2) {$T_2$} ::: {.column width="50%"} +Expected waiting time to coalesce when $i$ lineages: $E(T_i) = +\frac{2}{i(i-1)}$ + +Branch lengths can be derived from waiting times. For instance, +$\tau_1=\tau_2=T_5+T_4$ and $\tau_4=\tau_5=T_5$ + +Time to the **most recent common ancestor (MRCA)**: $T_{MRCA} = +\sum_{i=2}^n T_i$ + +with expected value $E(T_{MRCA}) = \sum_{i=2}^nE(T_i) = 2\left(1 - +\frac{1}{n}\right)$ + +The expected **total tree height** is $E(T_{total}) = \sum_{i=2}^n +iE(T_i) = 2\sum_{i=2}^n\frac{1}{i-1}$ + ::: :::: @@ -1384,6 +1808,28 @@ mykronoviz(trees, type = "c", show.tip.label = FALSE, edge.width = 3) ::: +## Diminishing returns of adding more samples + +```{r } +#| label: coalescent-diminishing-returns +#| echo: false +#| eval: true +#| fig-width: 16 +#| fig-height: 8 +n <- 20 +t_total <- function(n) {2*sum(1/(seq(2,n)-1))} +t_mrca <- function(n) {2*(1 - 1/n)} + +data <- data.frame(y=c(c(0, unlist(lapply(2:n, t_total))), + c(0, unlist(lapply(2:n, t_mrca)))), + x=c(1:n, 1:n), + stat=c(rep("E[T_total]", n), rep("E[T_MRCA]", n)) + ) +ggplot(data, aes(x=x, y=y, linetype=factor(stat))) + + geom_line(linewidth=1.2) + xlab("n") + ylab("") + + guides(linetype=guide_legend(title="Statistic")) +``` + ## Adding mutations ```{tikz, fig.ext="svg" } @@ -1409,86 +1855,375 @@ node at ($(1) !.5! (4)$) (mid) {} node[above of=mid, node distance=300px] (root) {} node at ($(4) !.6! (root) $) (6) {} node at ($(2) !.5! (6) $) (5) {} - +node[above of=root, node distance=10px, outer sep=10px] (ancestral) {00000} +node[below of=1, rotate=60, node distance=8px, anchor=south east] (l1) {11000} +node[below of=2, rotate=60, node distance=8px, anchor=south east] (l2) {00110} +node[below of=3, rotate=60, node distance=8px, anchor=south east] (l3) {00100} +node[below of=4, rotate=60, node distance=8px, anchor=south east] (l4) {00001} +node at ($(1) !.33! (root) - (.5, 0)$) (m1l) {0} +node at ($(1) !.33! (root) + (.5, 0)$) (m1r) {1} +node at ($(1) !.67! (root) - (.5, 0)$) (m2l) {0} +node at ($(1) !.67! (root) + (.5, 0)$) (m2r) {1} +node at ($(2) !.5! (5) - (.5, 0)$) (m3l) {0} +node at ($(2) !.5! (5) + (.5, 0)$) (m3r) {1} +node at ($(5) !.5! (6) - (.5, 0)$) (m4l) {0} +node at ($(5) !.5! (6) + (.5, 0)$) (m4r) {1} +node at ($(4) !.5! (6) - (.5, 0)$) (m5l) {0} +node at ($(4) !.5! (6) + (.5, 0)$) (m5r) {1} ; \draw (1) -- (root) (4) -- (root) (2) -- (6) (3) -- (5); +\draw[>=latex, ->] (m1l) -- (m1r); +\draw[>=latex, ->] (m2l) -- (m2r); +\draw[>=latex, ->] (m3l) -- (m3r); +\draw[>=latex, ->] (m4l) -- (m4r); +\draw[>=latex, ->] (m5l) -- (m5r); + +\end{tikzpicture} +``` +## The coalescent and diversity {.smaller} + +:::: {.columns} + +::: {.column width="50%"} + +```{r, engine="tikz", fig.ext="svg" } +#| label: coalescent-tree-dna-variation +#| echo: false +#| eval: true +#| out-height: 600 +\begin{tikzpicture}[outer sep=0, inner sep=0, thick, + node distance=5px, every node/.style={font=\ttfamily}] +\tikzstyle{mut} = [circle, minimum height=0.2cm, fill=gray, draw]; +\draw +node (1) at (0, 0) {} +node (2) at (1, 0) {} +node (3) at (2, 0) {} +node (4) at (3, 0) {} +node (5) at ($(1) !.5! (2) + (0, 2)$) {} +node (6) at ($(1-|5) !.5! (3) + (0, 3)$) {} +node (7) at ($(1-|6) !.5! (4) + (0, 5)$) {} +node[above of=7, anchor=south] (rootl) {000000} +node[rotate=45, below left of=1, anchor=north east] (1l) {010100} +node[rotate=45, below left of=2, anchor=north east] (2l) {010100} +node[rotate=45, below left of=3, anchor=north east] (3l) {001000} +node[rotate=45, below left of=4, anchor=north east] (4l) {100011} +; + +\draw +(1) -- (5) +(2) -- (5) +(5) -- (6) +(3) -- (6) +(6) -- (7) +(4) -- (7) +node[mut] (m1) at ($(5) !.33! (6)$) {} +node[mut] (m2) at ($(5) !.67! (6)$) {} +node[mut] (m3) at ($(3) !.5! (6)$) {} +node[mut] (m4) at ($(4) !.25! (7)$) {} +node[mut] (m5) at ($(4) !.5! (7)$) {} +node[mut] (m6) at ($(4) !.75! (7)$) {} +; \end{tikzpicture} ``` -## The coalescent: diminishing returns +::: + +::: {.column width="50%"} + +Many statistical quantities can be related to the **site frequency +spectrum** (**SFS**), which is a summary of the frequencies of the +segregating sites. Let $\xi_i$ be the number of chromosomes in the +sample with $i$ minor alleles. In the figure we have 6 mutations on +$n=4$ chromosomes. + +| | Name | Count | +|---------|-----------|-------| +| $\xi_1$ | singleton | 4 | +| $\xi_2$ | doubleton | 2 | +| $\xi_3$ | | 0 | + +Note that the number of segregating sites is $S=\sum_{i=1}^{n-1}\xi_i$ + +In this notation one can show that $\pi$, the nucleotide diversity, is + +$$ +\pi = \frac{\sum_{i=1}^{n-1}i(n-i)\xi_i}{n(n-1)/2} +$$ -Show plot of diminishing returns (better to add sites, not samples) +::: {.fragment} -## The coalescent and diversity +#### Exercise -Recalculate example +Calculate the nucleotide diversity and compare the results from the foundation lecture -## Evolutionary processes and genealogies +::: -Non-neutral processes change topologies in ways that we detect when -applying tests [@ferretti_DecomposingSiteFrequency_2017] +::: {.fragment} -genealogies change due to non -Show some qualitative examples of how trees change for +$$ +\pi = \frac{1(4-1)\xi_1 + 2(4-2)\xi_2 + 3(4-3)\xi_3}{4(4-1)/2} = +\frac{1}{6}\left(12 + 8 + 0\right) = 3\frac{1}{3} +$$ + +::: + +::: + +:::: + +::: {.notes } + +Point out that we record the number of mutations per branch which together with + +::: + +## The impact of topology on the SFS + +:::: {.columns} + +::: {.column width="50%"} + + + +![Interpretation of neutrality tests [@ferretti_DecomposingSiteFrequency_2017]](assets/images/ferretti-table1.jpg){width=480} + + + +::: + +::: {.column width="50%"} -- bottleneck + -## Exercise on the coalescent +![The SFS under neutrality and selection](assets/images/nielsen-fig2-2005.jpeg){width=480} + + + +Many tests for selection are based on the SFS which in turn is +influenced by the topology of the tree. + +::: + +:::: + +::: {.notes } + +Take home: topologies influence shape of SFS + +### Example: Tajima's D + +Tests common versus rare alleles. Numerator compares nucleotide +diversity $\pi$ to Watterson's theta, $\theta_W$. + +D < 0 +: recent population increase *or* positive selection + +D > 0 +: population contraction *or* balancing selection + +::: + +# Exercise on the coalescent {.unnumbered .unlisted} # The coalescent with recombination -## Chromosome are mosaics of non-recombining units +## On non-recombining chromosomes and assortment + +:::: {.columns} + +::: {.column width="50%"} + +```{r, engine="tikz", fig.ext="svg" } +#| label: non-recombining-chromosomes-assortment-1 +#| echo: false +#| eval: true +#| out-width: 300 +\tikzstyle{female}=[circle,thick,minimum size=1cm,draw=black] +\tikzstyle{male}=[rectangle,thick,minimum width=1cm, minimum height=1cm, draw=black] + \begin{tikzpicture}[>=latex,font=\small, thick, + plab/.style={font=\scriptsize, outer sep=0cm}] + \node[male, fill=gray] (grandfather) at (0, 8) { }; + \node[female] (grandmother) at (4, 8) { }; + +\node[inner sep=0cm] (grandparents) at (2, 8) { }; + + \node[male, fill=gray] (father) at (2, 4) { }; + \node[female] (mother) at (6, 4) { }; + +\node[inner sep=0cm] (parents) at (4, 4) { }; +\node[inner sep=0cm] (siblings) at (4, 2) { }; + + \node[female, fill=gray] (sister1) at (2, 0) { }; + \node[female, fill=gray] (sister2) at (6, 0) { }; + +\draw (grandfather) -- (grandmother); +\draw (grandparents) -- (father); +\draw (parents) -- (siblings); +\draw (father) -- (mother); +\draw (siblings) -| (sister1); +\draw (siblings) -| (sister2); +\end{tikzpicture} +``` + +Both siblings inherit chromosome from paternal grandfather + +Chromosomes coalesce at father + +::: + +::: {.column width="50%"} + +::: {.fragment} + +```{r, engine='tikz', fig.ext='svg' } +#| label: non-recombining-chromosomes-assortment-2 +#| echo: false +#| eval: true +#| out-width: 300 +\usetikzlibrary{shapes} +\tikzstyle{female}=[circle,thick,minimum size=1cm,draw=black] +\tikzstyle{male}=[rectangle,thick,minimum width=1cm, minimum height=1cm, draw=black] +\tikzstyle{splitmale}=[rectangle split, rectangle split + parts=2,thick,minimum width=1cm, minimum + height=1cm, draw=black] + \begin{tikzpicture}[>=latex,font=\small, thick, + plab/.style={font=\scriptsize, outer sep=0cm}] + \node[male, fill=gray] (grandfather) at (0, 8) { }; + \node[female, fill=lightgray] (grandmother) at (4, 8) { }; + +\node[inner sep=0cm] (grandparents) at (2, 8) { }; + +\node[splitmale, rotate=90, rectangle split part fill={gray, + lightgray}] (father) at (2, 4) { }; + \node[female] (mother) at (6, 4) { }; + +\node[inner sep=0cm] (parents) at (4, 4) { }; +\node[inner sep=0cm] (siblings) at (4, 2) { }; + + \node[female, fill=gray] (sister1) at (2, 0) { }; + \node[female, fill=lightgray] (sister2) at (6, 0) { }; + +\draw (grandfather) -- (grandmother); +\draw (grandparents) -- (father); +\draw (parents) -- (siblings); +\draw (father) -- (mother); +\draw (siblings) -| (sister1); +\draw (siblings) -| (sister2); +\end{tikzpicture} +``` + +Siblings inherit different grandparental chromosomes $\Rightarrow$ +chromosomes coalesce God knows when in the past + +Genealogies **differ** + +::: + +::: + +:::: ## The ancestral recombination graph -## The Sequential Markovian Coalescent +:::: {.columns} + +::: {.column width="50%"} + +![The ancestral recombination graph, +[@hubisz_InferenceAncestralRecombination_2020, Fig. +1]](assets/images/hejase-2020-fig1.jpg){width=400} + +::: + +::: {.column width="50%"} -## Tree sequences +Properties: -## msprime +- marginal trees constitute a sequence of trees (**tree sequence**) + along a chromosome +- each tree represents the genealogy of a non-recombining part of the + chromosome +- neighbouring trees are **correlated** -## Exercises with msprime +Interpretation: chromosomes are mosaics of non-recombining units -Use CLI to generate genealogies and mutations. +::: -# Forward simulation +:::: -## Forward simulation +## msprime stores variation data as tree sequences -Begin with forward simulation as it makes more intuitive sense. +:::: {.columns} -Outline principles of forward simulation from Wright-Fisher model -"algorithm". +::: {.column width="50%"} -## SLiM +![Tree sequences [@kelleher_EfficientPedigreeRecording_2018, Fig. 3]](assets/images/kelleher-plos-2018-fig3.png){width=500} -Mention advances in forward simulations +::: -## Forward simulation in SLiM +::: {.column width="50%"} -Outline steps in SLiM - relate to WF model +![Data compression [@kelleher_InferringWholegenomeHistories_2019, Fig. 1c]](assets/images/kelleher-nature-2019-fig1c.png){width=600} -## Example in SLiM +::: -## Exercise in SLiM +:::: + +## Simulating ancestry with msprime -Optional: do a simple neutral model + -# Combining forward and backward simulations +From [msprime +quickstart](https://tskit.dev/msprime/docs/stable/quickstart.html) -## Recapitation + + +```{python } +#| label: sim_ancestry +#| echo: true +#| eval: true +#| fig-format: svg +#| output: asis +#| out-width: 300 +import msprime +# Simulate an ancestral history for 3 diploid samples under the coalescent +# with recombination on a 5kb region with human-like parameters. +ts = msprime.sim_ancestry( + samples=3, + recombination_rate=1e-8, + sequence_length=5_000, + population_size=10_000, + random_seed=123456) +ts.draw_svg() +``` -Combine the best of two worlds +## Simulating mutations with msprime -## Recipes +```{python } +#| label: sim_mutations +#| echo: true +#| eval: true +#| output: asis +#| fig-format: svg +import msprime +ts = msprime.sim_ancestry( + samples=3, + recombination_rate=1e-8, + sequence_length=5_000, + population_size=10_000, + random_seed=123456) +mutated_ts = msprime.sim_mutations(ts, rate=1e-8, random_seed=54321) +mutated_ts.draw_svg() +``` -Show some recipes and the pgip CLI +# msprime exercise {.unnumbered .unlisted} -## PGIP data set +## Bibliography {.unnumbered .unlisted} -Show the pgip data set and how it was generated +::: { #refs } +:::