-
Notifications
You must be signed in to change notification settings - Fork 1
/
01-methylkit.Rmd
1579 lines (1294 loc) · 79.4 KB
/
01-methylkit.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "01-Methylkit"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
### Load libraries
```{r message=FALSE, warning=FALSE, results=FALSE}
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("methylKit")
require(methylKit)
list.of.packages <- c("tidyverse", "reshape2", "here", "methylKit", "ggforce", "matrixStats", "Pstat", "viridis", "plotly", "readr", "GenomicRanges", "vegan", "factoextra") #add new libraries here
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# Load all libraries
lapply(list.of.packages, FUN = function(X) {
do.call("require", list(X))
})
```
### Obtain session information
```{r}
sessionInfo()
```
### Check current directory
```{r}
getwd()
```
### Download files from `gannet`
### Create list of all bismark data files, which have reads that are already trimmed and aligned. These BAM files are also sorted and indexed.
### IMPORTANT NOTE: The location of these files depends on where the user saved them. I (Laura) used an external hard drive.
Files were downloaded from gannet on March 9th, 2020 from [https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/](https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/) using `curl -O [URL]`:
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_1_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_2_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_3_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_4_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_5_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_6_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_7_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_8_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_9_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_10_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_11_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_12_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_13_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_14_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_15_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_16_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_17_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
https://gannet.fish.washington.edu/seashell/bu-mox/scrubbed/020320-oly/zr1394_18_s456_trimmed_bismark_bt2.deduplicated.sorted.bam
```{r}
file.list_18=list(
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_1_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_2_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_3_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_4_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_5_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_6_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_7_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_8_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_9_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_10_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_11_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_12_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_13_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_14_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_15_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_16_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_17_s456_trimmed_bismark_bt2.deduplicated.sorted.bam",
"/Volumes/Bumblebee/paper-oly-mbdbs-gen/data/MBD-BAM-files/zr1394_18_s456_trimmed_bismark_bt2.deduplicated.sorted.bam")
```
### The following command reads sorted BAM files and calls methylation percentage per base, and creates a methylRaw object for CpG methylation.
It also assigns minimum coverage of 2x and the treatments (in this case, the Olympia oyster population)
```{r, eval = FALSE}
myobj_18 = processBismarkAln(location = file.list_18, sample.id = list("1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","16","17","18"), assembly = "v081", read.context="CpG", mincov=1, treatment = c(0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1))
```
### Save the MethylKit object; re-doing the previous step is memory/time intensive, so best to use the saved object moving forward.
Object prepared by Steven in 2019 is also available at https://d.pr/f/vRtrqZ
Object prepared by Laura in March 2020 is available at TBD
```{r, eval = FALSE}
save(myobj_18, file = "../analyses/methylation/R-objects/myobj_18")
```
```{bash}
#zip ../analyses/methylation/R-objects/myobj_18.zip ../analyses/methylation/R-objects/myobj_18
```
Load object if needed
```{r}
load("../analyses/methylation/R-objects/myobj_18")
```
### Check the basic stats about the methylation data - coverage and percent methylation. Index the object to look through each sample
```{r}
getMethylationStats(myobj_18[[2]],plot=F,both.strands=TRUE)
head(myobj_18[[5]])
str(myobj_18)
```
```{r}
getMethylationStats(myobj_18[[13]],plot=T,both.strands=TRUE)
```
```{r}
getCoverageStats(myobj_18[[5]],plot=TRUE,both.strands=TRUE)
```
```{r}
getCoverageStats(myobj_18[[13]],plot=TRUE,both.strands=TRUE)
```
### Examine raw data by population - how many loci have data in some population but not the other?
```{r}
# Unite data that has has no minimum coverage for either meth calling or for uniting
# THIS IS TO GET LOCI WITH LITTLE DATA IN ONE POP, BUT LOTS OF DATA IN THE OTHER
united0L=methylKit::unite(myobj_18, destrand=TRUE, min.per.group=0L)
```
```{r}
head(united0L)
```
```{r}
# This code counts up the number of samples that have data and plots
getData(united0L) %>%
mutate(HC=apply(dplyr::select(., c(matches("coverage1$|coverage2$|coverage3$|coverage4$|coverage5$|coverage6$|coverage7$|coverage8$|coverage9$"))),
MARGIN = 1, function(x) sum(!is.na(x)))) %>%
mutate(SS=apply(dplyr::select(., c(matches("coverage10|coverage11|coverage12|coverage13|coverage14|coverage15|coverage16|coverage17|coverage18"))),
MARGIN = 1, function(x) sum(!is.na(x)))) %>%
dplyr::select(c("chr", "start", "end", matches("HC|SS"))) %>%
pivot_longer(names_to = "population", cols = c("HC", "SS")) %>% mutate(population=as.factor(population)) %>%
ggplot(aes(x=value, fill=population)) +
geom_histogram(alpha=0.5) +
scale_x_continuous(breaks=c(0,1,2,3,4,5,6,7,8,9), labels=c(0,1,2,3,4,5,6,7,8,9)) +
facet_wrap(~population) + xlab("No. of individuals with data") + ylab("No. of loci") +
theme_minimal() + theme(legend.position = "bottom") + ggtitle("Distribution of the loci that have data in X no. of samples\nNo min coverage for all")
```
```{r}
# Unite data that has 5x coverage minimum, and throw out super high coverage loci.
# THIS IS TO IDENTIFY LOCI THAT HAVE HIGH METHYLATION PERCENTAGE (>75%) IN ONE POPULATION
filtered.5x=filterByCoverage(myobj_18,lo.count=5,lo.perc=NULL,
hi.count=NULL,hi.perc=99.9)
united5x=methylKit::unite(filtered.5x, destrand=TRUE, min.per.group=0L)
#Save dataframe with % methyation information, and # of samples with 5x coverage.
#Also create column with locus information (contig, start, end) to merge with other dataframe
perc.meth5x <- methylKit::percMethylation(united5x, rowids=T) %>% as.data.frame() %>%
mutate(HC.mean=rowMeans(na.rm = TRUE, dplyr::select(., c(1:9)))) %>%
mutate(SS.mean=rowMeans(na.rm = TRUE, dplyr::select(., c(10:18)))) %>%
rownames_to_column(var = "locus") %>%
mutate(HC.5x.count=apply(dplyr::select(., c(matches("^1$|^2$|^3$|^4$|^5$|^6$|^7$|^8$|^9$"))),
MARGIN = 1, function(x) sum(!is.na(x)))) %>%
mutate(SS.5x.count=apply(dplyr::select(., c(matches("10|11|12|13|14|15|16|17|18"))),
MARGIN = 1, function(x) sum(!is.na(x))))
```
### Out of curiosity - how many loci would I have if I kept only those with 5x coverage across ALL samples? 10x coverage?
```{r}
united5x_all.cov=methylKit::unite(filtered.5x, destrand=TRUE, min.per.group=9L)
# 73,756 loci here (plus additional ~250 of the "no "data"), compared to 252,366 in our filtering method (where 7/9 samples have data)
filtered.10x=filterByCoverage(myobj_18,lo.count=10,lo.perc=NULL,
hi.count=NULL,hi.perc=99.9)
united10x_all.cov=methylKit::unite(filtered.10x, destrand=TRUE, min.per.group=9L)
# 3,242 loci (plus additional ~250)
```
### How many loci have data in one population and have NO data in the other population?
```{r}
#Count number of samples that have data for each population, then merge with the average % methylation dataframe
samples.withdata <- getData(united0L) %>%
mutate(HC.1x.count=apply(dplyr::select(., c(matches("coverage1$|coverage2$|coverage3$|coverage4$|coverage5$|coverage6$|coverage7$|coverage8$|coverage9$"))),
MARGIN = 1, function(x) sum(!is.na(x)))) %>%
mutate(SS.1x.count=apply(dplyr::select(., c(matches("coverage10|coverage11|coverage12|coverage13|coverage14|coverage15|coverage16|coverage17|coverage18"))),
MARGIN = 1, function(x) sum(!is.na(x)))) %>%
mutate(locus=paste(chr, start, end, sep=".")) %>%
dplyr::select(c("locus", matches("HC|SS")))
# HERE IS THE RESULTING DATAFRAME TO USE FOR ANALYSIS OF DATA/NO DATA LOCI
# Join the dataframe that has the # of samples with data at each loci (using no min. coverage), with the average % methylation for each loci and # of samples with sufficient coverage for that % (using 5x min. coverage)
(perc.meth.withdata <- samples.withdata %>% left_join(perc.meth5x%>%dplyr::select(locus,HC.mean, SS.mean,HC.5x.count,HC.5x.count,SS.5x.count), by = "locus"))
```
# Here I summarize how many loci have no data in one population, lots of data & high methylation rate in the other
```{r}
paste("Number of loci with no data in any Hood Canal sample, 5x coverage (or greater) in 9/9 samples in South Sound: ",
perc.meth.withdata %>% filter(HC.1x.count<=0 & SS.5x.count==9) %>% nrow()) #add "& SS.mean>0" for %min
paste("Number of loci with no data in any South Sound sample, in 9/9 samples in Hood Canal: ",
perc.meth.withdata %>% filter(SS.1x.count<=0 & HC.5x.count==9) %>% nrow())
paste("Number of loci with no data in any Hood Canal sample, 5x coverage (or greater) in 7/9 samples in South Sound: ",
perc.meth.withdata %>% filter(HC.1x.count<=0 & SS.5x.count>=7) %>% nrow())
paste("Number of loci with no data in any South Sound sample, 5x coverage (or greater) in 7/9 samples in Hood Canal: ",
perc.meth.withdata %>% filter(SS.1x.count<=0 & HC.5x.count>=7) %>% nrow())
paste("Number of loci with data in 1 Hood Canal sample (any coverage), 5x coverage (or greater) in 7/9 samples in South Sound: ",
perc.meth.withdata %>% filter(HC.1x.count<=1 & SS.5x.count>=7) %>% nrow())
paste("Number of loci with data in 1 South Sound sample (any coverage), 5x coverage (or greater) in 7/9 samples in Hood Canal: ",
perc.meth.withdata %>% filter(SS.1x.count<=1 & HC.5x.count>=7) %>% nrow())
paste("Number of loci with data in 2 Hood Canal sample (any coverage), 5x coverage (or greater) in 7/9 samples in South Sound: ",
perc.meth.withdata %>% filter(HC.1x.count<=2 & SS.5x.count>=7) %>% nrow())
paste("Number of loci with data in 2 South Sound sample (any coverage), 5x coverage (or greater) in 7/9 samples in Hood Canal: ",
perc.meth.withdata %>% filter(SS.1x.count<=2 & HC.5x.count>=7) %>% nrow())
```
# Decided to go with this criteria: Loci with data in 1 sample (any coverage) in population A, 5x coverage (or greater) in 7/9 samples in population B
How many loci will I add to the analysis? 251 loci (149 + 102)
```{r}
rbind(perc.meth.withdata %>% filter(HC.1x.count<=1 & SS.5x.count>=7),
perc.meth.withdata %>% filter(SS.1x.count<=1 & HC.5x.count>=7)) %>%
arrange(locus) %>% distinct(locus)
```
```{r}
# Spot check to make sure calculations and filtering are correctly done
perc.meth.withdata %>% filter(HC.1x.count<=1 & SS.5x.count>=7)
getData(united0L) %>% filter(chr=="Contig1", start=="2396")
perc.meth.withdata %>% filter(SS.1x.count<=1 & HC.5x.count>=7)
getData(united0L) %>% filter(chr=="Contig137318", start=="3905")
```
```{r}
# Save bed files - separate file for each population with loci that were "filled in with 10 zeros"
# loci with data in 1 or less Hood Canal sample, but 5x coverage in at least 7/9 South Sound samples
perc.meth.withdata %>% filter(HC.1x.count<=1 & SS.5x.count>=7) %>% separate(locus, sep="\\.", into=c("chr", "start", "end")) %>%
write_delim("../analyses/methylation/meth-highSS-nodataHC.bed", delim = '\t', col_names = TRUE)
# loci with data in 1 or less South Sound sample, but 5x coverage in at least 7/9 Hood Canal samples
perc.meth.withdata %>% filter(SS.1x.count<=1 & HC.5x.count>=7) %>% separate(locus, sep="\\.", into=c("chr", "start", "end")) %>%
write_delim("../analyses/methylation/meth-highHC-nodataSS.bed", delim = '\t', col_names = TRUE)
# Save a single bed file with loci that were filled in with zeros for both populations, and a column indicating which population was the "no data" population
rbind(perc.meth.withdata %>% filter(HC.1x.count<=1 & SS.5x.count>=7) %>% mutate(pop.no.data="HC"),
perc.meth.withdata %>% filter(SS.1x.count<=1 & HC.5x.count>=7) %>% mutate(pop.no.data="SS")) %>%
separate(locus, sep="\\.", into=c("chr", "start", "end")) %>%
write_delim("../analyses/methylation/meth-loci-no-data-both-pops.bed", delim = '\t', col_names = TRUE)
```
##### Which of these loci are located within a gene (+/- 2kb around genes)
```{bash}
bedtools intersect -wb -a "../analyses/methylation/meth-highSS-nodataHC.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" > ../analyses/methylation/meth-highSS-nodataHC-genes.bed
bedtools intersect -wb -a "../analyses/methylation/meth-highHC-nodataSS.bed" -b "../genome-features/Olurida_v081-20190709.gene.2kbslop.gff" > ../analyses/methylation/meth-highHC-nodataSS-genes.bed
```
### Examine raw data by population - how many loci have data in some population but not the other?
```{r}
(meth.highSS.nodataHC.genes <- read_delim(here::here("analyses","methylation", "meth-highSS-nodataHC-genes.bed"), delim = '\t', col_names = F) %>% as_tibble() %>% dplyr::select(c(-10,-11,-12,-15,-16,-17)) %>%
setNames(c("contig","start","end","HC.1x.count","SS.1x.count", "HC.meanmeth","SS.meanmeth", "HC.5x.count", "SS.5x.count", "start.gene", "end.gene", "attribute")) %>%
mutate(ID=str_extract(attribute, "ID=(.*?);"),
Note=str_extract(attribute, "Note=(.*?);"),
Ontology_term=str_extract(attribute, "Ontology_term=(.*?);"),
SPID=str_extract(attribute, "SPID=(.*?);")) %>%
mutate(SPID=gsub("SPID=", "", SPID)) %>% mutate(SPID=gsub(";", "", SPID)))
write.csv(meth.highSS.nodataHC.genes%>% filter(!is.na(SPID)), file = "../analyses/methylation/meth.highSS.nodataHC.genes.csv")
(meth.highHC.nodataSS.genes <- read_delim(here::here("analyses","methylation","meth-highHC-nodataSS-genes.bed"), delim = '\t', col_names = F) %>% as_tibble() %>% dplyr::select(c(-10,-11,-12,-15,-16,-17)) %>%
setNames(c("contig","start","end","HC.1x.count","SS.1x.count", "HC.meanmeth","SS.meanmeth", "HC.5x.count", "SS.5x.count", "start.gene", "end.gene", "attribute")) %>%
mutate(ID=str_extract(attribute, "ID=(.*?);"),
Note=str_extract(attribute, "Note=(.*?);"),
Ontology_term=str_extract(attribute, "Ontology_term=(.*?);"),
SPID=str_extract(attribute, "SPID=(.*?);")) %>%
mutate(SPID=gsub("SPID=", "", SPID)) %>% mutate(SPID=gsub(";", "", SPID)))
write.csv(meth.highHC.nodataSS.genes%>% filter(!is.na(SPID)), file = "../analyses/methylation/meth.highHC.nodataSS.genes.csv")
# ### Copy Uniprot accession numbers for genes that are annotated
# meth.highSS.nodataHC.genes %>% filter(!is.na(SPID)) %>% select(SPID) %>%
# na.omit() %>% as.vector() %>% unique() %>%
# write_clip() #copy to clipboard
#
# ### Copy Uniprot accession numbers for genes that are annotated
# meth.highHC.nodataSS.genes %>% filter(!is.na(SPID)) %>% select(SPID) %>%
# na.omit() %>% as.vector() %>% unique() %>%
# write_clip() #copy to clipboard
```
# UPDATED ANALYSIS 8/19/2021
# Here I replace these instances of "no data" with zeros
```{r}
# First, create dataframe object of all united samples and loci with new "locus" column that I need in next code chunk
(united0L.df <- united0L %>% getData() %>% mutate(locus=paste(chr,start,end, sep = ".")))
```
```{r}
# FILL IN THE NO-DATA IN HOOD CANAL
# Create a dataframe that only includes the :no data" loci I want to fill in for Hood Canal
(nodata.HC <- perc.meth.withdata %>% filter(HC.1x.count<=1 & SS.5x.count>=7))
for (i in 1:nrow(nodata.HC)) {
print(i) #show progress
## STEP 1
### Replace "Coverage" columns with 5
united0L.df[united0L.df$locus==(nodata.HC$locus)[i], grep("coverage1$|coverage2$|coverage3$|coverage4$|coverage5$|coverage6$|coverage7$|coverage8$|coverage9$", names(united0L.df))] <- united0L.df[united0L.df$locus==(nodata.HC$locus)[i], grep("coverage1$|coverage2$|coverage3$|coverage4$|coverage5$|coverage6$|coverage7$|coverage8$|coverage9$", names(united0L.df))] %>%
replace(., is.na(.), 5)
## STEP 2
## REPLACE "NumCs" column with zeros (indicating none were methylated)
# Extract a single locus, get coverage columns for Hood Canal only, then identify those that have zero coverage in our data and replace with 5x
united0L.df[united0L.df$locus==(nodata.HC$locus)[i], grep("numCs1$|numCs2$|numCs3$|numCs4$|numCs5$|numCs6$|numCs7$|numCs8$|numCs9$", names(united0L.df))] <- united0L.df[united0L.df$locus==(nodata.HC$locus)[i], grep("numCs1$|numCs2$|numCs3$|numCs4$|numCs5$|numCs6$|numCs7$|numCs8$|numCs9$", names(united0L.df))] %>%
replace(., is.na(.), 0)
## STEP 3
## REPLACE "NumTs" column with five (indicating all reads were converted)
# Extract a single locus, get coverage columns for Hood Canal only, then identify those that have zero coverage in our data and replace with 5x
united0L.df[united0L.df$locus==(nodata.HC$locus)[i], grep("numTs1$|numTs2$|numTs3$|numTs4$|numTs5$|numTs6$|numTs7$|numTs8$|numTs9$", names(united0L.df))] <- united0L.df[united0L.df$locus==(nodata.HC$locus)[i], grep("numTs1$|numTs2$|numTs3$|numTs4$|numTs5$|numTs6$|numTs7$|numTs8$|numTs9$", names(united0L.df))] %>%
replace(., is.na(.), 5)
}
```
```{r}
# FILL IN THE NO-DATA IN SOUTH SOUND
# Create a dataframe that only includes the :no data" loci I want to fill in for Hood Canal
(nodata.SS <- perc.meth.withdata %>% filter(SS.1x.count<=1 & HC.5x.count>=7))
for (i in 1:nrow(nodata.SS)) {
print(i) #show progress
## STEP 1
### Replace "Coverage" columns with 5
united0L.df[united0L.df$locus==(nodata.SS$locus)[i], grep("coverage10|coverage11|coverage12|coverage13|coverage14|coverage15|coverage16|coverage17|coverage18", names(united0L.df))] <- united0L.df[united0L.df$locus==(nodata.SS$locus)[i], grep("coverage10|coverage11|coverage12|coverage13|coverage14|coverage15|coverage16|coverage17|coverage18", names(united0L.df))] %>%
replace(., is.na(.), 5)
## STEP 2
## REPLACE "NumCs" column with zeros (indicating none were methylated)
# Extract a single locus, get coverage columns for Hood Canal only, then identify those that have zero coverage in our data and replace with 5x
united0L.df[united0L.df$locus==(nodata.SS$locus)[i], grep("numCs10|numCs11|numCs12|numCs13|numCs14|numCs15|numCs16|numCs17|numCs18", names(united0L.df))] <- united0L.df[united0L.df$locus==(nodata.SS$locus)[i], grep("numCs10|numCs11|numCs12|numCs13|numCs14|numCs15|numCs16|numCs17|numCs18", names(united0L.df))] %>%
replace(., is.na(.), 0)
## STEP 3
## REPLACE "NumTs" column with five (indicating all reads were converted)
# Extract a single locus, get coverage columns for Hood Canal only, then identify those that have zero coverage in our data and replace with 5x
united0L.df[united0L.df$locus==(nodata.SS$locus)[i], grep("numTs10|numTs11|numTs12|numTs13|numTs14|numTs15|numTs16|numTs17|numTs18", names(united0L.df))] <- united0L.df[united0L.df$locus==(nodata.SS$locus)[i], grep("numTs10|numTs11|numTs12|numTs13|numTs14|numTs15|numTs16|numTs17|numTs18", names(united0L.df))] %>%
replace(., is.na(.), 5)
}
```
```{r}
# confirm that i did it corrrectly
united0L.df[united0L.df$locus %in% nodata.HC$locus,]
# confirm that i did it correctly
united0L.df[united0L.df$locus %in% nodata.SS$locus,]
# and that other loci didn't also get changed
united0L.df[1:100,]
```
## Filter out loci where coverage is less than 5x in more than 2/9 samples
```{r}
united0L.df.filtered <- united0L.df %>%
mutate(HC.5x.count=apply(dplyr::select(., c(matches("coverage1$|coverage2$|coverage3$|coverage4$|coverage5$|coverage6$|coverage7$|coverage8$|coverage9$"))), MARGIN = 1, function(x) sum(x > 4, na.rm=T))) %>%
mutate(SS.5x.count=apply(dplyr::select(., c(matches("coverage10|coverage11|coverage12|coverage13|coverage14|coverage15|coverage16|coverage17|coverage18"))), MARGIN = 1, function(x) sum(x > 4, na.rm=T))) %>%
filter(HC.5x.count>=7 & SS.5x.count>=7) %>%
dplyr::select(-HC.5x.count, -SS.5x.count)
```
### 12/21/2022 - THIS STEP IS JUST INCLUDED FOR SUPPLEMENTAL TO ADDRESS REVIEWER COMMENTS
In previous steps I filtered for loci that had min 5x coverage in at least 7/9 samples. BUT I did not replace the samples that had low coverage with NA values, and thus % methylation for some samples for some loci is calculated at low coverage. This introduced noise to our results. Here, I replace data with NA for samples/loci that had <5x coverage.
```{r}
united0L.df.filtered.na <- united0L.df.filtered
# Use for loop to go through each sample's coverage, numCs, and numTs columns, and replace those with coverage <5x with NA
for (i in seq(5, 56, 3)) { # coverage columns are every 3rd column, start at the 5th column, so for loop uses i= c(5, 8, 11, 14 ...)
united0L.df.filtered.na[,i] <- replace(united0L.df.filtered.na[,i], united0L.df.filtered.na[,i] < 5, NA) #replace coverage columns with <5x with NA
united0L.df.filtered.na[,i+1][is.na(united0L.df.filtered.na[,i])] <- NA #replace numCs columns with NA if that sample's coverage column is NA
united0L.df.filtered.na[,i+2][is.na(united0L.df.filtered.na[,i])] <- NA #replace numTs columns with NA if that sample's coverage column is NA
}
```
## Double check that the "no data" loci are still included in the filtered dataframe
```{r}
# confirm that i did it corrrectly
united0L.df.filtered.na[united0L.df.filtered.na$locus %in% nodata.HC$locus,] #yes, 149 loci
# confirm that i did it correctly
united0L.df.filtered.na[united0L.df.filtered.na$locus %in% nodata.SS$locus,] #yes, 102 loci
# And double check the number of loci at 5x in 7/9 samples w/o filled-in data
subset(united0L.df.filtered.na, !(locus %in% c(nodata.SS$locus,nodata.HC$locus))) #252,115 loci
```
```{r}
united0L.df.filtered.na
```
# Create a methylBase object that includes data replaced for zeros in some instances. These instances include ... loci with data in 1 Hood Canal sample (any coverage) & 5x coverage (or greater) in 7/9 samples in South Sound, and visa ersa.
```{r}
meth_filter=new("methylBase",
dplyr::select(united0L.df.filtered.na, -locus),
sample.ids=united0L@sample.ids,
assembly=united0L@assembly,
context=united0L@context,
treatment=united0L@treatment,
coverage.index=united0L@coverage.index,
numCs.index=united0L@numCs.index,
numTs.index=united0L@numTs.index,
destranded=united0L@destranded,
resolution=united0L@resolution,
names=united0L@names)
meth_filter
save(meth_filter, file = "../analyses/methylation/R-objects/meth_filter") #save object to file
```
```{r}
# OLD ANALYSIS - DOES NOT INCLUDE THE "NO DATA" LOCI
### Filter out loci where coverage is less than 5x or greater than 100x. Then, column bind all samples, keeping only loci that are present in 7 of the 9 samples. Also, destrand.
# filtered.myobj=filterByCoverage(myobj_18,lo.count=10,lo.perc=NULL,
# hi.count=100,hi.perc=NULL)
# save(filtered.myobj, file="../analyses/methylation/R-objects/filtered.myobj")
#
# meth_filter=methylKit::unite(filtered.myobj, destrand=TRUE, min.per.group=7L)
# save(meth_filter, file = "../analyses/methylation/R-objects/meth_filter") #save object to file
```
# Back to main analysis
## load "meth_filter" object if needed.
```{r}
load(file = "../analyses/methylation/R-objects/meth_filter")
```
```{r}
# require(tidyverse)
# library(matrixStats)
meth_filter %>% getData() %>% dplyr::select(starts_with("coverage")) %>%
rowwise() %>% sapply(., function(x) c(mean=mean(x, na.rm=T), median=median(x, na.rm=T),
var=var(x, na.rm=T), sd=sd(x, na.rm=T), max=max(x, na.rm=T))) %>%
round(digits = 2) %>% as.data.frame() %>% t() %>% as.data.frame() %>% rownames_to_column(var = "sample") %>%
mutate(population=sample) %>% mutate(population=case_when(grepl("10|11|12|13|14|15|16|17|18", population) ~ "SS", TRUE ~ "HC")) %>%
write_delim(file = "../analyses/methylation/coverage-statistics.tab", delim = "\t", col_names = TRUE)
meth_filter %>% getData() %>% dplyr::select(-starts_with("num"), -strand) %>% pivot_longer(cols = starts_with("coverage")) %>%
mutate(population=name) %>% mutate(population=case_when(grepl("10|11|12|13|14|15|16|17|18", population) ~ "SS", TRUE ~ "HC")) %>%
ggplot(aes(value, col=population)) + geom_histogram() + facet_wrap(~name, ncol=5)
# # For each sample, how many loci have <5x coverage?
# test <- meth_filter %>% getData() %>% dplyr::select(starts_with("coverage"))
# 100*sum((test$coverage1 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage1)
# 100*sum((test$coverage2 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage2)
# 100*sum((test$coverage3 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage3)
# 100*sum((test$coverage4 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage4)
# 100*sum((test$coverage5 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage5)
# 100*sum((test$coverage6 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage6)
# 100*sum((test$coverage7 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage7)
# 100*sum((test$coverage8 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage8)
# 100*sum((test$coverage9 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage9)
#
#
# 100*sum((test$coverage10 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage10)
# 100*sum((test$coverage12 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage12)
# 100*sum((test$coverage13 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage13)
# 100*sum((test$coverage14 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage14)
# 100*sum((test$coverage15 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage15)
# 100*sum((test$coverage16 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage16)
# 100*sum((test$coverage17 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage17)
# 100*sum((test$coverage18 <= 4) == TRUE, na.rm=TRUE)/length(test$coverage18)
```
```{r}
perc.meth=percMethylation(meth_filter, rowids=T)
```
Percent methylation matrix (rows=region/base, columns=sample) can be extracted from methylBase object by using percMethylation function. This can be useful for downstream analyses.
Here I create % methylation matrices from the filtered object, and use it to do another cluster analysis
```{r}
perc.meth=percMethylation(meth_filter, rowids=T)
hist(perc.meth, col="gray50" )
# Make sure that the perc.meth includes the no-data loci
perc.meth
# Save % methylation df to object and .tab file
save(perc.meth, file = "../analyses/methylation/R-objects/perc.meth") #save object to file
#load(file = "../analyses/methylation/R-objects/perc.meth") #load object if needed
write.table((as.data.frame(perc.meth) %>% tibble::rownames_to_column("contig")), file = here::here("analyses", "methylation", "percent-methylation-filtered.tab"), sep = '\t', na = "NA", row.names = FALSE, col.names = TRUE)
# What percentage of loci have ZERO methylation?
100*table(perc.meth==0)[2]/table(perc.meth==0)[1]
# 0.8% of loci unmethylated, averaged across all samples
# 1.4% after adding the "no data" loci
# 1.2% after replacing the <5x coverage with NA
# Mean % methylation across all samples
mean(perc.meth, na.rm=TRUE)
# 89.6%
# 88.5% after adding "no data" loci
# 88.7% after replacing the <5x coverage with NA
# Mean % methylation for each populaiton
mean(perc.meth[,1:9], na.rm=TRUE) # 89.6% <-- Hood Canal #new filtering= 88.47% #replacing <5x with NA=88.7%
mean(perc.meth[,10:18], na.rm=TRUE) # 89.6% <-- South Sound #new filtering=88.60% #88.7%
perc.meth.summ <- data.frame(sample=1:18, methylated=1:18, coverage=1:18, mean.perc=1:18)
for (i in 1:18) {
perc.meth.summ[i,"sample"] <- i
perc.meth.summ[i,"methylated"] <-mean(as.vector(getData(meth_filter)[,grep(c("numCs"), colnames(meth_filter))][i][,1]), na.rm=T)
perc.meth.summ[i,"coverage"] <- mean(as.vector(getData(meth_filter)[,grep(c("coverage"), colnames(meth_filter))][i][,1]), na.rm=T)
perc.meth.summ[i,"mean.perc"] <- mean(as.vector(getData(meth_filter)[,grep(c("numCs"), colnames(meth_filter))][i][,1]) /
as.vector(getData(meth_filter)[,grep(c("coverage"), colnames(meth_filter))][i][,1]), na.rm=T)
}
mean(perc.meth.summ[1:9,"mean.perc"]) #88.68%
mean(perc.meth.summ[10:18,"mean.perc"]) #88.73%
paste("No. of loci in filtered dataset:", nrow(perc.meth)) #new filtering = 252,366 loci
paste("No. of samples:", ncol(perc.meth)) #new filtering=18 samples
```
```{r}
myDiff=calculateDiffMeth(meth_filter,mc.cores=4)
```
```{r}
# get hyper methylated bases
myDiff25p.hyper=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hyper")
#
# get hypo methylated bases
myDiff25p.hypo=getMethylDiff(myDiff,difference=25,qvalue=0.01,type="hypo")
#
#
# get all differentially methylated bases
myDiff25p=getMethylDiff(myDiff,difference=25,qvalue=0.01)
getData(myDiff25p)
# get deferentially methylated bases using 12% difference and save to files
myDiff12p=getMethylDiff(myDiff,difference=12,qvalue=0.01)
write.table(myDiff12p, file = "../analyses/DMLs/myDiff12p.tab", sep = "\t", col.names = TRUE)
write_delim(getData(myDiff12p) %>% mutate(start = start-1, end = end+1) %>% dplyr::select(chr, start, end, meth.diff),
"../analyses/DMLs/dml12.bed", delim = '\t', col_names = TRUE)
```
```{r}
write.table(myDiff25p, file = "../analyses/DMLs/myDiff25p.tab", sep = "\t", col.names = TRUE)
save(myDiff25p, file="../analyses/DMLs/R-objects/myDiff25p")
#load(file="../analyses/DMLs/R-objects/myDiff25p") #load if needed
```
---
### Take the DMLs to a bed
```{r}
# Write out bedfile for DMLs
dml25 <- myDiff25p %>% getData() %>%
mutate(start = start-1, end = end+1) %>%
dplyr::select(chr, start, end, meth.diff)
write_delim(dml25, "../analyses/DMLs/dml25.bed", delim = '\t', col_names = TRUE)
write_delim(dml25, "../analyses/DMLs/dml25_forIGV.bed", delim = '\t', col_names = FALSE) #no column names for IGV
save(dml25, file="../analyses/DMLs/R-objects/dml25")
#load(file="../analyses/DMLs/R-objects/dml25") #load dml25 if needed
# Write out bedfile for all loci fed into DML analysis, to use as background for enrichment
mydiff.all <- mutate(getData(myDiff), start = start -1, end = end + 1) %>% dplyr::select(chr, start, end, meth.diff) %>% mutate_if(is.numeric, as.integer)
write_delim(mydiff.all, "../analyses/DMLs/mydiff-all.bed", delim = '\t', col_names = TRUE)
save(mydiff.all, file="../analyses/DMLs/R-objects/mydiff.all")
#load(file="../analyses/DMLs/R-objects/mydiff.all")
# Save DML bed files with hypermethylated loci in each population
#in the df "dml25", (-) values = hypermethylated in HC, and (+) = hypermethylated in SS
paste("No. of DMLs that had higher methylation in HC= ", dml25 %>%filter(meth.diff<0)%>%nrow())
1802/nrow(dml25)
paste("No. of DMLs that had higher methylation in SS= ", dml25 %>%filter(meth.diff>0)%>%nrow())
2109/nrow(dml25)
write_delim(dml25 %>%filter(meth.diff<0), "../analyses/DMLs/dml25-hypermethHC.bed", delim = '\t', col_names = TRUE)
write_delim(dml25 %>%filter(meth.diff>0), "../analyses/DMLs/dml25-hypermethSS.bed", delim = '\t', col_names = TRUE)
```
### This is the number of loci that are differentially methylated among populations (DMLs)
```{r}
paste("No. of DMLs:", nrow(dml25))
paste("Percent of loci (filtered for 5x for 7 of 9 samples per pop) that are DMLs: ", round(nrow(dml25)/nrow(perc.meth)*100, digits=3), "%", sep="")
```
### How many of our "zero-added" loci are also DMLs?
```{r}
paste("Number of the `zero-added` loci that are DMLs = ",
inner_join(
rbind(perc.meth.withdata %>% filter(HC.1x.count<=1 & SS.5x.count>=7) %>% mutate(pop.no.data="HC"),
perc.meth.withdata %>% filter(SS.1x.count<=1 & HC.5x.count>=7) %>% mutate(pop.no.data="SS")) %>%
separate(locus, sep="\\.", into=c("chr", "start", "end")) %>%
dplyr::select(chr, start, end) %>% mutate_at(c("start", "end"), as.numeric) %>%
mutate(start=start-1, end=end+1),
dml25
) %>% nrow())
```
### Subset the filtered methylation count dataframe for only those that are differentially methylated. Save object to file.
```{r}
dml25_counts <- selectByOverlap(meth_filter,as(myDiff25p,"GRanges"))
class(dml25_counts) #class should be methylBase
perc.methDMLs= percMethylation(dml25_counts, rowids=T) #create a percent methylated object for DMLs
# save count and percent objects to be used in subsequent notebook
save(dml25_counts, file = "../analyses/DMLs/R-objects/dml25_counts")
save(perc.methDMLs, file="../analyses/DMLs/R-objects/perc.methDMLs")
```
### Extract correlation matrix (pearson) and distance matrix (manhattan) from the % methylated matrix
FYI I checked that the correlation matrix in `getCorrelation` is derived from the percent methylated matrix. It is!
```{r}
cor.pears <- cor(perc.meth, method="pearson") # create pearson correlation matrix.
dist.manhat <- dist(t(perc.meth), method = "manhattan", diag = T, upper = F)
dist.manhat.df <- reshape2::melt(as.matrix(dist.manhat), varnames = c("row", "col"))
cor.pears.DMLs <- cor(perc.methDMLs, method="pearson")
dist.manhat.DMLs <- dist(t(perc.methDMLs), method = "manhattan", diag = T, upper = F)
dist.manhat.DMLs.df <- reshape2::melt(as.matrix(dist.manhat.DMLs), varnames = c("row", "col"))
```
### Read in sample number key, merge with distance matrix dataframes
```{r}
key <- read.csv(here::here("data", "sample-key.csv"))
dist.manhat.df <- merge(x=merge(x=dist.manhat.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
colnames(dist.manhat.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
write.csv(dist.manhat.df, file=here::here("analyses", "methylation", "dist.manhat.csv"), row.names=FALSE)
dist.manhat.DMLs.df <- merge(x=merge(x=dist.manhat.DMLs.df, y=key, by.x="row", by.y="MBD.FILENAME"), y=key, by.x="col", by.y="MBD.FILENAME")
colnames(dist.manhat.DMLs.df) <- c("SeqNum.row", "SeqNum.col", "dist.manh", "SampNum.row", "SampNum.col")
write.csv(dist.manhat.DMLs.df, file=here::here("analyses", "methylation", "dist.manhat.DMLs.csv"), row.names=FALSE)
```
### Figures
### Hierarchical Clustering using filtered methylation data
The function clusters samples using hclust function and various distance metrics derived from percent methylation per base or per region for each sample.
```{r}
clusterSamples(meth_filter, dist="correlation", method="ward", plot=TRUE)
```
### Principal Component Analyses, all loci after filtering
Customized `methylKit` biplots from Yaamini Venkataraman's code, [https://github.com/epigeneticstoocean/paper-gonad-meth/blob/master/code/04-methylkit.Rmd#L270](https://github.com/epigeneticstoocean/paper-gonad-meth/blob/master/code/04-methylkit.Rmd#L270)
```{r, fig.show='hide'}
#load("../analyses/methylation/R-objects/meth_filter") #load filtered meth data object if needed
#Create dataframe with sample treatment information, color & symbol scheme
plotCustomization <- data.frame(sample = 1:18, population=c(
rep("HC", times=9),
rep("SS", times=9)),
color=c(rep("firebrick3", times=9),
rep("dodgerblue3", times=9)),
symbol=c(rep(16, times=9),
rep(17, times=9)))
PCA.filtered <- PCASamples(meth_filter, obj.return = TRUE) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
summary(PCA.filtered) #Look at summary statistics. The first PC explains 9.87% of variation, the second PC explains 8.51% of variation #new analysis: PC1=9.87% PC2=8.51% #after replacing >5x with NAs: PC1=10.7% PC2=8.6%
```
### PCA Figure, all methylated loci after filtering
```{r}
# Save size: width=650 height=600
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.filtered, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 2.5) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
#ordiellipse(PCA.filtered, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (10.7%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (8.6%)", line = 3, cex = 1.5) #Add y-axis label
legend("topleft",
pch = c(16, 17),
legend = c("Hood Canal", "South Sound"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.6, bty = "n") #Add a legend with information about ambient and elevated samples
```
### Save PC scores and R object for integration with genetic data
NOTE: this PCA was run with _all_ methylated loci data after filtering for 5x across 7 of 9 samples per pop.
```{r}
save(PCA.filtered, file="../analyses/methylation/R-objects/PCA.filtered")
write_delim(data.frame(PCA.filtered$x) %>% tibble::rownames_to_column("sample"), "../analyses/methylation/PC-scores-filtered-methylation.tab", delim = '\t', col_names = TRUE)
```
## permANOVA and dispersion test, % methylation by population
```{r}
perc.meth.d<-vegdist(data.frame(t(perc.meth)),"euclidean", na.rm = TRUE)
print(meth.perm <- adonis(data.frame(t(perc.meth)) ~ population, data=plotCustomization,
permutations=1000, method='euclidean', na.rm = TRUE)) #yes, global population difference
hist(meth.perm$f.perms, main="Histogram of pseudo-F values under the null model",
xlab="pseudo=F values", col="gray", xlim=c(0.5,2.5))
abline(v=1.9743, col="red") # indicates F-value is significant.
source("../resources/biostats.R")
pca.eigenval(PCA.filtered) #The Proporation of Variance = %variance explained
screeplot(PCA.filtered, bstick=TRUE)
# check sign. of PCs - can't get this to work, though.
#ordi.monte(PCA.filtered, ord="pca", dim=2, perm = 1000, scale = F, plot = F)
anova(meth.bdp<-betadisper(perc.meth.d,plotCustomization$population)) #no sign. difference in dispersion, which is good
boxplot(meth.bdp)
# perform PCoA just for fun on distance matrix
meth.pcoa<-cmdscale(perc.meth.d, eig=TRUE, add=T)
ordiplot(meth.pcoa, choices = c(1, 2), type="text", display="sites", xlab="PC 1", ylab="PC 2")
```
### Principal Component Analyses, DMLs only
```{r, fig.show='hide'}
load("../analyses/DMLs/R-objects/dml25_counts")
PCA.DMLs <- PCASamples(dml25_counts, obj.return = TRUE, sd.filter = T) #Run a PCA analysis on percent methylation for all samples. methylKit uses prcomp to create the PCA matrix
nrow(PCA.DMLs$rotation)
summary(PCA.DMLs) #Look at summary statistics. The first PC explains 36.3% of variation, the second PC explains 8.5% of variation. # new analysis/filtering AND >5x replace with NA: PC1=36.5% PC2=9.3%
```
### PCA Figure, DMLs only
```{r}
par(mar = c(5, 5, 1, 1)) #Specify inner and outer margins
PCA.figure <- ordiplot(PCA.DMLs, choices = c(1, 2), type = "none", display = "sites", cex = 0.5, xlab = "", ylab = "", xaxt = "n", yaxt = "n") #Use ordiplot to create base biplot. Do not add any points
points(PCA.figure, "sites", col = as.character(plotCustomization$color), pch = plotCustomization$symbol, cex = 2.5) #Add each sample. Darker samples are ambient, lighter samples are elevated pCO2
#Add multiple white boxes on top of the default black box to manually change the color
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
box(col = "white")
#ordiellipse(PCA.DMLs, plotCustomization$population, show.groups = "HC", col = "firebrick3") #Add confidence ellipse around the samples in elevated pCO2
#ordiellipse(PCA.DMLs, plotCustomization$population, show.groups = "SS", col = "dodgerblue3") #Add confidence ellipse around the samples in ambient pCO2
axis(side = 1, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add x-axis
mtext(side = 1, text = "PC 1 (36.5%)", line = 3, cex = 1.5) #Add x-axis label
axis(side = 2, labels = TRUE, col = "grey80", cex.axis = 1.7) #Add y-axis
mtext(side = 2, text = "PC 2 (9.3%)", line = 3, cex = 1.5) #Add y-axis label
legend("topright",
pch = c(16, 17),
legend = c("Hood Canal", "South Sound"),
col = c(c("firebrick3", "dodgerblue3")),
cex = 1.4, bty = "n") #Add a legend with information about ambient and elevated samples
save(PCA.figure, file="../analyses/DMLs/R-objects/PCA.figure")
```
### Save PC scores and R object for integration with genetic data
NOTE: this PCA was run with DMLs only
```{r}
save(PCA.DMLs, file="../analyses/DMLs/R-objects/PCA.DMLs")
write_delim(data.frame(PCA.DMLs$x) %>% tibble::rownames_to_column("sample"), "../analyses/DMLs/PC-scores-DMLs.tab", delim = '\t', col_names = TRUE)
```
### Reshape filtered data to create a new column with sample info, and calculate %methylation for each locus/sample
```{r}
meth_filter_reshaped <- melt(data=meth_filter, id=c("chr", "start", "end", "strand"), value.name = "count") %>%
tidyr::separate(col = "variable" , into = c("variable", "sample"), sep = "(?<=[a-zA-Z])\\s*(?=[0-9])") %>%
dcast(chr+start+end+strand+sample ~ variable) %>%
mutate(percMeth = 100*(numCs/coverage)) %>%
mutate(population=as.character(sample))
meth_filter_reshaped$population <- car::recode(meth_filter_reshaped$population, "c('1', '2', '3', '4', '5', '6', '7', '8', '9')='HC'")
meth_filter_reshaped$population <- car::recode(meth_filter_reshaped$population, "c('10','11','12','13','14','15','16','17','18')='SS'")
readr::write_delim(meth_filter_reshaped, here::here("analyses", "methylation", "meth_filter_long.tab"), delim = '\t', col_names = FALSE)
save(meth_filter_reshaped, file="../analyses/methylation/R-objects/meth_filter_reshaped")
#load(file="../analyses/methylation/R-objects/meth_filter_reshaped")
```
## Check out overall distribution of filtered methylation data by population
Any trends? Looks like HC has more instances of ~100% methylation with a lower second peak at around ~92%. Compared to SS that has a lower peak at ~100% and a second around 95%.
```{r}
meth_filter_reshaped %>%
ggplot(aes(x=population, y=percMeth, fill=population)) +
geom_violin(width=1, alpha=0.8) + theme_minimal() +
ylab("% CpG Methylation") + theme(axis.title.x = element_blank(), legend.position = "none",
text = element_text(size=16)) +
scale_x_discrete(labels=c("HC" = "Hood Canal\nPopulation", "SS" = "South Sound\nPopulation")) +
scale_fill_manual(values=c("firebrick3", "dodgerblue3"))
```
View distributions in density plot
```{r}
ggplotly(meth_filter_reshaped %>%
ggplot(aes(x=percMeth, fill=population, color=population)) +
geom_density(alpha=0.5) +
scale_x_discrete(labels=c("HC" = "Hood Canal\nPopulation", "SS" = "South Sound\nPopulation")) +
scale_color_manual(values=c("firebrick3", "dodgerblue3")) +
scale_fill_manual(values=c("firebrick3", "dodgerblue3")) +
theme_minimal() +
ylab("Density") + xlab("% CpG Methylation") +
theme(text = element_text(size=18), legend.position = "none"))
```
Find location of peaks in each population. Used this as a reference: http://ianmadd.github.io/pages/PeakDensityDistribution.html
```{r}
## Find largest peak
# Find largest peak for HC
density(subset(meth_filter_reshaped, population=="HC" & percMeth!="NA")$percMeth) # Y maximum is 0.332
# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_filter_reshaped, population=="HC" & percMeth!="NA")$percMeth)$y) #answer = index #504
# Find the x value where the largest Y value is
density(subset(meth_filter_reshaped, population=="HC" & percMeth!="NA")$percMeth)$x[504] #HC max (0.332) is @ 100%
##--------------
# Find largest peak for SS
density(subset(meth_filter_reshaped, population=="SS" & percMeth!="NA")$percMeth) # Y maximum is 0.280
which.max(density(subset(meth_filter_reshaped, population=="SS" & percMeth!="NA")$percMeth)$y) #answer = 504
density(subset(meth_filter_reshaped, population=="SS" & percMeth!="NA")$percMeth)$x[502] #SS max (0.280) is 99.5%
## Find 2nd largest peak. NOTE: I looked at the ggplotly density figure above and noted that the 2nd peaks end before ~98%, so I subset the percMeth data to retain only those <98%, then find the peak
# Find 2nd largest peak for HC
density(subset(meth_filter_reshaped, population=="HC" & percMeth!="NA" & percMeth<98)$percMeth) # Y maximum is 0.0889
# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_filter_reshaped, population=="HC" & percMeth!="NA" & percMeth<98)$percMeth)$y) #answer = index #475
# Find the x value where the largest Y value is
density(subset(meth_filter_reshaped, population=="HC" & percMeth!="NA" & percMeth<98)$percMeth)$x[474] #HC max (0.0889) is @ 92.40%
#---
# Find 2nd largest peak for SS
density(subset(meth_filter_reshaped, population=="SS" & percMeth!="NA" & percMeth<98)$percMeth) # Y maximum is 0.0651
# Find where that maximum lies (this gives the index #)
which.max(density(subset(meth_filter_reshaped, population=="SS" & percMeth!="NA" & percMeth<98)$percMeth)$y) #answer = index #483
# Find the x value where the largest Y value is
density(subset(meth_filter_reshaped, population=="SS" & percMeth!="NA" & percMeth<98)$percMeth)$x[488] #SS max (0.0889) is @ 94.95%
#----- # of loci
meth_filter_reshaped %>%
filter(population=="HC" & percMeth!="NA") %>%
select(chr, start) %>% unique() %>% nrow() #252,366
meth_filter_reshaped %>%
filter(population=="SS" & percMeth!="NA") %>%
select(chr, start) %>% unique() %>% nrow() #252,366 (good they are the same)
#----- average % meth by pop
meth_filter_reshaped %>%
filter(population=="SS" & percMeth!="NA") %>%
select(percMeth) %>% summary()
```
### Calculations of % methylation by population
```{r}
# mean % methylation across samples
dml25_counts_unique <- unique(getData(dml25_counts)[,c("chr", "start")]) %>%
mutate(chr_start=paste(chr, start, sep="_"))
#Calculate ratio between mean % methylation in Hood Canal : South Sound
DML.ratios <- meth_filter_reshaped %>%
mutate(chr_start=paste(chr, start, sep="_")) %>%
filter(chr_start %in% dml25_counts_unique$chr_start) %>%
group_by(population, chr, start) %>%
summarise(median_percentMeth = median(percMeth, na.rm = TRUE)) %>% #mean_percentMeth = mean(percMeth, na.rm = TRUE),
dcast(chr + start ~ population) %>%
mutate(ratio_HC.SS = HC/SS, #in this ratio column, values <1 = HC hypomethylated, values >1 = SS hypomethylated
chr_start=paste(chr, start, sep="_"),
diff_HC.SS = HC-SS) #this is the difference between HC & SS (HC - SS)
nrow(DML.ratios)==nrow(dml25_counts_unique) #confirm same # loci; should=TRUE
save(DML.ratios, file="../analyses/DMLs/R-objects/DML.ratios")
#load(file="../analyses/DMLs/R-objects/DML.ratios")
# For all loci, calculate mean and sd percent methylation across samples by population
DML.calcs <- meth_filter_reshaped %>%
group_by(population, chr, start) %>%
dplyr::summarise(
mean_percMeth = mean(percMeth, na.rm=TRUE),
sd_percMeth=sd(percMeth, na.rm=TRUE),
n()) %>%
filter(chr %in% c(DML.ratios$chr) &
start %in% c(DML.ratios$start))
```
```{r}
# For DMLs, any difference? somewhat- mean %meth for HC=60%, S=66%
ggplotly(
DML.ratios %>%
pivot_longer(cols = c(HC, SS),
names_to = "population",
values_to = "mean_meth") %>%
ggplot(aes(x=population, y=mean_meth, fill=population)) +
geom_violin(alpha=0.2) + geom_boxplot(alpha=0.4))
```
Divergent Bar plot of DMLs
```{r}
DML.ratios %>%
mutate(hypermethylated=ifelse(diff_HC.SS>0, "HC", "SS")) %>%
ggplot(aes(x = chr_start, y = diff_HC.SS, fill=hypermethylated)) +
geom_bar(stat = "identity", width=0.41) +
scale_x_discrete(limits=(DML.ratios[order(DML.ratios$diff_HC.SS),]$chr_start)) +
scale_y_continuous(limits=c(-80,80), breaks=c(-75, -50, -25, 0),
labels=c("-75"="75%", "-50"="50%", "-25"="25%", "0"="0%"),
sec.axis = sec_axis(trans=~.*1, breaks=c(75, 25, 50, 0),
labels=c("-75"="75%", "-50"="50%", "-25"="25%", "0"="0%"))) +
theme(axis.text.y=element_blank(), axis.ticks.y = element_blank(),
legend.position = "none", text = element_text(size=14, color="gray30"),
panel.background = element_blank(),
panel.border = element_rect(colour = "gray30", fill=NA, size=.4)) +
#ggtitle("DMLs, showing % methylation difference among populations") +
scale_fill_manual(values=c("firebrick3", "dodgerblue3")) +
#annotate(geom="text", x=-20, y=-65, label="Loci hypermethylated in\nSouth Sound", color="dodgerblue3") +
#annotate(geom="text", x=260, y=62, label="Loci hypermethylated in\nHood Canal", color="firebrick3") +
ylab("% methylation difference among populations") + xlab("Locus") + coord_flip()
ggsave(filename = "../analyses/DMLs/DMLs-diff-barplot.pdf", width=7, height=5)
```