-
Notifications
You must be signed in to change notification settings - Fork 0
/
BlodgettProjectManuscript_mostlyITS.R
5469 lines (4734 loc) · 240 KB
/
BlodgettProjectManuscript_mostlyITS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
library(data.table)
library(tidyverse)
library(ggplot2)
library(reshape2)
library(vegan)
library(phyloseq)
#
####
### AMPtk pre-processing --> ITS DADA2 workflow:
### https://github.com/nextgenusfs/amptk/blob/master/amptk/dada2_pipeline_nofilt.R
### https://benjjneb.github.io/dada2/ITS_workflow.html
####
#
###############################################################
#### Package installation and Overview Summary of Pipeline ####
###############################################################
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("dada2")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("ShortRead")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("Biostrings")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("decontam")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("phyloseq")
## AMPtk installation:
# https://amptk.readthedocs.io/en/latest/index.html
### OVERVIEW
# Step 1: run the AMPtk pre-processing step in terminal on local machine
# Step 2: explore the AMPtk preprocessing output in R, check quality, etc. on local machine
# Step 3: run DADA2 (test run on local machine with 1-2 samples, then as a SBATCH on Savio)
# Step 4: run decontam from R on local machine to account for taxa present in negative controls
# Step 5: stats!
##############################################################
#### STEP 1: run the AMPtk pre-processing step in terminal ####
###############################################################
#
# $ cd ~/BlodgettAmpliconSequencingRawData/
# $ conda activate amptk_v1.5.1
# $ amptk illumina -i allITS -o allITS_AMPTKpreprocessed -f AACTTTYRRCAAYGGATCWCT -r AGCCTCCGCTTATTGATATGCTTAART --rescue_forward off --primer_mismatch 6
#### Runtime on my MacbookPro + 4TB external hard drive = 2hrs 20min
#### total number of samples = 664
##################################################################################
#### STEP 2: explore the AMPtk preprocessing output in R, check quality, etc. ####
##################################################################################
library(dada2)
library(ShortRead)
library(Biostrings)
# Check for any primers remaining in sequences after AMPtk processing (from the DADA2 ITS tutorial)
# Create objects that are your primer sequences
FWD <- "AACTTTYRRCAAYGGATCWCT" #5.8sfun
REV <- "AGCCTCCGCTTATTGATATGCTTAART" #ITS4fun
# Verify the correct sequence and orientation of those primer sequences!
# Create all orientations of the input sequence
allOrients <- function(primer) {
require(Biostrings)
dna <- DNAString(primer) #Biostrings package works w/ DNAString objects rather than character vectors
orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna),
RevComp = reverseComplement(dna))
return(sapply(orients, toString)) # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients # "Forward" sequence matches what I assigned above as FWD
REV.orients # "Forward" sequence matched what I assigned above as REV
# Count the number of times the primers appear in the forward and reverse read,
# while considering all possible primer orientations.
primerHits <- function(primer, fn) {
# Counts number of reads in which the primer is found
nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
return(sum(nhits > 0))
}
# Designate File Path to the AMPtk output folder
path <- "~/BlodgettAmpliconSequencingRawData/allITS_AMPTKpreprocessed"
# check that the path is correct
list.files(path)
#each sample has five different file types:
# (1) _R1.fq ## forward reads, left primer removed, low quality reads removed
# (2) _R2.fq ## reverse reads, left primer removed, low quality reads removed
# (3) .demux.fq ## merged reads + additional quality filtering/trimming!
# (4) .merged.fq ## merged reads
# (5) .stats ## statistics...
## This .stats file coontains a string of 6 different numbers:
## (1) Initial total number of input reads
## (2) number of reads where the F primer was found
## (3) number of reads where the R primer was found
## (4) number of reads where the multiple primers were found (discarded due to no single primer being found)
## (5) ?number of reads discarded for being too short (<100bp)
## (6) total number of reads output at end?
#
# Jon Palmer says he uses the DADA2 pipeline as if working with single reads instead of paired reads
# because quality filtering and trimming is part of the merging process in AMPtk
#
# make a table out of the .stats files:
path <- "~/BlodgettAmpliconSequencingRawData/allITS_AMPTKpreprocessed"
filelist <- list.files(path, pattern=".stats") #list the filenames = sampleIDs
#create a list of all the file paths to the .stats files:
fullpathfilelist <- list.files(path, pattern=".stats", full.names=TRUE)
#apply the fread function to fullpathfilelist list of file paths:
allfiles <- lapply(fullpathfilelist, fread)
length(allfiles) #check that it's the correct number of files
#turn the allfiles object into a data.table with rbindlist()!
AMPtkStats <- rbindlist(allfiles)
#add the filesnames/sampleIDs onto the list as a new column:
AMPtkStats[ ,FileNames:=filelist]
#remove the .stat from the end of each filename:
AMPtkStats[ ,c("SampleID", "trash"):=tstrsplit(FileNames, ".st", fixed=FALSE)]
#remove excess columns:
AMPtkStats[ ,c("trash", "FileNames"):=NULL]
ColnamesList <- c("Input", "ReadsWithFprimer", "ReadsWithRprimer", "ReadsWithMultiplePrimers", "TooShortReads", "Output", "SampleID")
names(AMPtkStats) <- ColnamesList
write.csv(AMPtkStats, "/Users/monikafischer/Desktop/AMPtkStats.csv")
# Create a list of the file names for forward and reverse files:
#raw:
path <- "/Users/monikafischer/Desktop/AmpliconSeqStuff/supersmall/supersmallsubset_rawdata"
fnRawFs <- sort(list.files(path, pattern = "_R1_001.fastq.gz", full.names = TRUE))
fnRawRs <- sort(list.files(path, pattern = "_R2_001.fastq.gz", full.names = TRUE))
plotQualityProfile(fnRawFs[2])
plotQualityProfile(fnRawRs[2])
#AMPtk output #1, low quality reads removed
path <- "/Users/monikafischer/Desktop/AmpliconSeqStuff/supersmall/supersmallsubset_preprocessed/"
fnFs <- sort(list.files(path, pattern = "_R1.fq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern = "_R2.fq", full.names = TRUE))
plotQualityProfile(fnFs[2]) #plot the quality profile for a single fastq file
plotQualityProfile(fnRs[2]) #plot the quality profile for a single fastq file
plotQualityProfile(fnFs, aggregate = TRUE) #aggreate quality profile data from all fastq files into one plot
plotQualityProfile(fnRs[1])
# grey-scale heatmap = distribution of quality scores at each position
# green = mean
# orange solid = median
# orange dashed = 25th and 75th quantiles
# red line (only present if seqs vary in length) = % of seqs at each length
##AMPtk output #2, merged reads:
fnMerged <- sort(list.files(path, pattern = ".merged.fq", full.names = TRUE))
plotQualityProfile(fnMerged[2])
#
##AMPtk output #3, demuxed merged reads:
path <- "~/BlodgettAmpliconSequencingRawData/allITS_AMPTKpreprocessed"
FNs <- sort(list.files(path, pattern = "demux.fq", full.names = TRUE))
plotQualityProfile(FNs[1])
plotQualityProfile(FNs, aggregate = TRUE)
# Run the primerHits function on one of the filteredN files
# change the number in the brackets to change which file you're looking at!
# number in brackets associated with the row number:
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = FNs[2]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = FNs[2]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = FNs[2]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = FNs[2]))
# file types:
FNs #demuxed... AMPtk output #3
fnRawFs #Should contain primers... Raw Forward Reads
fnRawRs #Should contain primers... Raw Reverse Reads
fnFs #left primer should be removed... AMPtk output #1
fnRs #left primer should be removed... AMPtk output #1
fnMerged #merged reads... AMPtk output #2
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnRawFs[2]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRawRs[2]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnRawFs[2]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRawRs[2]))
######################################################################################################
#### STEP 3: RUN DADA2! (practice on a tiny subset of data here, but for reals, do this on Savio) ####
######################################################################################################
#
## test run on local machine with two samples,
## then convert this into a script to run on the Savio supercomputer!
#
## Following Jon Palmer's DADA2 code, which also follows the ITS tutorial:
library(dada2)
# load the data from a folder
# we only care about the .demux.fq file, which is fully trimmed, quality controlled, and paired reads are merged
path <- "~/BlodgettAmpliconSequencingRawData/TESTdata_AMPTKpreprocessed"
fns <- list.files(path, pattern = "\\.demux.fq$")
## "list the files within the path that contain the pattern"
## The Pattern (a regular expression, a.k.a. regex):
# \\. --matches a literal .
# demux.fq --matches a literal sequence of characters (in this case, "demux.fq")
# $ --end of string
Seqs <- file.path(path, fns)
Seqs
#get sample names
sample.names <- list.files(path, pattern = "\\.demux.fq$", full.name = FALSE)
sample.names <- gsub('[.demux.fq]', '', sample.names)
sample.names
#Dereplication (remove duplicated sequences)
derepSeqs <- derepFastq(Seqs, verbose=TRUE)
#name the derep class with sample names
names(derepSeqs) <- sample.names
#Sample Inference = infer ASVs!
dadaSeqs <- dada(derepSeqs, err=NULL, selfConsist=TRUE, BAND_SIZE=32)
## "err" is normally the output from learnErrors(), which I skipped over by using AMPtk for pre-processing
## "selfConsist=TRUE" is the algorithm will alternate between sample inference and error rate estimation until convergence.
## "BAND_SIZE=32" When set, banded Needleman-Wunsch alignments are performed. Banding restricts the net cumulative number
## of insertion of one sequence relative to the other. The default value of BAND_SIZE is 16
# multithread=FALSE (default)
# USE_QUALS=TRUE: (default) If TRUE, the dada2 error model takes into account the consensus quality score of the
# dereplicated unique sequences. If FALSE, quality scores are ignored.
#make sequence table
seqtab <- makeSequenceTable(dadaSeqs, orderBy = "abundance")
dim(seqtab)
#remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, verbose=TRUE)
dim(seqtab.nochim)
# Export ASV table:
write.csv(seqtab.nochim, "~/seqtab.nochim.csv")
## Sanity check -- make sure the chimera removal step only removed a few reads from each sample
## if chimera removal resulted in a substantial loss in reads, this is could indicate
## that primers were removed inappropriately.
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
track <- cbind(sapply(dadaSeqs, getN), rowSums(seqtab.nochim))
colnames(track) <- c("dadaSeqs", "nonchim")
rownames(track) <- sample.names
track
# Assign Taxonomy!
unite.ref <- "~/UNITE_sh_general_release_dynamic_04.02.2020.fasta"
taxa <- assignTaxonomy(seqtab.nochim, unite.ref, verbose = TRUE)
# EXPORT TAXONOMY TABLE
write.csv(taxa, "~/ALL_ITS_taxa.csv")
#
##
#
## Post-DADA2 Sanity check Histograms
#
## sanity check to make sure the distribution of the data make sense:
library(data.table)
library(ggplot2)
otu <- fread(file.choose()) #load OTUtable
otu[1:10, 1:10] #rownames/SeqIDs are now column1!
# count the number of values in each column that are greater than zero
NumberOfSamplesWithEachOTU <- data.frame(columnsums = colSums(otu[,-1] > 0))
# [,-1] skips the first column, which is the SeqIDs...
#plot histogram:
ggplot(NumberOfSamplesWithEachOTU, aes(x=columnsums))+
geom_histogram(bins=500)+
theme_minimal()+
scale_x_continuous(breaks=seq(0, 15, 1), limits=c(0, 15))+
scale_y_continuous(breaks=seq(0, 140000, 10000), limits=c(0, 140000))+
xlab("Binned number of Samples/OTU") +
ylab("Count") +
ggtitle("ITS Histogram of Samples/OTU")
# count the number of values in each row that are greater than zero
NumberOfOTUsInEachSample <- data.frame(rowsums = rowSums(otu > 0))
#plot histogram:
ggplot(NumberOfOTUsInEachSample, aes(x=rowsums))+
geom_histogram(bins=500)+
theme_minimal()+
scale_x_continuous(breaks=seq(0, 1400, 100), limits=c(0, 1400))+
scale_y_continuous(breaks=seq(0, 30, 5), limits=c(0, 30))+
xlab("Binned number of OTUs/Sample") +
ylab("Count") +
ggtitle("ITS Histogram of OTUs/Sample")
# how many samples have fewer than 30 OTUs? ...play with this as a compliment to the histograms!
sum(NumberOfOTUsInEachSample < 30)
#########################################################
#### STEP 4: CREATE PHYLOSEQ OBJECT and RUN DECONTAM ####
#########################################################
##
# https://benjjneb.github.io/decontam/vignettes/decontam_intro.html#identifying-contaminants-in-marker-gene-and-metagenomics-data
##
library(phyloseq)
library(ggplot2)
library(decontam)
library(data.table)
library(tidyverse)
library(vegan)
## decontam removes ASVs that are over-represented in negative-controls
## decontam uses dada2 outputs converted into phyloseq objects, and phyloseq requires OTU and Taxa tables as matrices
#
# Load OTU table:
seqtab.nochim <- as.matrix(fread("~/20210118_allITS_seqtab.nochim.csv"),
rownames=1)
seqtab.nochim[1:5,1:2]
# Load Taxonomy Table:
taxa <- as.matrix(fread("~/20210119_allITS_taxa.csv",
header = TRUE), rownames=1)
head(taxa)
# Extract the SeqIDs from the OTUtable
samplenames <- data.table(SeqID = row.names(seqtab.nochim))
dim(samplenames)
# add read counts per sample to the Sample Metadata Table.. (reads output by DADA2 in the post-chimera-removal sanity check)
reads <- fread("~/20210118_allITS_trackreads.csv")
head(reads)
colnames(reads) <- c("SeqID", "dada2_reads", "nochim_reads")
class(reads$nochim_reads)
class(reads$dada2_reads)
reads$nochim_reads <- as.numeric(reads$nochim_reads)
max(reads$nochim_reads)
min(reads$nochim_reads)
# subset metadata table, add reads counts, and create the Sample Table for phyloseq
metasub1 <- merge(samplenames, SampleMetadata, by="SeqID", all.x=TRUE)
metasub2 <- merge(metasub1, reads, by="SeqID", all.x=TRUE)
sampITS <- column_to_rownames(metasub2, var="SeqID") #rownames = SeqID will be required for phyloseq
#plot the number of reads by sample to get an general idea of what the data look like
ggplot(data=metasub2, aes(x=SeqID, y=nochim_reads, color=Who)) +
geom_point() +
scale_color_manual(values = c("#00ff95", "#ff9500", "#9500ff", "#ff006a")) +
scale_y_continuous(expand= c(0,0), breaks=seq(0, 130000, 10000), limits=c(0, 130000)) +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=1))
##
# CREATE PHYLOSEQ OBJECT
ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), #matrix (rownames = SeqIDs, colnames = ASVs, values = abundance of each ASV in each SeqID)
tax_table(taxa), #matrix (rownames = ASVs, colnames = taxonomic levels, values = taxonomic classification for each ASV)
sample_data(sampITS)) #data.frame, (rownames = SeqIDs, colnames & values = additional info for each SeqID)
# check that nothing catastrophic happened when creating the phyloseq object, table dimensions should match:
dim(tax_table(ps)) #phyloseq object
dim(taxa) #original
(otu_table(ps))[1:5, 1:5] #phyloseq object
dim(seqtab.nochim) #original
dim(sample_data(ps)) #phyloseq object
head(sampITS) #original
head(tax_table(ps))
## ASVs are currently identified by their DNA sequence, which is cumbersome
## Move this DNA sequence to the refseq slot of the phyloseq object
## and then give each ASV a simpler name like ASV1, ASV2, etc.
#
# move the current taxa names to a DNAStringSet object, which phyloseq will recognize at DNAseqs
dna <- Biostrings::DNAStringSet(taxa_names(ps))
# connect taxa_names with this DNAseq object
names(dna) <- taxa_names(ps)
# Right now taxa_names = DNAseq, but we want both so we can manipulate taxa_names while retaining the DNAseqs in their own table
# add the DNAseq table to the phyloseq object:
ps <- merge_phyloseq(ps, dna)
ps #note the new "refseq()" part of the phyloseq object!
#
# IF YOU DON'T HAVE DNA SEQs SKIP TO HERE
# replace whatever is in the taxa_names space with ASV1, ASV2, ASV3, etc.
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
ps
# To run decontam, we will need a column ("is.neg") in the Metadata Table
# is.neg = control samples are indicated as "TRUE" and all other samples are "FALSE"
# Create the is.neg column:
is.neg <- sample_data(ps)$Who == "Control"
head(sample_data(ps)) #check that it looks correct
#
## RUN DECONTAM!
# outputs a table of stats for all taxa and decontam's decision about whether or not each taxon is a contaminant
# play around with changing the "threshold" value..
contamdf.prev <- isContaminant(ps, method="prevalence", neg="is.neg", threshold = 0.5)
# Summary table: how many true contaminants were identified?
table(contamdf.prev$contaminant)
#tresh0.1 = 190contaminants, tresh0.05 = 93contaminants
# plot a histogram of the # of ASVs (y-axis) for each decontam score (a.k.a. P-value) on the x-axis
# ASVs are rows in the contamdf.prev object
ggplot(contamdf.prev, aes(x=p)) +
geom_histogram(binwidth=0.01, fill="purple")+
scale_x_continuous(breaks=seq(0, 1, 0.1))
#there isn't much of a difference between 0 and 0.55...
#moving forward with 0.1, since that's the recommended threshold
# ...though I could concievably increase the threshold substantially...
# thresh0.5 = 746contaminants (14470 non-contminants)
### NOTES!
## "method=prevalence" means contaminants are identified by increased prevalence in negative controls vs. samples
# default: threshold = 0.1 ...defines the threshold for deciding what is more prevalent in the negative controls than the soil samples
# default: detailed = TRUE ...returns a data.frame with diagnostic info on the contamination decision
### Some notes about the threshold argument from Davis et al 2018:
## For P=0.5, ASVs would be classified as contaminants if present in a higher fraction of controls than samples
## The P-value is the result of 2x2 chi-squared test (many samples), or Fisher's Exact Test (few samples)
## this P-value is also refered to as a "score" or "decontam score"
## Each ASV is assigned a P-value or score
## In general, the threshold P value should be less-than-or-equal-to 0.5
## higher scores = likely not a contaminant
predecontamreads <- data.frame( SeqID = row.names(otu_table(ps)),
TotalReads = rowSums(otu_table(ps)))
write.csv(predecontamreads, "~/predecontamreads.csv")
sum(sample_data(ps)$Who == "Control", na.rm=TRUE) #46
sum(sample_data(ps)$Who == "Neemonika") #325
sum(sample_data(ps)$Who == "Neemonika", sample_data(ps)$Who =="Phillip", sample_data(ps)$Who =="PhillipPool") #618 ...618+46=664
# Make phyloseq object of presence-absence in negative controls and true samples
ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0))
ps.pa.neg <- prune_samples(sample_data(ps.pa)$Who == "Control", ps.pa)
ps.pa.pos <- prune_samples(sample_data(ps.pa)$Who == c("Phillip", "PhillipPool", "Neemonika"), ps.pa)
#Warning: In sample_data(ps.pa)$Who == c("Phillip", "PhillipPool", "Neemonika") : "longer object length is not a multiple of shorter object length"
# Make data.frame of prevalence in positive and negative samples
df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
contaminant=contamdf.prev$contaminant)
# Plot ASV presence in negative controls vs. positive samples and color by whether or not they were ID's as a contaminant
ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant))+
geom_point()+
#geom_abline(intercept = 0, slope = 1)+ #optional line showing the threshold for equal representation in controls and samples
xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")+
ggtitle("Threshold = 0.1")
# REMOVE CONTAMINANTS FROM DATASET
contam <- rownames_to_column(contamdf.prev, var = "rowname") #move rownames (ASVs) to a column that data.table can work with
contam.dt <- data.table(contam) #convert to a data.table
badTaxa <- contam.dt[contaminant == "TRUE", rowname] #use data.table to generate a list of contaminant ASVs
goodTaxa <- setdiff(taxa_names(ps), badTaxa) #list of all ASVs minus the contaminant ASVs
ps2 <- prune_taxa(goodTaxa, ps) #create a new phyloseq object that only contains the good (non-contaminant) ASVs!
#check out the difference:
ps #15216 taxa
ps2 #15026 taxa
15216-15026 # =190, which is equivalent to the number of taxa that decontam identified as bad! this worked! hooray!
# Export final ps2 tables:
write.csv(tax_table(ps2), "~/ps2_TAXtable_withASVs.csv")
write.csv(otu_table(ps2), "~/ps2_OTUtable_withASVs.csv")
write.csv(refseq(ps2), "~/ps2_SEQtable_withASVs.csv")
#############################################
#### EXTRACT DATA FROM PHYLOSEQ OBJECTS #####
#############################################
# Make a copy of sample metadata table with total read counts/sample
MetadataWithReads <- copy(metasub2)
head(MetadataWithReads)
class(MetadataWithReads) #datatable and dataframe
# if you want to re-extract the sample_data() from a phyloseq object
# as.data.frame() or as.data.table() don't work for some reason, indead, use as():
SampDat.dt <- as.data.table(as(sample_data(ps2), "data.frame"))
colnames(otu.dft[1])
# Extract OTU table:
otu.df <- as.data.frame(otu_table(ps2.Neemonika))
otu.df <- as.data.frame(otu_table(ps2.NeemonTop))
otu.df[1:5,1:5]
dim(otu.df)
otu.df.rownames <- rownames_to_column(otu.df, var = "SeqID")
otu.df.rownames[1:5,1:5]
dim(otu.df)
#transpose OTU table
otu.dft <- as.data.frame(t(otu.df.rownames[,2:ncol(otu.df.rownames)]))
colnames(otu.dft) <- otu.df.rownames[,1]
otu.dft[1:5,1:5]
otu.dft <- rownames_to_column(otu.dft, var = "ASV")
otu.dft[1:5,1:5]
# Extract taxonomy table:
tax.df <- as.data.frame(tax_table(ps2))
tax.df <- rownames_to_column(tax.df, var = "ASV")
head(tax.df)
# Extract Sequence table:
seq.df <- as.data.frame(refseq(ps2))
seq.df <- rownames_to_column(seq.df, var = "ASV")
names(seq.df)[2] <- "Sequence"
# Merge taxonomy with OTU table (= "spe" table in Numerical Ecology)
OTUtax.df <- merge(otu.dft, tax.df, by="ASV", all.x=TRUE)
dim(OTUtax.df)
OTUtax.df[1:5, 1:5]
OTUtax.df[1:5, 662:672]
unique(OTUtax.df$Phylum)
write.csv(OTUtax.df, "~/OTUtable_with_Taxonomy.csv")
#explore phyla...
OTUtax.dt <- as.data.table(OTUtax.df)
nrow(OTUtax.dt[Phylum == "NA"]) #0
nrow(OTUtax.dt[Phylum == "p__Monoblepharomycota"]) #46
nrow(OTUtax.dt[Phylum == "p__Basidiobolomycota"]) #5
nrow(OTUtax.dt[Phylum == "p__Blastocladiomycota"]) #4
nrow(OTUtax.dt[Phylum == "p__Zoopagomycota"]) #26
nrow(OTUtax.dt[Phylum == "p__Olpidiomycota"]) #6
nrow(OTUtax.dt[Phylum == "p__Kickxellomycota"]) #37
nrow(OTUtax.dt[Phylum == "p__Rozellomycota"]) #47
nrow(OTUtax.dt[Phylum == "p__Entorrhizomycota"]) #1
nrow(OTUtax.dt[Phylum == "p__Aphelidiomycota"]) #1
nrow(OTUtax.dt[Phylum == "p__Ascomycota"]) #8346
nrow(OTUtax.dt[Phylum == "p__Basidiomycota"]) #4524
nrow(OTUtax.dt[Phylum == "p__Mucoromycota"]) #285
nrow(OTUtax.dt[Phylum == "p__Glomeromycota"]) #191
nrow(OTUtax.dt[Phylum == "p__Chytridiomycota"]) #316
dim(OTUtax.df) #15026 taxa
length(unique(OTUtax.df$Kingdom)) #1
length(unique(OTUtax.df$Phylum)) #16
length(unique(OTUtax.df$Class)) #59
length(unique(OTUtax.df$Order)) #148
length(unique(OTUtax.df$Family)) #324
length(unique(OTUtax.df$Genus)) #708
# Merge Sequences with OTUtax table:
OTUtaxSeq.df <- merge(OTUtax.df, seq.df, by="ASV", all.x=TRUE)
OTUtaxSeq.df[1:5, 1:5]
OTUtaxSeq.df[1:5, 665:673]
write.csv(OTUtaxSeq.df, "~/OTUtable_withTaxonomy_and_Sequences.csv")
####################################
######### DIVERSITY METRICS ########
############# Figure 2 #############
####################################
library(vegan)
## IS THERE A DIFFERENCE IN DIVERSITY BY DEPTH? -- Phillip's samples only
#sample_data(ps2)$Who == "Phillip" & sample_data(ps2)$Who == "PhillipPool" &
#sample_data(ps2)$Date== "17-Feb-20" & sample_data(ps2)$Date== 17-Feb-20
#subset for the two dates that are pre- and post- fire for Phillip's samples
unique(MetadataWithReads$Date)
MetadataPP <- MetadataWithReads[Date == "2/17/20" | Date == "10/8/19", ]
MetadataPP <- MetadataPP[Who == "Phillip" | Who == "PhillipPool"]
MetadataPP <- MetadataPP[Plot == "321west"]
otu.df <- as.data.frame(otu_table(ps2))
otu.df.rownames <- rownames_to_column(otu.df, var="SeqID")
dim(otu.df.rownames)
# Subset OTUtax table for only the column names that match the SeqIDs in MetadataPP
otu.df.rownames[1:5, 1:5]
OTUtaxPP <- otu.df.rownames[otu.df.rownames$SeqID %in% MetadataPP$SeqID, ]
OTUtaxPP[1:3, 1:5]
rownames(OTUtaxPP) <- c() #clear rownames
OTUtaxPP <- column_to_rownames(OTUtaxPP, var="SeqID") #move SeID column into rownames
unique(ShanDivPP$depth)
ShanDivPP <- as.data.table(diversity(OTUtaxPP, index="shannon"))
ShanDivPP[ ,numdate:=MetadataPP$numdate]
ShanDivPP[ ,depth:=MetadataPP$depth]
ggplot(ShanDivPP[depth == "p1-10" | depth == "1" | depth == "2" | depth == "3" |
depth == "4" | depth == "5" | depth == "10" ], aes(x=V1, y=depth, color=factor(numdate)))+
geom_boxplot()+
geom_point(position=position_dodge(width=0.75),aes(group=numdate))+
scale_y_discrete(limits=rev(c("1", "2", "3", "4", "5", "10", "p1-10")))+
scale_color_manual(values=c("#00BFC4", "#F8766D"), labels=c("Pre-Fire", "Post-Fire"))+
xlab("Shannon Diversity Index")+
ylab("Depth (cm)")+
theme_light()+
theme(axis.text = element_text(size = 12))+
theme(legend.text = element_text(size = 12))+
theme(legend.title = element_blank())+
ggtitle("Phillip's Samples - 321west")
dat4test <- ShanDivPP[depth == "p1-3" | depth == "p1-10"]
t.test(V1 ~ depth, data=dat4test[numdate == "18309"]) #numdate=18309 or Date == "17-Feb-20" ...p=0.5872
t.test(V1 ~ depth, data=dat4test[numdate == "18177"]) #numdate=18177 or Date == "8-Oct-19" ...p=0.7613
t.test(V1 ~ numdate, data=dat4test[depth == "p1-3"]) #p=0.2152
t.test(V1 ~ numdate, data=dat4test[depth == "p1-10"]) #p=0.3074
unique(ShanDivPP$depth)
## ANOVAs
dat4test <- ShanDivPP[depth == "p1-10" | depth == "1" | depth == "2" | depth == "3" |
depth == "4" | depth == "5" | depth == "10" ]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18309"])
summary(anova) #p=0.948
anova <- aov(V1 ~ numdate, data=dat4test)
summary(anova) #p=0.0736
dat4test <- ShanDivPP[depth == "p1-10" | depth == "1" | depth == "2" | depth == "3" |
depth == "4" | depth == "5" | depth == "10" ]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18177"])
summary(anova) #p=0.995
dat4test <- ShanDivPP[depth == "p1-3" | depth == "1" | depth == "2" | depth == "3" ]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18309"])
summary(anova) #p=0.611
anova <- aov(V1 ~ numdate, data=dat4test)
summary(anova) #p=0.187
ShanDivPP <- ShanDivPP[V1 != 0]
dat4test <- ShanDivPP[depth == "p1-3" | depth == "1" | depth == "2" | depth == "3"]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18177"])
summary(anova) #p=0.742
#
## COMP240!!!
#
#subset for the two dates that are pre- and post- fire for Phillip's samples
unique(MetadataWithReads$Date)
MetadataPP <- MetadataWithReads[Date == "2/17/20" | Date == "10/8/19", ]
MetadataPP <- MetadataPP[Who == "Phillip" | Who == "PhillipPool"]
MetadataPP <- MetadataPP[Plot == "240"]
# Subset OTUtax table for only the column names that match the SeqIDs in MetadataPP
OTUtaxPP <- otu.df.rownames[otu.df.rownames$SeqID %in% MetadataPP$SeqID, ]
OTUtaxPP[1:3, 1:5]
rownames(OTUtaxPP) <- c() #clear rownames
OTUtaxPP <- column_to_rownames(OTUtaxPP, var="SeqID") #move SeID column into rownames
unique(ShanDivPP$depth)
#Shannon Diversity
ShanDivPP <- as.data.table(diversity(OTUtaxPP, index="shannon"))
ShanDivPP[ ,numdate:=MetadataPP$numdate]
ShanDivPP[ ,depth:=MetadataPP$depth]
ggplot(ShanDivPP[depth == "p1-10" | depth == "1" | depth == "2" | depth == "3" |
depth == "4" | depth == "5" | depth == "10" ], aes(x=V1, y=depth, color=factor(numdate)))+
geom_boxplot()+
geom_point(position=position_dodge(width=0.75),aes(group=numdate))+
scale_y_discrete(limits=rev(c("1", "2", "3", "4", "5", "10", "p1-10")))+
scale_color_manual(values=c("#00BFC4", "#F8766D"), labels=c("Pre-Fire", "Post-Fire"))+
xlab("Shannon Diversity Index")+
ylab("Depth (cm)")+
theme_light()+
theme(axis.text = element_text(size = 12))+
theme(legend.text = element_text(size = 12))+
theme(legend.title = element_blank())+
ggtitle("Phillip's Samples - 240")
# PDF export: 6x5 (1-10), 5x4 (1-3), 4.5x4 (pools only)
dat4test <- ShanDivPP[depth == "p1-3" | depth == "p1-10"]
t.test(V1 ~ depth, data=dat4test[numdate == "18309"]) #numdate=18309 or Date == "17-Feb-20" ...p=0.4787
t.test(V1 ~ depth, data=dat4test[numdate == "18177"]) #numdate=18177 or Date == "8-Oct-19" ...p=0.7495
t.test(V1 ~ numdate, data=dat4test[depth == "p1-3"]) #p=0.2454
t.test(V1 ~ numdate, data=dat4test[depth == "p1-10"]) #p=0.7693
unique(ShanDivPP$depth)
## ANOVAs
dat4test <- ShanDivPP[depth == "p1-10" | depth == "1" | depth == "2" | depth == "3" |
depth == "4" | depth == "5" | depth == "10" ]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18309"])
summary(anova) #p=0.104
dat4test <- ShanDivPP[depth == "p1-10" | depth == "1" | depth == "2" | depth == "3" |
depth == "4" | depth == "5" | depth == "10" ]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18177"])
summary(anova) #p=0.562
dat4test <- ShanDivPP[depth == "p1-3" | depth == "1" | depth == "2" | depth == "3" ]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18309"])
summary(anova) #p=0.673
dat4test <- ShanDivPP[depth == "p1-3" | depth == "1" | depth == "2" | depth == "3"]
anova <- aov(V1 ~ depth, data=dat4test[numdate == "18177"])
summary(anova) #p=0.444
# Simpson Diversity
SimpDivPP <- as.data.table(diversity(OTUtaxPP, index="simpson"))
SimpDivPP[ ,Date:=MetadataPP$Date]
SimpDivPP[ ,depth:=MetadataPP$depth]
ggplot(SimpDivPP, aes(x=V1, y=depth, color=Date))+
geom_boxplot()+
geom_point(position=position_dodge(width=0.75),aes(group=Date))+
scale_y_discrete(limits=rev(c("1", "2", "3", "p1-3", "4", "5", "10", "p1-10", "20")))+
xlab("Simpson Diversity Index")+
ylab("Depth (cm)")+
theme_light()+
theme(axis.text = element_text(size = 12))+
theme(legend.text = element_text(size = 12))+
ggtitle("Phillip's Samples - 321west")
dat4test <- SimpDivPP[depth == "p1-3" | depth == "p1-10"]
t.test(V1 ~ depth, data=dat4test[Date == "17-Feb-20"])
t.test(V1 ~ depth, data=dat4test[Date == "8-Oct-19"])
t.test(V1 ~ Date, data=dat4test[depth == "p1-3"])
t.test(V1 ~ Date, data=dat4test[depth == "p1-10"])
#calculate species RICHNESS
# Where ASV abundance value is greater than 0, give it a 1, then sum the 1's by sample:
OTUtaxPP.rich <- apply(OTUtaxPP > 0,1,sum)
OTUtaxPP.rich.dt <- as.data.table(OTUtaxPP.rich)
OTUtaxPP.rich.dt[ ,depth:=MetadataPP$depth]
OTUtaxPP.rich.dt[ ,Date:=MetadataPP$Date]
names(OTUtaxPP.rich.dt)[1] <- "Richness"
ggplot(OTUtaxPP.rich.dt, aes(x=Richness, y=depth, color=Date))+
geom_boxplot()+
geom_point(position=position_dodge(width=0.75),aes(group=Date))+
scale_y_discrete(limits=rev(c("1", "2", "3", "p1-3", "4", "5", "10", "p1-10", "20")))+
scale_color_manual(values=c("#00BFC4", "#F8766D"))+
xlab("Richness")+
ylab("Depth (cm)")+
theme_light()+
theme(axis.text = element_text(size = 12))+
theme(legend.text = element_text(size = 12))+
ggtitle("Phillip's Samples - 321west")
dat4test <- OTUtaxPP.t.rich.dt[depth == "p1-3" | depth == "p1-10"]
t.test(Richness ~ depth, data=dat4test[Date == "17-Feb-20"])
t.test(Richness ~ depth, data=dat4test[Date == "8-Oct-19"])
t.test(Richness ~ Date, data=dat4test[depth == "p1-3"])
t.test(Richness ~ Date, data=dat4test[depth == "p1-10"])
dat4test <- OTUtaxPP.t.rich.dt[Date == "17-Feb-20"]
anova <- aov(Richness ~ depth, data=dat4test)
summary(anova) #no sig difs
tukey <- TukeyHSD(anova, "depth")
result <- data.frame(tukey$depth)
result["p.adj"]
dat4test <- OTUtaxPP.t.rich.dt[Date == "8-Oct-19"]
anova <- aov(Richness ~ depth, data=dat4test)
summary(anova)#no sig difs#no sig difs
tukey <- TukeyHSD(anova, "depth")
result <- data.frame(tukey$depth)
result["p.adj"]
#calculate species EVENNESS
OTUtaxPP.even <- as.data.table(diversity(OTUtaxPP, index="simpson"))/log(OTUtaxPP.rich)
OTUtaxPP.even[ ,Date:=MetadataPP$Date]
OTUtaxPP.even[ ,depth:=MetadataPP$depth]
names(OTUtaxPP.even)[1] <- "Evenness"
ggplot(OTUtaxPP.even, aes(x=Evenness, y=depth, color=Date))+
geom_boxplot()+
geom_point(position=position_dodge(width=0.75),aes(group=Date))+
scale_y_discrete(limits=rev(c("1", "2", "3", "p1-3", "4", "5", "10", "p1-10", "20")))+
scale_color_manual(values=c("#00BFC4", "#F8766D"))+
xlab("Evenness")+
ylab("Depth (cm)")+
theme_light()+
theme(axis.text = element_text(size = 12))+
theme(legend.text = element_text(size = 12))+
ggtitle("Phillip's Samples - 321west")
dat4test <- OTUtaxPP.t.even[depth == "p1-3" | depth == "p1-10"]
t.test(Evenness ~ depth, data=dat4test[Date == "17-Feb-20"])
t.test(Evenness ~ depth, data=dat4test[Date == "8-Oct-19"])
t.test(Evenness ~ Date, data=dat4test[depth == "p1-3"])
t.test(Evenness ~ Date, data=dat4test[depth == "p1-10"])
##
#
##
#
##
#
##
#
### DIVERSTIY METRICS on main "Neemonika" data
#
# Calculate diversity metric with the vegan package
ShanDivAll <- as.data.table(diversity(otu.df, index="shannon")) #index options = shannon, simpson, or invsimpson
head(ShanDivAll)
ShanDivAll$SeqID <- rownames(otu.df)
ShanDivAll <- merge(ShanDivAll, MetaNeemonika[,c("SeqID", "depth", "numdate", "Plot", "Fire")], by="SeqID")
head(ShanDivAll)
View(ShanDivAll)
ShanDivAll$Plot <- factor(ShanDivAll$Plot, levels=c("240", "321west", "321east", "400"))
ShanDivAll$Fire <- factor(ShanDivAll$Fire, levels=c("Pre-fire", "Post-fire"))
library(ggplot2)
ggplot(ShanDivAll, aes(x=numdate, y=V1, color=Plot))+
geom_point(aes(shape=Fire), size=1)+
geom_smooth(aes(fill=Plot), size=1, alpha=0.1, method="loess")+
theme_classic()+
theme(axis.text.x = element_text(hjust=1, vjust=0.5, angle=90, color = "black", size=14))+
theme(axis.text.y = element_text(color = "black", size=14))+
theme(axis.title = element_text(color = "black", size=14))+
theme(legend.text = element_text(size=14, color="black"))+
theme(legend.title = element_text(size=14, face="bold", color="black"))+
scale_x_continuous(breaks=seq(17815, 18325, 30), limits=c(17815, 18325))+
scale_y_continuous(breaks=seq(2, 5, 1), limits=c(2, 5))+
scale_shape_manual(values=c(5, 19))+
scale_color_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
scale_fill_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
xlab("Time")+
ylab("Shannon Diversity")+
ggtitle("All Neemonika Shannon Diversity (3-6cm samples omitted)")
###export as .png or .tiff: 600x350
#calculate species RICHNESS
# Where ASV abundance value is greater than 0, give it a 1, then sum the 1's by sample:
otu.df.rich <- apply(otu.df > 0, 1, sum)
head(otu.df.rich)
otu.df.rich.dt <- as.data.table(otu.df.rich)
otu.df.rich.dt[1:5,]
dim(ot.df.rich.dt)
head(MetaNeemonika)
otu.df.rich.dt[ ,SeqID:=rownames(otu.df)]
otu.df.rich.dt <- merge(otu.df.rich.dt, MetaNeemonika[,c("SeqID", "depth", "numdate", "Plot", "Fire")], by="SeqID")
head(otu.df.rich.dt)
names(otu.df.rich.dt)[2] <- "Richness"
otu.df.rich.dt$Plot <- factor(otu.df.rich.dt$Plot, levels=c("240", "321west", "321east", "400"))
otu.df.rich.dt$Fire <- factor(otu.df.rich.dt$Fire, levels=c("Pre-fire", "Post-fire"))
ggplot(otu.df.rich.dt, aes(x=numdate, y=Richness, color=Plot, fill=Plot))+
geom_point(aes(shape=Fire), size=1)+
geom_smooth(aes(fill=Plot), size=1, alpha=0.1)+
theme_classic()+
theme(axis.text.x = element_text(hjust=1, vjust=0.5, angle=90, color = "black", size=14))+
theme(axis.text.y = element_text(color = "black", size=14))+
theme(axis.title = element_text(color = "black", size=14))+
theme(legend.text = element_text(size=14, color="black"))+
theme(legend.title = element_text(size=14, face="bold", color="black"))+
scale_x_continuous(breaks=seq(17815, 18325, 30), limits=c(17815, 18325))+
scale_y_continuous(breaks=seq(0, 800, 100), limits=c(0, 800))+
scale_shape_manual(values=c(5, 19))+
scale_color_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
scale_fill_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
xlab("Time")+
ylab("Richness")+
ggtitle("All Neemonika Richness")
###export as .png or .tiff: 600x350, PDF: 6x4
#calculate species EVENNESS
otu.df.even <- as.data.table(diversity(otu.df, index="shannon"))/log(otu.df.rich)
otu.df.even[ ,SeqID:=rownames(otu.df)]
otu.df.even <- merge(otu.df.even, MetaNeemonika[,c("SeqID", "depth", "numdate", "Plot", "Fire")], by="SeqID")
head(otu.df.even)
names(otu.df.even)[2] <- "Evenness"
otu.df.even$Plot <- factor(otu.df.even$Plot, levels=c("240", "321west", "321east", "400"))
otu.df.even$Fire <- factor(otu.df.even$Fire, levels=c("Pre-fire", "Post-fire"))
ggplot(otu.df.even, aes(x=numdate, y=Evenness, color=Plot, fill=Plot))+
geom_point(aes(shape=Fire), size=1)+
geom_smooth(aes(fill=Plot), size=1, alpha=0.1)+
theme_classic()+
theme(axis.text.x = element_text(hjust=1, vjust=0.5, angle=90, color = "black", size=14))+
theme(axis.text.y = element_text(color = "black", size=14))+
theme(axis.title = element_text(color = "black", size=14))+
theme(legend.text = element_text(size=14, color="black"))+
theme(legend.title = element_text(size=14, face="bold", color="black"))+
scale_x_continuous(breaks=seq(17815, 18325, 30), limits=c(17815, 18325))+
#scale_y_continuous(breaks=seq(0.3, 0.9, 0.1), limits=c(0.3, 0.9))+
scale_shape_manual(values=c(5, 19))+
scale_color_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
scale_fill_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
xlab("Time")+
ylab("Evenness")+
ggtitle("All Neemonika Evenness (3-6cm samples omitted)")
###export as .png or .tiff: 600x350
##########################################################
### Rarefaction & Diversity Metrics with Rarefied data ###
###################### ITS only ##########################
##########################################################
# plot rarefaction curves using a phyloseq function for ITS data:
rarecurve(otu_table(ps2.Neemonika), step=50, cex=0.5, label = FALSE)
#start @ 11:50... end ~1pm ish...
# Export PDF 20x25
#
# this curve is the procedure of "rarefaction"...
# ...rarefying is the normalization procedure!
#
# Rarefaction Curve = number of ASVs as a function of number of reads (a.k.a. "sample size")
# ...one line for each sample showing how many ASVs are detected for each number of reads
# rarefy the data:
ps2.Neemonika.rarefied = rarefy_even_depth(ps2.Neemonika,
rngseed=1, # set.seed for random number generation
sample.size=min(sample_sums(ps2.Neemonika)), #reads will be removed such that all samples have the same number of reads as the sample with the fewest reads
replace=FALSE, #resample with or without replacement, FALSE = more memory intensive!, TRUE = instantaneous!
verbose=TRUE,
trimOTUs=TRUE) #remove OTUs that have zero reads left after rarefaction
#sampling without replacement took a few minutes and resulted in 6220 OTUS being removed because they were no longer present
#sample with replacement was basically instantaneous and resulted in 6379 OTUs being removed because they were no longer present
#sample depths pre- and post- rarefying:
range(sample_sums(ps2.Neemonika)) #13751-84409
sample_sums(ps2.Neemonika.rarefied) #all the same! 13751
# Extract OTU table:
otu.df.rare <- as.data.frame(otu_table(ps2.Neemonika.rarefied))
otu.df.rare[1:5,1:5]
dim(otu.df.rare)
unique(otu.df.rich.dt$numdate)
otu.df.rich.dt[numdate==18086]
#ANOVA
anova <- aov(Richness~Plot, data=otu.df.rich.dt[numdate==17898 | numdate==17904])
summary(anova)
tukey <- TukeyHSD(anova, "Plot")
result <- data.frame(tukey$Plot)
result["p.adj"]
# 17816 & 17820 & 17833 p=0.89 (lo pre- and post fire, 1c, 2c)
# 17855 p=0.593
# 17862 p=0.485
# 17868 p=0.021 (posthoc TukeyHSD --> lo vs. 1c p=0.0190, all others ns)
# 17878 p=0.372
# 17898 p=0.174 (hi pre-fire, lo post-fire, 1c, 2c)
# 17898 & 17904 p=0.00233 (lo post-fire, hi pre- and post-fire, 1c, 2c)
# 17988 p=0.113
# 18024 p=0.0409 (hi vs. 1c p=0.298, all others ns)
# 18057 p=0.218
# 18086 p=0.016 (hi vs. 2c p=0.01291, all others ns)
# 18114 p=0.516
# 18140 p=0.00732 (hi vs. lo p=0.00428, all others ns)
# 18177 p=7.01e-07 (hi vs. lo, hi vs. 1c, and hi vs. 2c are all less than 1e-5, the rest are ns)
# 18206 p=0.726
# 18239 p=0.0398 (all p>0.05 after TukeyHSD...)
# 18267 p=0.000315 (hi vs. all other plots p<0.02, the rest are ns)
# 18305 p=0.129
### DIVERSTIY METRICS CALCULATED ON ALL DATA AT ONCE
# Calculate diversity metric with the vegan package
ShanDivAll.rare <- as.data.table(diversity(otu.df.rare, index="shannon")) #index options = shannon, simpson, or invsimpson
head(ShanDivAll.rare)
ShanDivAll.rare$SeqID <- rownames(otu.df.rare)
ShanDivAll.rare <- merge(ShanDivAll.rare, MetaNeemonika[,c("SeqID", "depth", "numdate", "Plot", "Fire")], by="SeqID")
head(ShanDivAll.rare)
ShanDivAll.rare$Plot <- factor(ShanDivAll.rare$Plot, levels=c("240", "321west", "321east", "400"))
ShanDivAll.rare$Fire <- factor(ShanDivAll.rare$Fire, levels=c("Pre-fire", "Post-fire"))
ggplot(ShanDivAll.rare, aes(x=numdate, y=V1, color=Plot))+
geom_point(aes(shape=Fire), size=1)+
geom_smooth(aes(fill=Plot), size=1, alpha=0.1, method="loess")+
theme_classic()+
theme(axis.text.x = element_text(hjust=1, vjust=0.5, angle=90, color = "black", size=14))+
theme(axis.text.y = element_text(color = "black", size=14))+
theme(axis.title = element_text(color = "black", size=14))+
theme(legend.text = element_text(size=14, color="black"))+
theme(legend.title = element_text(size=14, face="bold", color="black"))+
scale_x_continuous(breaks=seq(17815, 18325, 30), limits=c(17815, 18325))+
scale_y_continuous(breaks=seq(2, 5, 1), limits=c(2, 5))+
scale_shape_manual(values=c(5, 19))+
scale_color_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
scale_fill_manual(values=c("#009acd", "#00cd9a", "#ff7f01", "#ff0102"),
labels=c("1c", "2c", "lo", "hi"))+
xlab("Time")+
ylab("Shannon Diversity")+
ggtitle("All Neemonika ITS Shannon Diversity RAREFIED")
###export as .png or .tiff: 600x350
#calculate species RICHNESS
# Where ASV abundance value is greater than 0, give it a 1, then sum the 1's by sample:
otu.df.rare.rich <- apply(otu.df.rare > 0, 1, sum)
otu.df.rare[1:5, 1:5]
head(otu.df.rare.rich)
otu.df.rare.rich.dt <- as.data.table(otu.df.rare.rich)
otu.df.rare.rich.dt[1:5,]
dim(otu.df.rare.rich.dt)
head(MetaNeemonika)
otu.df.rare.rich.dt[ ,SeqID:=rownames(otu.df.rare)]