-
Notifications
You must be signed in to change notification settings - Fork 2
/
cellphylo_functions.R
1872 lines (1484 loc) · 73.3 KB
/
cellphylo_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
library(magrittr)
library(dplyr)
library(utils)
library(DropletUtils)
library(R.utils)
library(Seurat)
library(tibble)
library(readr)
library(tidyr)
library(stats)
library(factoextra)
library(ggplot2)
library(ape)
library(TreeTools)
library(gtools)
library(phangorn)
library(ggtree)
#' best_jumble_tree
#'
#' Identify the jumeble tree with the highest sum, mean and median of jumble scores.
#'
#' @param tree_dir_path Path to a directory containing multiple jumble trees
#' @param multiphylo A multiphylo object containing multiple jumble trees
#'
#' @return NULL Prints to consolethe index of the tree with the maximum sum, mean and median of jumble scores. To identify which tree file the index corresponds to use 'list.files(tree_dir_path)'.
#' @export
#'
#' @examples
#'
best_jumble_tree <- function(tree_dir_path, multiphylo){
if (!missing(tree_dir_path) && missing(multiphylo)){
#tree_files <- list.files(tree_dir_path, full.names=TRUE)
trees <- combine_multi_trees(tree_dir_path)
multiphylo = combine_multi_trees(tree_dir_path)
}
if (missing(tree_dir_path) && !missing(multiphylo)){
trees <- multiphylo
}
scored_trees <- lapply(trees, function(trees){
focal_tree_path <- trees
output <- score_jumble(multiphylo = multiphylo,focal_tree_obj = trees, print=FALSE )
return(output)
}) #function close, lapply close
#max sum of jumble scores
jumble_sums <- lapply(scored_trees, function(scored_trees){
sums <- scored_trees$score_sum
})
index = which(jumble_sums == max(unlist(jumble_sums)))
print(paste0("max sum of scores = index: " , index, " score: ", max(unlist(jumble_sums))))
#max mean of scores
# jumble_means <- lapply(scored_trees, function(scored_trees){
# mean <- scored_trees$score_mean
# })
# index = which(jumble_means == max(unlist(jumble_means)))
# print(paste0("max mean of scores = index: " , index, " score: ", max(unlist(jumble_means))))
#max median of scores
# jumble_medians <- lapply(scored_trees, function(scored_trees){
# median <- scored_trees$score_median
# })
# index = which(jumble_medians == max(unlist(jumble_medians)))
# print(paste0("max median of scores = index: " , index, " score: ", max(unlist(jumble_medians))))
} #close func
#' calc_pc_sweep_var
#'
#' Calculates tree length (Fig. 2A), average tip and interior edge lengths (Fig. 2B), and the star-ness score (ratio of sum of tip:interior edge lengths, Fig. 2C)
#'
#' @importFrom gtools mixedsort
#' @importFrom ape read.tree
#'
#' @param tree_dir_path Path to directory containing the trees (ie. `contml outtrees`) to calculate the PC sweep variables from.
#'
#' @return var.df A data frame containing the variables in a format suitable for plotting with cellphylo::plot_pc_sweep.
#' @export
#'
#' @examples
calc_pc_sweep_var <- function(tree_dir_path){
tree_list <- list.files(path = tree_dir_path, full.names=TRUE, recursive=FALSE )
tree_list <- mixedsort(tree_list)
n_trees <- c( 1: length(tree_list))
vars <- sapply(n_trees, function(n_trees){
#read in tree
tree <- read.tree(tree_list[n_trees])
#plot(tree)
#count number of taxa
n.tips <- length(tree$tip.label)
#interior edges
#pull out interior edge lengths
int.edges <- tree$edge.length[which(tree$edge[,1] > (n.tips) & tree$edge[,2] > (n.tips))]
#sum of interior edge lengths
sum.int.edges <-sum(int.edges)
#average of interior edge lengths
ave.int.edges <- mean(int.edges)
#tip edges
#pull out tip edge lengths
tip.edges <- tree$edge.length[which(tree$edge[,1] <= (n.tips) | tree$edge[,2] <= (n.tips))]
#sum of tip edge lengths
sum.tip.edges <- sum(tip.edges)
#average tip edge length
ave.tip.edges<-mean(tip.edges)
#tree length
TL <- sum(tree$edge.length)
#star measure
#star.measure <- ave.tip.edges/ave.int.edges
star.measure <- sum.tip.edges/sum.int.edges
return(list(ave_int_edge = ave.int.edges, ave_tip_edge = ave.tip.edges, sum_int_edge = sum.int.edges, sum_tip_edge = sum.tip.edges, TL = TL, star_measure = star.measure))
} #sapply function close
) #sapply close
#put variables into a dataframe
var.df <- t(vars) %>% as.data.frame()
return(var.df)
}
#' color_tree
#'
#' @import ggtree
#' @importFrom phangorn midpoint
#' @importFrom ape read.tree
#' @importFrom tibble column_to_rownames
#' @importFrom grDevices pdf dev.off
#'
#' @param tree_path Path to tree
#' @param print Print out the colored trees as pdf
#'
#' @return A list containing two trees: 'tip_tree' = the tree with tip labels colored by cell type and 'bar_tree': the tree with cell types indicated with a colored bar
#' @export
#'
#' @examples
color_tree = function(tree_path, print){
tree <- read.tree(tree_path)
#midpoint root tree
tree <- midpoint(tree)
#parse tips to make annotation df
#designed for scjack_outtrees_labels
celltype_ann <- sapply(strsplit(tree$tip.label, "_"), function(v){return(v[4])})
#annotation df to attach to ggtree object = tips = column 1
annotation.df.1 <- data.frame("tip" = tree$tip.label, "annotation"= celltype_ann)
#annotation df to use in gheatmap - tips = rownames
annotation.df.2 <- column_to_rownames(annotation.df.1, var="tip")
#colour tips by annotation
tree.tip_cols <- ggtree(tree, aes(color=annotation)) %<+% annotation.df.1 + geom_tiplab()
#label tip by colour bar
tree.gg <- ggtree(tree) %<+% annotation.df.2
tree.heatmap <- gheatmap(tree.gg, annotation.df.2, width=0.01, colnames=FALSE, legend="cell type", offset=1.5) + geom_tiplab()
if (print==TRUE){
pdf(file="color_tip.pdf", width=20, height=20)
print(tree.tip_cols)
dev.off()
pdf(file="color_bar.pdf", width=20, height=20)
print(tree.heatmap)
dev.off()
}#close print if
return(list(tip_tree = tree.tip_cols, bar_tree = tree.heatmap))
} #close function
#' combine_matrices
#'
#' Takes in multiple species' matrices and combines them into a single multi-species matrix combined by gene orthology. This combined matrix is NOT cross-species normalized yet.
#'
#' @import dplyr
#' @importFrom Seurat Read10X as.sparse
#' @importFrom readr read_tsv
#' @importFrom tidyr pivot_longer
#' @importFrom tibble rownames_to_column
#' @importFrom DropletUtils write10xCounts
#' @importFrom stats na.omit
#'
#' @param matrix_list A list of the full paths to the matrices you want to combine. Cell ids must be in cellphylo format.
#' @param ortholog_map_path Path to wrangled Ensembl orthologs file.
#' @param print Boolean. Print out the combined matrix.
#'
#' @return full_join_sparse The combined matrix in sparse matrix format. If print=TRUE, it will print out the combined matrix and deposit it in `matrix/cross-species_analysis/combined_by_orthology`
#' @export
#'
#' @examples
#'
combine_matrices <- function(matrix_list, ortholog_map_path, print=TRUE){
ensembl_map_final <- read_tsv(ortholog_map_path)
species_cols <- ensembl_map_final[, !names(ensembl_map_final) %in% "gene_identifier"] %>% colnames()
gene_ont <- pivot_longer(ensembl_map_final, cols = all_of(species_cols))
colnames(gene_ont) <- c("gene_identifier", "species", "gene")
#initialize an empty list
list_matrices <- list()
for (i in 1:length(matrix_list)){
#load matrix
mat <- Read10X(matrix_list[i])
mat <- as.data.frame(mat)
mat <- rownames_to_column(mat, var = "gene")
#get species name
current_species <- sapply(strsplit(colnames(mat)[2], "_"), function(v){return(v[1])})
#filter gene_ont for the current species matrix
gene_ont_sub <- filter(gene_ont, species == current_species)
gene_ont_sub <- gene_ont_sub %>% select(gene_identifier, gene)
#inner join - add gene_identifiers to mat
joined.matrix <- inner_join(mat, gene_ont_sub, by="gene")
#remove column with gene symbols - will use gene_identifiers instead
joined.matrix <- subset(joined.matrix, select = -gene)
if (i == 1) { full_join <- joined.matrix }
if (i > 1) { full_join <- full_join(full_join, joined.matrix) }
list_matrices[[i]] <- joined.matrix
} #end forloop
#format matrix
full_join <- column_to_rownames(full_join, var = "gene_identifier")
#remove NAs
full_join_noNA <- na.omit(full_join)
full_join_sparse <- as.sparse(full_join_noNA)
if(print==TRUE){
#create a matrix directory if doesn't already exist
if(!dir.exists("matrix")){
dir.create("matrix")
}
#create a directory for the combined matrix
if(!dir.exists("matrix/combined_by_orthology")){
dir.create("matrix/combined_by_orthology")
}
write10xCounts("aqhumor_combined_mtx", full_join_sparse, version="3")
#move matrix to its own folder
file.rename("aqhumor_combined_mtx", "matrix/combined_by_orthology/aqhumor_combined_mtx")
}
return(full_join_sparse)
} #end function
#' combine_multi_trees
#'
#' @importFrom TreeTools as.multiPhylo
#' @importFrom ape read.tree
#' @param tree_dir_path Path to directory containing the trees to combine into a multiphylo object. These tree files should be in newick format.
#'
#' @return multi_tree A multiphylo object of all trees in the tree_dir_path directory
#' @export
#'
#' @examples
#'
combine_multi_trees <- function(tree_dir_path){
tree_paths <- list.files(tree_dir_path, full.names=TRUE)
multi_tree <- lapply(tree_paths, function(tree_paths){
tree <- read.tree(tree_paths)
return(tree)
})
#convert multi_tree list to multiphylo object
#as.multiPhylo not to be mixed up with phytools::as.multiPhylo
multi_tree <- TreeTools::as.multiPhylo(multi_tree)
return(multi_tree)
}
#' convert_to_10x
#'
#' Converts a UMI count matrix in CSV format to 10x-style feature-barcode matrices.
#'
#' @importFrom utils read.csv
#' @importFrom R.utils gunzip
#' @importFrom tibble column_to_rownames
#' @importFrom DropletUtils write10xCounts
#' @importFrom Seurat as.sparse
#'
#' @param path_to_csv Path to csv file
#' @param species_name Name of species matrix belongs to.
#'
#' @return sparse The matrix in sparse matrix format and a directory with a matrix in 10x feature-barcode format (version 3)
#' @export
#'
#' @examples
#'
convert_to_10x <- function(path_to_csv, species_name){
#gunzip
unzipped <- gunzip(path_to_csv, remove=FALSE)
#read csv
csv <- read.csv(unzipped)
#make rownames gene ids
csv <- column_to_rownames(csv, var = "X")
#convert to sparse
sparse <- as.sparse(csv)
#write new 10x matrix
write10xCounts(paste0("matrix_10x_", species_name), sparse, version="3")
#create a folder for matrices, if not already present
if(!dir.exists("matrix")){
dir.create("matrix")
}
#create the 10x folder if not already present
if(!dir.exists("matrix/10x")){
dir.create("matrix/10x")
}
#move this file into the 10x folder
file.rename(paste0("matrix_10x_", species_name), paste0("matrix/10x/matrix_10x_", species_name))
#remove expanded file
remove_file <- gsub(".gz", "", path_to_csv)
file.remove(remove_file)
return(sparse)
} #close fun
#' count_replicates
#'
#' This function counts the number of replicates per cell type group.
#'
#' @importFrom Seurat Read10X
#'
#'
#' @param path_to_dir Path to a directory containing matrices (not the matrix folder itself). Matrix must be must in cellphylo format
#'
#' @return For each matrix in the directory, prints out stats for the number of cell type groups, the number of replicates per cell type group and identifies the cell type group with the fewest number of replicates.
#' @export
#'
#' @examples
#'
count_replicates <- function(path_to_dir){
#setup
dir_list <- list.dirs(path = path_to_dir, full.names=TRUE, recursive=FALSE )
lapply(dir_list, function(x){
#load matrix
print("Reading in matrix...")
mat <- Read10X(x)
print("Matrix loaded.")
#parse names
species <- sapply(strsplit(colnames(mat)[1], "_"), function(v){return(v[1])})
cluster_ids <- sapply(strsplit(colnames(mat), "_"), function(v){return(v[3])})
#identify unique cluster ids
cluster_ids.unique <- unique(cluster_ids)
n_clusters <- length(cluster_ids.unique)
print(paste0("For ",species, ", there are ", n_clusters," unique cell type groups."))
#count table
print(paste0("Cell replicates tally for ",species, ":"))
print(table(cluster_ids))
#what is the smallest number of replicates?
min_reps <- table(cluster_ids) %>% min()
smallest_cluster <- names(table(cluster_ids))
smallest_cluster <- smallest_cluster[which(table(cluster_ids) == min(table(cluster_ids)))]
print(paste0("For ", species, ", the cell type group with the fewest replicates is ", smallest_cluster, " with ", min_reps, " cells"))
}) #close lapply
} #close function
#' cross_species_integration
#'
#' Integrate counts across species using the Seurat workflow. The default values for this function are the parameters for the Seurat workflow performed by Mah & Dunn (2023).
#'
#' @import Seurat
#' @importFrom DropletUtils write10xCounts
#'
#' @param matrix_path Path to combined matrix. This combined matrix features cells from multiple species but is not yet cross-species integrated. This matrix was created using `cellphylo::combine_matrices`. Cell ids must be in cellphylo format.
#' @param min_cells input for the min.cells option of Seurat::CreateSeuratObject. Default is 3.
#' @param min_features input for the min.features option of Seurat::CreateSeuratObject. Default is 200.
#' @param k_weight The k.weight to use with Seurat::IntegrateData. This must be less than or equal to the number of cells in the smallest batch. Default is 100.
#' @param n_pcs input for the npcs option of Seurat::RunPCA. Default is 30.
#' @param UMAP_dim input for the dim option of Seurat:: RunUMAP. Default is 1:30.
#' @param FindNeighbors_dim input for the dim option of Seurat::FindNeighbors. Default is 1:30.
#' @param FindClusters_res input for the res option of Seurat::FindClusters. Default is 0.5.
#' @param print Boolean. Print out the matrix and Seurat object of cross-species integrated data.
#'
#' @return combined The Seurat object containing the cross-species integrated matrix. Print=TRUE prints out the cross-species integrated matrix and an RDS file of the Seurat object that results from the Seurat integration workflow.
#'
#' @export
#'
#' @examples
cross_species_integration <- function(matrix_path, min_cells = 3, min_features = 200, k_weight=100, n_pcs=30, UMAP_dim = c(1:30), FindNeighbors_dim=c(1:30), FindClusters_res = 0.5, print=TRUE){
### create Seurat object
#read in combined species matrix aqhumor_combined_mtx
mat <- Read10X(matrix_path)
#qc: filter for min cells and min features
seurat.obj <- CreateSeuratObject(counts = mat, project="cross_species_integration", min.cells = min_cells, min.features=min_features)
#qc may have filtered out cells. Remove missing cells from mat
#seurat cells - the cells in the seurat object that survived filtering min.cells=3, min.features=200
seurat_cells <- seurat.obj@assays[["RNA"]] %>% colnames()
filtered_mat <- mat[,which(colnames(mat) %in% seurat_cells)]
### Annotate Seurat object with species, sample, cluster id, cell type.
#parse out annotations from the cell ids
species_id <- sapply(strsplit(colnames(filtered_mat), "_"), function(v){return(v[1])})
sample_id <- sapply(strsplit(colnames(filtered_mat), "_"), function(v){return(v[2])})
cluster_id <- sapply(strsplit(colnames(filtered_mat), "_"), function(v){return(v[3])})
cell_type_id <- sapply(strsplit(colnames(filtered_mat), "_"), function(v){return(v[4])})
broad_cell_type_id <- sapply(strsplit(cell_type_id, "-"), function(v){return(v[1])})
#Add annotation to Seurat obj
seurat.obj <- AddMetaData(seurat.obj, species_id, col.name="species_id")
seurat.obj <- AddMetaData(seurat.obj, sample_id, col.name = "sample_id")
seurat.obj <- AddMetaData(seurat.obj, cluster_id, col.name = "cluster_id")
seurat.obj <- AddMetaData(seurat.obj, cell_type_id, col.name = "cell_type_id")
seurat.obj <- AddMetaData(seurat.obj, broad_cell_type_id, col.name = "broad_cell_type_id")
seurat.obj@meta.data
### Integrate
seurat.obj.list <- SplitObject(seurat.obj, split.by = "species_id")
features <- SelectIntegrationFeatures(object.list = seurat.obj.list)
anchors <- FindIntegrationAnchors(object.list = seurat.obj.list, anchor.features = features)
#repeatable no matter what object you choose.
seurat.obj.example <- seurat.obj.list[[1]]
all_genes <- seurat.obj.example@assays[["RNA"]] %>% row.names()
combined <- IntegrateData(anchorset = anchors, k.weight=k_weight, features.to.integrate = all_genes)
combined<- ScaleData(combined, verbose=FALSE)
combined<- RunPCA(combined, npcs=n_pcs, verbose=FALSE)
combined<- RunUMAP(combined, reduction = "pca", dims=UMAP_dim)
combined<- FindNeighbors(combined, reduction = "pca", dims = FindNeighbors_dim)
combined<- FindClusters(combined, resolution=FindClusters_res)
integrated<- combined[["integrated"]]@scale.data
integrated <- as.sparse(integrated)
#Save data
if (print==TRUE){
#create a directory for the matrix if it doesn't already exist
if(!dir.exists("matrix")){
dir.create("matrix")
}
#create a directory for the cross-species integrated matrix
if(!dir.exists("matrix/cross-species_integration")){
dir.create("matrix/cross-species_integration")
}
saveRDS(combined, "cross_species_integrated.rds")
write10xCounts("aqhumor_cross_species_integrated_mtx", integrated, version="3")
#move the matrix into the folder
file.rename("aqhumor_cross_species_integrated_mtx", "matrix/cross-species_integration/aqhumor_cross_species_integrated_mtx")
}
return(combined)
}
#' extract_gene_loadings
#'
#' @importFrom readr read_tsv
#' @importFrom dplyr left_join select
#' @importFrom Seurat Read10X
#' @importFrom stats prcomp
#' @importFrom tibble rownames_to_column
#'
#' @param ann_table Annotation table connecting gene ids to human gene symbols.
#' @param matrix_path Path to matrix in 10x format to do PCA on and extract gene loadings.
#'
#' @return gene_loadings_output A list containing 1) 'by_most_Positive': a data frame with gene symbols have been ordered by most positive loading for each PC 2) "by_most_Negative": A data frame with gene symbol have been ordered by mmost negative loading for each PC, "PCA_output": the object produced by performing PCA on the input matrix with prcomp.
#' @export
#'
#' @examples
extract_gene_loadings <- function(ann_table, matrix_path){
#load gene id to gene symbol ann file
ann_table <- read_tsv(ann_table)
#change underscores to dashes to match matrix
ann_table$gene_identifier <- gsub("_", "-", ann_table$gene_identifier)
#just use the human gene symbols
ann_table <- select(ann_table, c("gene_identifier", "human"))
colnames(ann_table) <- c("gene_identifier", "gene_symbol")
#load matrix
mat <- Read10X(matrix_path)
#replace gene ids with gene symbols
mat_names <- rownames(mat) %>% as.data.frame()
colnames(mat_names) <- "gene_identifier"
new_mat_names <- left_join(mat_names, ann_table, by = "gene_identifier")
new_mat_names <- new_mat_names %>% select("gene_symbol")
rownames(mat) <- new_mat_names[["gene_symbol"]]
#PCA
#must transpose for prcomp
mat <- as.matrix(mat) %>% t()
#prcomp
pca <- prcomp(mat,center=TRUE, scale=FALSE)
#pca$rotation are the gene loadings for each pc
loadings <- pca$rotation %>% as.data.frame()
#prep loadings df for lapply fun
loadings <- rownames_to_column(loadings, var="gene_symbol")
#index for lapply fun. First col of loadings is not a PC
n_pc <- 1:(ncol(loadings)-1)
#order each PC by most positive and most negative gene loadings
order <- lapply(n_pc, function(pc_index){
#name of PC column are we working iwth
PCn <- paste0("PC", pc_index)
#isolate gene symbol col and PC col
pc_column <- loadings[,c("gene_symbol", PCn)]
# "-" sorts by descending (highest value first)
by_most_pos <- pc_column[order(-pc_column[,2]),] %>% select(gene_symbol)
colnames(by_most_pos) <- PCn
by_most_neg <- pc_column[order(pc_column[,2]),] %>% select(gene_symbol)
colnames(by_most_neg) <- PCn
output <- list("positive" = by_most_pos, "negative" = by_most_neg)
return(output)
})
positive <- sapply(order, "[[", "positive")
positive_df <- do.call(cbind, positive)
negative <- sapply(order, "[[", "negative")
negative_df <- do.call(cbind, negative)
gene_loadings_output <- list("by_most_Positive" = positive_df, "by_most_Negative" = negative_df, "PCA_output" = pca)
return(gene_loadings_output)
}
#' id_errored_runs
#'
#' When `contml` runs error, it produces an empty `outtree` and `outfile`. We remove these, but now the index between the `contml` output files and scjack matrices do not match, leading to downstream analysis issues.
#' This function makes a list of matrices that produced an error when `contml` was run. We will then use this list to remove these matrices and proceed with downstream analyses.
#' This function is part of the workflow of the scjackknife analysis.
#'
#'
#' @importFrom utils write.table
#'
#' @param outtrees_dir Path to directory with contml `outtrees`. This directory contains all `outtree` files created by contml, including for runs that errored (ie unprocessed outtrees directory)
#' @param matrix_name_pattern Prefix for matrix name. This function assumes that your matrix names end in `index_num_mtx` eg. for `means_5_reps_540_cells_20PC_after_PCA_index_mtx`, the matrix_name_pattern is `means_5_reps_540_cells_20PC_after_PCA`
#'
#' @return prints out a list of scjackknife matrix names that produced empty tree files when run through contml.
#' @export
#'
#' @examples
#'
id_errored_runs <- function(outtrees_dir, matrix_name_pattern){
files_list <- list.files(path = outtrees_dir, full.names=TRUE, recursive=FALSE )
empty_paths <- files_list[file.size(files_list) == 0]
empty_files <- sapply(strsplit(empty_paths, "/"), function(v){return(v[length(v)])})
#convert to matrices dir names
names <- sapply(strsplit(empty_files, "_"), function(v){return(v[length(v)-1])})
matrix_names <- paste0(matrix_name_pattern,"_", names,"_mtx")
write.table(matrix_names, file = "errored_runs.txt", quote=FALSE, sep = "\t", col.names=FALSE, row.names=FALSE)
}
#' label_scjack_trees
#'
#' Comparison and calculation of TBE across scjackknife trees require that the tips share identical names
#' This script labels scjackknife outtrees by 1) their full tip names and 2) a common label - a tip name that contains only species and cell type cluster group id that can be matched acros scjackknife trees
#' Previously named label_scjack_trees_54.
#'
#' @importFrom ape write.tree
#'
#' @param n_trees The number of trees to be relabelled.
#' @param matrix_dir_path The path to the scjackknife matrices that were used to create the scjackknife infiles. This directory must have matrices that produced empty outtree files removed using cellphylo::id_errored_runs.
#' @param tree_dir_path The path to the scjakknife outtrees produced by contml. This must have empty outtree files removed.
#' @param print Boolean. Print out trees with 1) full labels and 2) common labels
#' @param file_name_prefix The prefix for the outtree file names
#' @return output A list containing scjackknife trees with full tip name labels ("labelled_tree") and scjackknife trees with comomn labels ("common_labels_tree)
#' @export
#'
#' @examples
label_scjack_trees<- function(n_trees, matrix_dir_path, tree_dir_path, print=TRUE, file_name_prefix){
if (print==TRUE){
if(!dir.exists(paste0(file_name_prefix, "_label"))){
dir.create(paste0(file_name_prefix, "_label"))
}
if(!dir.exists(paste0(file_name_prefix, "_common_label"))){
dir.create(paste0(file_name_prefix, "_common_label"))
}
}
tree_list <- list.files(path = tree_dir_path, full.names=TRUE, recursive=FALSE )
matrix_list <- list.dirs(path = matrix_dir_path, full.names=TRUE, recursive=FALSE)
lapply(c(1:n_trees), function(tree_index){
#label the tree with the full cell ids
labelled_tree <- label_tree(matrix_path = matrix_list[tree_index], tree_path = tree_list[tree_index], print=FALSE)
#parse out cell id
file_name <- sapply(strsplit(tree_list[tree_index], "/"), function(v){return(v[length(v)])})
index_num <- sapply(strsplit(file_name, "_"), function(v){return(v[length(v)-1])})
suffix <- sapply(strsplit(file_name, "_"), function(v){return(v[length(v)])})
if (print==TRUE){
#print out trees with human readable tips. Full cell ids.
write.tree(labelled_tree, file=paste0(file_name_prefix,"_labels_", index_num, "_", suffix))
file.rename(paste0(file_name_prefix,"_labels_", index_num, "_", suffix), paste0(file_name_prefix, "_label/",file_name_prefix,"_labels_", index_num, "_", suffix))
}
#bootstrapping requires common label across jackknife trees so that tips correspond
#make common labels
species <- sapply(strsplit(labelled_tree$tip.label, "_"), function(v){return(v[1])})
cluster_id <- sapply(strsplit(labelled_tree$tip.label, "_"), function(v){return(v[3])})
new_label <- paste0(species, "_", cluster_id)
common_labels_tree <- labelled_tree
common_labels_tree$tip.label <- new_label
if (print==TRUE){
#print out common labels tree
write.tree(common_labels_tree, file=paste0(file_name_prefix, "_common_labels_", index_num, "_", suffix))
file.rename(paste0(file_name_prefix, "_common_labels_", index_num, "_", suffix), paste0(file_name_prefix, "_common_label/",file_name_prefix, "_common_labels_", index_num, "_", suffix))
}
output <- list("labelled_tree" = labelled_tree, "common_labels_tree" = common_labels_tree)
return(output)
} #close apply function
)#close apply
}
#' label_tree
#'
#' Replace the taxon number labels in the contml outtree with the cell ids in the input matrix used to make the contml infile.
#'
#' @importFrom Seurat Read10X
#' @importFrom ape read.tree write.tree
#' @importFrom dplyr left_join
#'
#' @param matrix_path Path to matrix from which the contml infile was created. The taxon numbers of the infile correspond to the matrix cell ids in sequence.
#' @param tree_path Path to the raw phylogeny produced by contml ("outtree" file)
#' @param file_name File name for the labelled tree this function produces.
#' @param print Print out the tree file of the labelled tree. Tree is given in newick format.
#'
#' @return A phylo object of the labelled tree.
#' @export
#'
#' @examples
label_tree = function(matrix_path,tree_path, file_name, print){
mat <- Read10X(matrix_path)
#connect cell ids to index
key.df <- data.frame(cell_id = rownames(mat), index = as.character(c(1:nrow(mat))))
#load tree
tree <- read.tree(tree_path)
plot(tree)
#find tip label
tips.df <- data.frame(index = tree$tip.label)
#connect tip id to cell id
new_tip.df <- left_join(tips.df, key.df, by="index")
#replace old tip indices with new tip cell id labels
tree$tip.label <- new_tip.df$cell_id
plot(tree)
if (print==TRUE){
write.tree(tree, file=file_name)
}
return(tree)
}
#' make_infile
#'
#' Converts a matrix to contml (PHYLIP) infile format.
#'
#' @importFrom Seurat Read10X
#' @importFrom utils write.table
#' @param matrix Matrix to create contml infile from. Use this if you have the matrix as an R object.
#' @param matrix_path Path to matrix to create contml infile from. Use this if you have a printed matrix in 10x feature-barcode format.
#' @param print Print the infile
#'
#' @return An object representing a matrix formatted in contml infile format. Contml requires a printed infile so print=TRUE is recommended.
#' @export
#'
#' @examples
make_infile = function(matrix, matrix_path, print){
if (!missing(matrix_path) && missing(matrix)){
mat <- Read10X(matrix_path)
}
if (!missing(matrix) && missing(matrix_path)){
mat <- matrix
}
ntaxa <- nrow(mat)
nchar <- ncol(mat)
NA_col <- rep(NA, times=nrow(mat))
phylip=""
phylip <- cbind(1:nrow(mat), NA_col, NA_col,NA_col,NA_col,NA_col,NA_col,NA_col,NA_col,NA_col,NA_col,NA_col,NA_col,NA_col, mat)
phylip_final <- rbind(c(rep(NA, times=4), ntaxa, rep(NA, times=3), nchar, rep(NA, times=ncol(phylip)-9)), phylip)
if (print == TRUE){
print(write.table(phylip_final, file="infile", quote=FALSE, sep=" ", row.names=FALSE, col.names=FALSE, na= ""))}
return(phylip_final)
}
#' make_pc_sweep_infiles
#'
#' This function produces the input files for the PC sweep experiment in Mah & Dunn (2023) (Fig.2).
#' It takes in the 919 cell 919 PC matrix (Fig. S1, step 7) and subsets the matrix to an increasing number of PCs. These matrix subsets are printed out as `contml` infiles.
#' If `calculate_PCA = TRUE` the function will first perform PCA on the matrix before subsetting and creating the infiles.
#'
#' @param matrix_path Path to a matrix that has had PCA performed on it.
#' @param run_PCA Boolean. If TRUE, calculate PCA on the input matrix before subsetting and producing infiles (i.e. the input matrix has not had PCA performed on it yet)
#' @param PC_range A range indicating the number of PCs to subset from the input matrix.
#'
#' @return This function prints out the `contml` infiles for the PC sweep experiment
#' @export
#'
#' @examples
#'
make_pc_sweep_infiles <- function(matrix_path, run_PCA = FALSE, PC_range=c(3:100)){
mat <- Read10X(matrix_path) %>% as.matrix()
if(run_PCA == TRUE){
mat <- t(mat)
#pca
pc <- prcomp(mat, center=TRUE, scale=FALSE)
x <- pc$x
#normalize sd
sds <- apply(x,2,sd)
x.scaled <- t(t(x)/sds)
apply(x.scaled, 2, sd)
mat <- x.scaled
}
#set up directories to deposit infiles into
if(!dir.exists("PC_sweep")){
dir.create("PC_sweep")
}
#create a directory for the combined matrix
if(!dir.exists("PC_sweep/infiles")){
dir.create("PC_sweep/infiles")
}
lapply(PC_range, function(c){
x.sweep.matrix <- mat[,1:c]
#format to phylip
phylip <- make_infile(matrix = x.sweep.matrix, print=FALSE)
#write table
write.table(phylip, file=paste0("PC_sweep/infiles/infile_pca_var_norm_pc_sweep_",c), quote=FALSE, sep=" ", row.names=FALSE, col.names=FALSE, na= "")
print(paste0("infile ", c))
} #close function
) #close lapply
}
#' make_scjack_sample
#'
#' Create resampled matrices that contain 1 replicate cell per cell type group from a larger matrix that contains multiple replicates. This is used for running the scjackknife analysis.
#' Previously named make_scjack_sample54
#'
#' @importFrom utils write.table
#' @importFrom DropletUtils write10xCounts
#' @importFrom Seurat Read10X
#'
#' @param n_samples Number of scjackknife samples to create.
#' @param subsample_matrix_path Path to the matrix to sample from.
#' @param full_matrix_path Path to full unsubsetted matrix. PCA should already have been performed and PCs subsetted.
#' @return pca_matrix List of resampled single jackknife matrices.
#' @export
#'
#' @examples
make_scjack_sample <- function(n_samples, subsample_matrix_path, full_matrix_path, file_name){
#load the full 919 cell 20 PC matrix for subsetting down below
full_pca_mat <- Read10X(full_matrix_path)
#make directories to organize
if(!dir.exists("scjackknife")){
dir.create("scjackknife")
}
if(!dir.exists(paste0("scjackknife/input_", file_name))){
dir.create(paste0("scjackknife/input_", file_name))
}
if(!dir.exists(paste0("scjackknife/input_", file_name,"/scjack_selected_cells"))){
dir.create(paste0("scjackknife/input_", file_name,"/scjack_selected_cells"))
}
if(!dir.exists(paste0("scjackknife/input_", file_name,"/scjack_matrices"))){
dir.create(paste0("scjackknife/input_", file_name,"/scjack_matrices"))
}
if(!dir.exists(paste0("scjackknife/input_", file_name, "/scjack_infiles"))){
dir.create(paste0("scjackknife/input_", file_name, "/scjack_infiles"))
}
sapply(n_samples, function(n_samples){
#draw random subset of cells. 1 cell per cell type cluster per speciesu
sampled_cells <- subset_combined_matrix(matrix_path = subsample_matrix_path, sample_size=1, print=FALSE)
#record list of selected cells
subset_list <- colnames(sampled_cells)
#rank reduce matrix
#pca_matrix <- run_pca(matrix_path = matrix_path, n_PCs = 20, subset = subset_list, print=FALSE)
pca_matrix <- full_pca_mat[rownames(full_pca_mat) %in% subset_list,]
#make infile
phylip <- make_infile(matrix = pca_matrix, print = FALSE)
## print out scjackknife input files
## print outside of the two above functions so can have sequential file names
## record the subsampled cell ids
write.table(subset_list, paste0("scjackknife_", file_name,"_selected_cells_",n_samples, ".txt"), quote=FALSE, row.names = FALSE, col.names = FALSE)
file.rename(paste0("scjackknife_", file_name,"_selected_cells_",n_samples, ".txt"), paste0("scjackknife/input_", file_name,"/scjack_selected_cells/scjackknife_", file_name,"_selected_cells_",n_samples, ".txt"))
## write out subasmpled matrix
write10xCounts(paste0("scjackknife_", file_name,"_",n_samples, "_mtx"), pca_matrix, version="3")
file.rename(paste0("scjackknife_", file_name,"_",n_samples, "_mtx"), paste0("scjackknife/input_", file_name,"/scjack_matrices/scjackknife_", file_name,"_",n_samples, "_mtx"))
## phylip infile
write.table(phylip, file=paste0("infile_scjackknife_", file_name,"_", n_samples), quote=FALSE, sep=" ", row.names=FALSE, col.names=FALSE, na= "")
file.rename(paste0("infile_scjackknife_", file_name,"_", n_samples), paste0("scjackknife/input_", file_name,"/scjack_infiles/infile_scjackknife_", file_name,"_", n_samples))
return(pca_matrix)
} #close function
)#close sapply
} #close scjackknife_make_input()
#' plot_eigs
#'
#' Perform PCA and produce eigenvalue plots for a matrix. These plots are 1) eigenvalue vs PC, 2) eigenvalue cumulative sum, 3) eigenvalue vs percent variance, and 4) percent variance vs PC.
#'
#' @import ggplot2
#' @importFrom Seurat Read10X
#' @importFrom stats prcomp sd
#' @importFrom factoextra get_eigenvalue
#' @importFrom grDevices pdf dev.off
#'
#' @param matrix_path Path to matrix PCA is to be performed on.
#' @param highlight The number of the PC you would like to highlihgt in the plot. The dot representing this PC will be highlighted in red. Default is PC20, used in Mah & Dunn (2023)
#' @param print Boolean. Print out plots.
#' @return plots A list containing the four eigenvalue plots as ggplot objects
#' @export
#'
#' @examples
plot_eigs <- function(matrix_path, highlight=20, print=TRUE){
#within-species normalized, combined and integrated cross-species. NOT subsetted.
#unfiltered raw data
mat <- Read10X(matrix_path)
#transpose the matrix for the pca function.
mat <- as.matrix(mat) %>% t()
#pca
pca <- prcomp(mat, center=TRUE, scale=FALSE)
#eigenvalues, % var, cumulative % var
eigvals <- get_eigenvalue(pca)
#build plots
#1. eigenvalue vs PC
plot.df <- data.frame(order = 1:length(eigvals$eigenvalue), eigenvalue = eigvals$eigenvalue)
plot.df.highlight <- plot.df[highlight,]
p1 <- ggplot(plot.df, aes(x=order, y=eigenvalue)) + geom_point() + geom_point(data = plot.df.highlight, colour = "red" ) + theme_bw() +theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
#2. eigenvalue cumulative sum
plot.df <- data.frame(order = 1:length(cumsum(eigvals$eigenvalue)), eigenvalue_cum_sum = cumsum(eigvals$eigenvalue))
p2 <- ggplot(plot.df, aes(x=order, y=eigenvalue_cum_sum)) + geom_point() +theme_bw() +xlab("order") +ylab("eigenvalue cumulative sum")
#3 eigenvalue vs percent variance
plot.df <- data.frame(eigenvalue = eigvals$eigenvalue, percent_variance = eigvals$variance.percent)
p3 <- ggplot(plot.df, aes(x=eigenvalue, y=percent_variance)) + geom_point() +theme_bw() + xlab("eigenvalue") + ylab("percent variance")
#how much variance is described by the first few PCs?
#aqhumor_cross_species_integrated_mtx/: 26.65411
print(paste0("The amount of variance described by the first ",highlight," PCs is: ",sum(plot.df$percent_variance[1:highlight])))
# percent variance vs PC
plot.df <- data.frame(component = 1:length(eigvals$variance.percent), percent_variance = eigvals$variance.percent)
plot.df.highlight <- plot.df[highlight,]
p4 <- ggplot(plot.df, aes(x = component, y = percent_variance)) + geom_point(size=0.5) + xlab("Principal Component") + ylab("Percent Variance") + geom_point(data = plot.df.highlight, colour = "red", size=0.5) + theme_bw() +theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"), axis.text.x=element_text(size=6.5), axis.text.y = element_text(size=6.5), axis.title = element_text(size = 7))
if (print==TRUE){
if(!dir.exists("eigenvalue_plots")){
dir.create("eigenvalue_plots")
}
#4 variance vs
#line plot of eigenvalues ordered
pdf(file="eigenvalue_plots/1_eigenvalue_vs_PC.pdf", width=8, height=5)
print(p1)
dev.off()
#cumulative sum
pdf(file="eigenvalue_plots/2_eigenvalue_cumulative_sum.pdf", width=8, height=5)
print(p2)
dev.off()
#variance of each component vs eigenvalue
pdf(file="eigenvalue_plots/3_eigenvalue_vs_variance.pdf", width=5.5, height=5)
print(p3)
dev.off()
#component variance
pdf(file="eigenvalue_plots/4_eigenvalue_component_variance.pdf", width=8, height=5)
print(p4)
dev.off()
}
plots <- list("1_eigenvalue_vs_PC" = p1, "2_eigenvalue_cumulative_sum" = p2, "3_eigenvalue_vs_variance" = p3, "4_eigenvalue_component_variance" = p4)
return(plots)
}
#' plot_pc_sweep
#'
#' Plots the variables calculated by calc_pc_sweep_var. Produces Figs 2A, 2B, 2C. Default values are what were used in Mah & Dunn (2023).
#'
#' @import ggplot2
#' @importFrom grDevices pdf dev.off
#'