-
Notifications
You must be signed in to change notification settings - Fork 0
/
SLIDER_ScoreReplicates.R
382 lines (342 loc) · 24.6 KB
/
SLIDER_ScoreReplicates.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env SLIDER_ScoreReplicates.R
#!/usr/local/bin/ Rscript
args = commandArgs(trailingOnly = TRUE)
if (length(args)<3) {
stop('Missing one or more input arguments. Provide (counts file for unsorted sample), (counts file for sorted sample), and (a name you will use to identify this analysis)', call.=FALSE)
}
if (length(args)>3) {
stop('You have too many input arguments. Provide (counts file for unsorted sample), (counts file for sorted sample), and (a name you will use to identify this analysis)', call.=FALSE)
}
#Read in supplied counts data and title/label for this sort
Unsort <- read.table(args[1], header=FALSE, row.names=1, quote='\'', comment.char='', stringsAsFactors = FALSE)
Sort <- read.table(args[2], header=FALSE, row.names=1, quote='\'', comment.char='', stringsAsFactors = FALSE)
StudyName <- as.character(args[3])
#Create a dataframe for read counts at each guide present in Sorted and Unsorted samples
Data <- merge(Unsort, Sort, by=0, all.x=TRUE)
Data$V2.y<-NULL
Data$V3.y<-NULL
#Assigning a value of 0 counts to sgRNA present as NAs (these are guides present in the Unsorted or Sorted Sample but not captured by NGS of the other)
Data[is.na(Data)]<-0
#Assigning Labels to Values
Data$AdditionalGuideInfo<-Data$V2.x
Data$V2.x<-NULL
Data$GeneName<-Data$V3.x
Data$V3.x<-NULL
Data$UnsortCounts<-Data$V4.x
Data$SortCounts<-Data$V4.y
Data$V4.x<-NULL
Data$V4.y<-NULL
#Normalizing Counts for Read Depth Variation (Counts Per Million) and log transforming using log2(CPM+2) pseudocount for plotting
UnsortSizeFactor<-(sum(Data$UnsortCounts))/1000000
SortSizeFactor<-(sum(Data$SortCounts))/1000000
Data$UnsortNormCounts<-(Data$UnsortCounts/UnsortSizeFactor)
Data$SortNormCounts<-(Data$SortCounts/SortSizeFactor)
Data$Unsort_log2PC <- log2(Data$UnsortNormCounts+2)
Data$Sort_log2PC <- log2(Data$SortNormCounts+2)
#Assigning sgRNA Ranks for Each Sample Based on Raw Read Counts and Calculating Difference in Rank for Statistical Testing
Data$Unsort_Rank<-rank(x=Data$UnsortCounts)
Data$Sort_Rank<-rank(x=Data$SortCounts)
Data$Unsort_NormRank<-Data$Unsort_Rank/length(Data$Unsort_Rank)
Data$Sort_NormRank<-Data$Sort_Rank/length(Data$Sort_Rank)
Data$SortVsUnsort_NormRankDiff<-Data$Sort_NormRank-Data$Unsort_NormRank
#Dividing Data Set into sgRNAs that increased in Rank (positive rank diff) and sgRNAs that decreased or stayed the same (rank shift <=0)
DataDown<-Data[which(Data$SortVsUnsort_NormRankDiff <= 0),]
DataUp<-Data[which(Data$SortVsUnsort_NormRankDiff > 0),]
#Computing Local Estimates for the Diversity Parameter (b) for a Laplace Distribution for the Normalized Change in sgRNA Ranks (Sort vs Unsort) by
#sampling 1000 sgRNA closest in rank to the current sgRNA within the unsorted dataset and defining an expected value of 0 for a null change in rank.
#Using the maximum likelihood estimator to calculate b. Performing estimate for two half-functions, one for guides that enriched after sorting (rank
#shift>0) and one for guides that did not (rank shift<=0). Applying distribution with locally estimated directional diversity to compute sgRNA pvalue.
Laplace_pval <- function(a,mu,b) {
p<-0.5*(exp(-1*abs(a-mu)/b))
p_twotail<-2*p
return (p_twotail)
}
center = 0
diversity = sum(abs(DataUp$SortVsUnsort_NormRankDiff))/length(DataUp$SortVsUnsort_NormRankDiff)
DataUp_Ordered<-DataUp[order(DataUp$UnsortNormCounts),]
DataUp_Ordered$idx<-seq(from=1,to=(length(DataUp$SortVsUnsort_NormRankDiff)))
DataUp_Ordered$minRange<-DataUp_Ordered$idx - 499
DataUp_Ordered$maxRange <-DataUp_Ordered$idx + 500
DataUp_Ordered$minRange[which(DataUp_Ordered$idx < 500)] <- 1
DataUp_Ordered$maxRange[which(DataUp_Ordered$idx < 500)] <- 1000
DataUp_Ordered$minRange[which(DataUp_Ordered$idx > length(DataUp_Ordered$SortVsUnsort_NormRankDiff)-500)]<-length(DataUp_Ordered$SortVsUnsort_NormRankDiff)-999
DataUp_Ordered$maxRange[which(DataUp_Ordered$idx > length(DataUp_Ordered$SortVsUnsort_NormRankDiff)-500)]<-length(DataUp_Ordered$SortVsUnsort_NormRankDiff)
i=0
while(i<length(DataUp_Ordered$idx)){
i=i+1
Nearest1K<-DataUp_Ordered$SortVsUnsort_NormRankDiff[which(DataUp_Ordered$idx>=DataUp_Ordered$minRange[i] & DataUp_Ordered$idx<=DataUp_Ordered$maxRange[i])]
DataUp_Ordered$b[i]<-sum(abs(Nearest1K - 0))/length(Nearest1K)
}
DataUp_Ordered$sgRNA_pvalue<- Laplace_pval(DataUp_Ordered$SortVsUnsort_NormRankDiff,mu=0,DataUp_Ordered$b)
DataUp_Ordered$idx<-NULL
DataUp_Ordered$minRange<-NULL
DataUp_Ordered$maxRange<-NULL
center = 0
diversity = sum(abs(DataDown$SortVsUnsort_NormRankDiff))/length(DataDown$SortVsUnsort_NormRankDiff)
DataDown_Ordered<-DataDown[order(DataDown$UnsortNormCounts),]
DataDown_Ordered$idx<-seq(from=1,to=(length(DataDown$SortVsUnsort_NormRankDiff)))
DataDown_Ordered$minRange<-DataDown_Ordered$idx - 499
DataDown_Ordered$maxRange <-DataDown_Ordered$idx + 500
DataDown_Ordered$minRange[which(DataDown_Ordered$idx < 500)] <- 1
DataDown_Ordered$maxRange[which(DataDown_Ordered$idx < 500)] <- 1000
DataDown_Ordered$minRange[which(DataDown_Ordered$idx > length(DataDown_Ordered$SortVsUnsort_NormRankDiff)-500)]<-length(DataDown_Ordered$SortVsUnsort_NormRankDiff)-999
DataDown_Ordered$maxRange[which(DataDown_Ordered$idx > length(DataDown_Ordered$SortVsUnsort_NormRankDiff)-500)]<-length(DataDown_Ordered$SortVsUnsort_NormRankDiff)
i=0
while(i<length(DataDown_Ordered$idx)){
i=i+1
Nearest1K<-DataDown_Ordered$SortVsUnsort_NormRankDiff[which(DataDown_Ordered$idx>=DataDown_Ordered$minRange[i] & DataDown_Ordered$idx<=DataDown_Ordered$maxRange[i])]
DataDown_Ordered$b[i]<-sum(abs(Nearest1K - 0))/length(Nearest1K)
}
DataDown_Ordered$sgRNA_pvalue<- Laplace_pval(DataDown_Ordered$SortVsUnsort_NormRankDiff,mu=0,DataDown_Ordered$b)
DataDown_Ordered$idx<-NULL
DataDown_Ordered$minRange<-NULL
DataDown_Ordered$maxRange<-NULL
Data_Ordered<-rbind(DataUp_Ordered,DataDown_Ordered)
#An sgRNA that had no change in rank will recieve a p-value of 1 from the laplace function above. This will lead to logits that are infinite in steps below. Replacing
#these p-values with 0.99999995 to prevent 0 values from impairing calculations later in this script.
Data_Ordered$sgRNA_pvalue[which(Data_Ordered$SortVsUnsort_NormRankDiff==0)]<-0.99999995
#The next step is a filter. Not all sgRNA efficiently knockout/activate/repress their targets in every cell line. Some sgRNAs also have cell-line specific off-target
#activity. This causes a minority of sgRNA designed to target a gene to behave differently from sgRNA that successfully target the gene. In a screen where evidence is
#combined across independant sgRNA to nominate genes, retaining these sgRNA reduces sensitivity. To overcomes this, a filter is implemented to exclude sgRNA that are
#discordant in their behavior from that of the predominate behavior of sgRNAs designed against a specific gene.
#
#Filtering Steps:
##Identify Genes Where Fisher Test Finds the Fraction of sgRNAs that Increase in Rank at a Gene is Greater than the Fraction Across the Library. Label as UpGene.
#Identify Genes Where Fisher Test Finds the Fraction of sgRNAs that Decrease in Rank at a Gene is Greater than the Fraction Across the Library. Label as DownGene.
#Identify Genes Where Fisher Tests Found No Such Difference for Either Fraction. Label as Neutral. Genes categorized by these tests as both UpGenes and DownGenes
#are relabeled as Neutral on the grounds that sorting lead to the sgRNAs for that gene becoming dispersed but without any prefferance for direction. Using a
#permissive Fisher's Exact pvalue of 0.1 for this step. Feel free to tune this filtering parameter for the needs of your library, screen, and dataset. Once genes
#are labeled, those sgRNAs that are discordant with the genes category label are excluded from further analysis.
#Categorizing sgRNA with p<0.2 as active and enriched (positive rank change) or depleted (negative rank change). sgRNA unable to acheive p<0.2 after sorting are
#are categorized as neutral.
Data_Ordered$sgRNA_Up <-0
Data_Ordered$sgRNA_Up[which(Data_Ordered$sgRNA_pvalue<0.2 & Data_Ordered$SortVsUnsort_NormRankDiff>0)]<-1
Data_Ordered$sgRNA_Down <-0
Data_Ordered$sgRNA_Down[which(Data_Ordered$sgRNA_pvalue<0.2 & Data_Ordered$SortVsUnsort_NormRankDiff<0)]<-1
Data_Ordered$sgRNA_Neutral <-1
Data_Ordered$sgRNA_Neutral[which(Data_Ordered$sgRNA_Up==1)]<-0
Data_Ordered$sgRNA_Neutral[which(Data_Ordered$sgRNA_Down==1)]<-0
#Performing Fisher's Exact Test Comparing the Fraction of sgRNAs for Gene-i that Increase in Rank to the Fraction of sgRNAs that Do So Across the Whole Library.
Gene_sgRNAcount<-aggregate(Data_Ordered$sgRNA_Up, by=list(Data_Ordered$GeneName),FUN=length)
Gene_sgRNAup<-aggregate(Data_Ordered$sgRNA_Up, by=list(Data_Ordered$GeneName),FUN=sum)
UpGeneSummary<-merge(Gene_sgRNAcount, Gene_sgRNAup, by='Group.1')
UpGeneSummary$GeneName<-UpGeneSummary$Group.1
UpGeneSummary$NsgRNA<-UpGeneSummary$x.x
UpGeneSummary$NsgRNAup<-UpGeneSummary$x.y
LibraryUp<-sum(Data_Ordered$sgRNA_Up)
LibraryTotal<-length(Data_Ordered$sgRNA_Up)
FisherExactPvalue<-Vectorize(function(NumUp,NumTotal,LibUp,LibTotal) {
out<-fisher.test(matrix(c(NumUp, NumTotal-NumUp, LibUp, LibTotal-LibUp), ncol=2))
return(out$p.value)
})
UpGeneSummary$EnrichedPvalue<-FisherExactPvalue(UpGeneSummary$NsgRNAup,UpGeneSummary$NsgRNA,LibraryUp,LibraryTotal)
UpGeneSummary$Group.1<-NULL
UpGeneSummary$x.x<-NULL
UpGeneSummary$x.y<-NULL
#Performing Fisher's Exact Test Comparing the Fraction of sgRNAs for Gene-i that Decrease in Rank to the Fraction of sgRNAs that Do So Across the Whole Library.
Gene_sgRNAdown<-aggregate(Data_Ordered$sgRNA_Down, by=list(Data_Ordered$GeneName),FUN=sum)
DownGeneSummary<-merge(Gene_sgRNAcount, Gene_sgRNAdown, by='Group.1')
DownGeneSummary$GeneName<-DownGeneSummary$Group.1
DownGeneSummary$NsgRNA<-DownGeneSummary$x.x
DownGeneSummary$NsgRNAdown<-DownGeneSummary$x.y
LibraryDown<-sum(Data_Ordered$sgRNA_Down)
LibraryTotal<-length(Data_Ordered$sgRNA_Up)
DownGeneSummary$DepletedPvalue<-FisherExactPvalue(DownGeneSummary$NsgRNAdown,DownGeneSummary$NsgRNA,LibraryDown,LibraryTotal)
DownGeneSummary$Group.1<-NULL
DownGeneSummary$x.x<-NULL
DownGeneSummary$x.y<-NULL
UpGenes<-UpGeneSummary[which(UpGeneSummary$EnrichedPvalue<0.1),]
DownGenes<-DownGeneSummary[which(DownGeneSummary$DepletedPvalue<0.1),]
BothSets<-UpGenes[which(UpGenes$GeneName %in% DownGenes$GeneName),]
Enriched<-UpGenes[which(!(UpGenes$GeneName %in% BothSets$GeneName)),]
Depleted<-DownGenes[which(!(DownGenes$GeneName %in% BothSets$GeneName)),]
Gene_sgRNAneut<-aggregate(Data_Ordered$sgRNA_Neutral, by=list(Data_Ordered$GeneName),FUN=sum)
NeutGeneSummary<-merge(Gene_sgRNAcount, Gene_sgRNAneut, by='Group.1')
NeutGeneSummary$GeneName<-NeutGeneSummary$Group.1
NeutGeneSummary$NsgRNA<-NeutGeneSummary$x.x
NeutGeneSummary$NsgRNAneutral<-NeutGeneSummary$x.y
NeutGeneSummary$Group.1<-NULL
NeutGeneSummary$x.x<-NULL
NeutGeneSummary$x.y<-NULL
Neutral<-NeutGeneSummary[which(!(NeutGeneSummary$GeneName %in% Enriched$GeneName) & !(NeutGeneSummary$GeneName %in% Depleted$GeneName)) ,]
#Subsetting Data to Exclude sgRNAs that Were Discordant with the Category Assigned to Each Gene
EnrichedGenes<-Data_Ordered[which(Data_Ordered$GeneName %in% Enriched$GeneName),]
EnrichedGenes_UPsgRNAs<-EnrichedGenes[which(EnrichedGenes$sgRNA_Up==1),]
EnrichedGenes_UPsgRNAs$WeightedLogits<-(EnrichedGenes_UPsgRNAs$SortVsUnsort_NormRankDiff)*log(EnrichedGenes_UPsgRNAs$sgRNA_pvalue/(1-EnrichedGenes_UPsgRNAs$sgRNA_pvalue))
DepletedGenes<-Data_Ordered[which(Data_Ordered$GeneName %in% Depleted$GeneName),]
DepletedGenes_DOWNsgRNAs<-DepletedGenes[which(DepletedGenes$sgRNA_Down==1),]
DepletedGenes_DOWNsgRNAs$WeightedLogits<-(DepletedGenes_DOWNsgRNAs$SortVsUnsort_NormRankDiff)*log(DepletedGenes_DOWNsgRNAs$sgRNA_pvalue/(1-DepletedGenes_DOWNsgRNAs$sgRNA_pvalue))
NeutGenes<-Data_Ordered[which(Data_Ordered$GeneName %in% Neutral$GeneName),]
NeutGenes_NEUTsgRNAs<-NeutGenes[which(NeutGenes$sgRNA_Neutral==1),]
NeutGenes_NEUTsgRNAs$WeightedLogits<-(NeutGenes_NEUTsgRNAs$SortVsUnsort_NormRankDiff)*log(NeutGenes_NEUTsgRNAs$sgRNA_pvalue/(1-NeutGenes_NEUTsgRNAs$sgRNA_pvalue))
Data_Filtered<-rbind(EnrichedGenes_UPsgRNAs,NeutGenes_NEUTsgRNAs,DepletedGenes_DOWNsgRNAs)
#This step deploys the weighted logit method for combining p-values to arrive at a gene-level score by combining evidence across the sgRNAs targetting the
#gene that are present after the filtering step above. Mudholkar and George (see https://apps.dtic.mil/dtic/tr/fulltext/u2/a049993.pdf) showed that the sum
#of logit(p-value) takes the form of a scaled Student's t-distribution with 5m+4 degrees of freedom (df) and the scalar C=sqrt((0.3*df)/(m*(df-2))), where m
#is the number of p-values combined. When including weights, the sum of Weight*logit(p) also follows a T-distribution, but df and C become a function of the
#weights. Applying Mudholkar and George's closed form solution for C and df would therefore involve calculating a unique df and scaler at each gene. For
#computational simplicity the implementation below instead assigns a conservative df=2 and uses an L2E estimate to determine an optimal C across the genes scored.
#We find df=2 to be conservative as the unweighted method would have df=14 for combining p-values from just two sgRNA and most screens aim for 3+ active sgRNAs
#of genes scored. When calculating our score, we weight logit(p) by the normalized change in rank to arrive at a combined score that accounts for significance
#(p-value) and effect size (change in rank) at each sgRNA. The sum/C yields a gene level T-statisitic we've labeled SLIDER scores. p-values for each gene are
#then computed from a Student's t-distribution with df=2. Multiple comparisons are corrected for using a Benjamini-Hochberg estimate of the FDR at each gene.
#This script scores a single unsorted-vs-sorted replicate. To facilitate generating screen-level summary scores by combining replicates using Stouffer's method,
#a gene level Zequiv score computed by taking the inverse-normal(Gene_p-value) and matching its sign to the SLIDER-score.
GeneStat<-aggregate(Data_Filtered$WeightedLogits, by=list(Data_Filtered$GeneName),FUN=sum)
GeneStat$GeneName<-GeneStat$Group.1
GeneStat$SLIDER_NegWeightedLogitSum<-(-1*GeneStat$x)
GeneStat$Group.1<-NULL
GeneStat$x<-NULL
Gene_meanRankDiff<-aggregate(Data_Filtered$SortVsUnsort_NormRankDiff, by=list(Data_Filtered$GeneName),FUN=mean)
Gene_meanRankDiff$GeneName<-Gene_meanRankDiff$Group.1
Gene_meanRankDiff$meanDiffRank<-Gene_meanRankDiff$x
Gene_meanRankDiff$Group.1<-NULL
Gene_meanRankDiff$x<-NULL
GeneScores<-merge(x=GeneStat,y=Gene_meanRankDiff,by='GeneName')
#Using the 99% of Neutral Genes with Weighted Logit Sums Nearest to the Median to Inform an L2E Estimate of the
#scalar of the Negative Weigthed Logit Sum that best approximates a t-distribution with 2-degrees of freedom.
#This is done to limit screen hits (not neutral) and outliers (i.e. neutral genes with read out issues) from
#biasing the value estimated for the scalar.
L2E_neuts<-GeneScores$SLIDER_NegWeightedLogitSum[which(GeneScores$GeneName %in% NeutGenes$GeneName)]
L2E_neuts_ordered<-sort(L2E_neuts)
StartCount<-round((0.005*length(L2E_neuts_ordered)),0)
EndCount<-round((0.995*length(L2E_neuts_ordered)),0)
L2E_presubset<-L2E_neuts_ordered[StartCount:EndCount] - median(L2E_neuts_ordered[StartCount:EndCount])
if (abs(min(L2E_presubset))<abs(max(L2E_presubset))) {
L2E_subset<-L2E_presubset[which(abs(L2E_presubset)<abs(min(L2E_presubset)))]
}
if (abs(min(L2E_presubset))>abs(max(L2E_presubset))) {
L2E_subset<-L2E_presubset[which(abs(L2E_presubset)<abs(max(L2E_presubset)))]
}
degfree<-2
params<-c(0.2)
names(params) <- c('scalar')
obj=function(S){
((sqrt(pi)*gamma(x=((1/2)+(degfree))))/(((degfree)^(3/2))*S['scalar']*(beta(a=(degfree/2),b=(1/2)))^2*gamma(x = degfree)))-(2*mean((1/S['scalar'])*dt((L2E_subset)/S['scalar'],df=degfree)))
}
SLIDER_Tdist_params<-nlminb(params,obj, lower=0.01, upper=5)$par
#Calculating SLIDER_score T-statistic, determining p-value, correcting for multiple comparisons with FDR determined using the
#Benjamini-Hochberg method, and reporting an inverse-normal(p-value) Zscore that would achieve the same p-value in a two-tail test.
GeneScores$SLIDER_score<-(GeneScores$SLIDER_NegWeightedLogitSum-median(L2E_neuts_ordered[StartCount:EndCount]))/SLIDER_Tdist_params[1]
GeneScores$pvalue<-2*pt(q=(-1*abs(GeneScores$SLIDER_score)), df = degfree, lower.tail = TRUE, log.p=FALSE)
GeneScores$FDR<-p.adjust(GeneScores$pvalue,method = 'fdr')
GeneScores$Zequiv<-qnorm(GeneScores$pvalue/2)*(-1)*sign(GeneScores$SLIDER_score)
#Exporting Output Files. sgRNA level scoring and categroization are output into a seperate CSV file from gene level scores.
filename<-paste(StudyName,'SLIDER_L2E_Tdist.txt')
sink(filename)
print(SLIDER_Tdist_params)
sink()
Filename<-paste(StudyName,'_SLIDER_sgRNA_scores_categorization.csv')
write.csv(x = Data_Ordered, Filename)
Filename<-paste(StudyName,'_SLIDER_GeneScores.csv')
write.csv(x = GeneScores, Filename)
Filename<-paste(StudyName,'_SLIDER_EnrichedGenes_ActiveGuides.csv')
write.csv(x = Enriched, Filename)
Filename<-paste(StudyName,'_SLIDER_DepletedGenes_ActiveGuides.csv')
write.csv(x = Depleted, Filename)
Filename<-paste(StudyName,'_SLIDER_NeutralGenes_NeutralGuides.csv')
write.csv(x = Neutral, Filename)
#Outputting Plots to a PDF file
Filename<-paste(StudyName,'_SLIDER_plots.pdf')
pdf(file = Filename)
#Plotting Distribution of Normalized Change in sgRNA Rank between the sorted and unsorted sample
plotname<-paste(StudyName,'\n Histogram of Normalized Change in sgRNA Rank')
b_graph <- sum(abs(Data$SortVsUnsort_NormRankDiff - 0))/length(Data$SortVsUnsort_NormRankDiff)
ymax_plot <- round((1/(2*b_graph)),1)
x_graph <- seq(-1,1,0.01)
laplace_line <-(1/(2*b_graph))*exp(-abs(x_graph)/b_graph)
hist(Data$SortVsUnsort_NormRankDiff, breaks=75, xlim=c(-1,1), col=rgb(0,1,0,0.4), ylim=c(0,ymax_plot), main=plotname, xlab='Normalized Change in sgRNA Rank', prob=T)
lines(x_graph, laplace_line, type='l', col='darkgreen', lwd=2)
legend('topright', c('Normalized Change in Rank', 'Laplace Distribution'),col=c('green','darkgreen'), bty='y',lty=c(1,1,1), cex=0.75)
#Plotting Laplace b parameter as a function of normalized rank in the unsorted sample (will appear as two lines, one for b+ and one for b-)
plotname<-paste(StudyName,'\n Laplace Parameter "b" vs Unsorted sgRNA Rank')
plot(Data_Ordered$Unsort_NormRank, Data_Ordered$b, xlab = "sgRNA Rank Prior to Sorting", ylab="Laplace Parameter b", main=plotname, pch=20)
#Plotting Distibution of sgRNA Abundance (Log-2-Normalized Read Counts) Before and After Sorting
plotname<-paste(StudyName,'\n sgRNA Abundance Before and After Sorting')
hist(Data_Ordered$Unsort_log2PC, breaks=100, xlim=c(0,10), col=rgb(0,0,1,0.4), main=plotname, xlab='sgRNA Abundance (log2(PC))', prob=T, ylim=c(0,0.75))
hist(Data_Ordered$Sort_log2PC, breaks=100, xlim=c(0,10), col=rgb(1,0,0,0.4), prob=T, add=T)
legend('topright', c('Unsorted', 'Sorted'),col=c('blue','red'), bty='y',lty=c(1,1,1), cex=0.75)
#Plotting -log10(sgRNA_pvalue) Colormapped Scatter Plots and an Image Figure of Colormap
z=matrix(1:20,nrow=1)
x=1
y=c(0,0.75,1.5,max(-log10(Data_Ordered$sgRNA_pvalue)))
y=append(y,seq(from=2.0, to=5, len=17),after=3)
redColRamp <- colorRampPalette(c('black','red'))
sgRNApvalue_Col<- redColRamp(20)[as.numeric(cut(-log10(Data_Ordered$sgRNA_pvalue), breaks = y))]
plotname<-paste(StudyName,'\n Sorted vs Unsorted sgRNA Abundance')
plot(Data_Ordered$Unsort_log2PC, Data_Ordered$Sort_log2PC, col=sgRNApvalue_Col, xlab = "Unsorted (log2(PC))", ylab="Sorted (log2(PC))", main=plotname, pch=20)
#Plotting -log10(sgRNA_pvalue) Plots of Unsort-vs-Sort Rank and an Image Figure of Colormap
z=matrix(1:20,nrow=1)
x=1
y=c(0,0.75,1.5,max(-log10(Data_Ordered$sgRNA_pvalue)))
y=append(y,seq(from=2.0, to=5, len=17),after=3)
redColRamp <- colorRampPalette(c('black','red'))
sgRNApvalue_Col<- redColRamp(20)[as.numeric(cut(-log10(Data_Ordered$sgRNA_pvalue), breaks = y))]
plotname<-paste(StudyName,'\n Sorted vs Unsorted sgRNA Ranks')
plot(Data_Ordered$Unsort_NormRank, Data_Ordered$Sort_NormRank, col=sgRNApvalue_Col, xlab = "Unsorted sgRNA Rank", ylab="Sorted sgRNA Rank", main=plotname, pch=20)
#plotting LFC against Unsorted Rank for each sgRNA as a -log10(sgRNA_pvalue) colormapped scatter plot
z=matrix(1:20,nrow=1)
x=1
y=c(0,0.75,1.5,max(-log10(Data_Ordered$sgRNA_pvalue)))
y=append(y,seq(from=2.0, to=5, len=17),after=3)
redColRamp <- colorRampPalette(c('black','red'))
sgRNApvalue_Col<- redColRamp(20)[as.numeric(cut(-log10(Data_Ordered$sgRNA_pvalue), breaks = y))]
plotname<-paste(StudyName,'\n Unsorted sgRNA Rank vs Log Fold Change')
plot(Data_Ordered$Unsort_NormRank, (Data_Ordered$Sort_log2PC - Data_Ordered$Unsort_log2PC), col=sgRNApvalue_Col, xlab = "Unsorted sgRNA Rank", ylab="sgRNA LFC Sorted-vs-Unsorted", main=plotname, pch=20)
#plotting LFC against Sorted Rank for each sgRNA as a -log10(sgRNA_pvalue) colormapped scatter plot
z=matrix(1:20,nrow=1)
x=1
y=c(0,0.75,1.5,max(-log10(Data_Ordered$sgRNA_pvalue)))
y=append(y,seq(from=2.0, to=5, len=17),after=3)
redColRamp <- colorRampPalette(c('black','red'))
sgRNApvalue_Col<- redColRamp(20)[as.numeric(cut(-log10(Data_Ordered$sgRNA_pvalue), breaks = y))]
plotname<-paste(StudyName,'\n Sorted sgRNA Rank vs Log Fold Change')
plot(Data_Ordered$Sort_NormRank, (Data_Ordered$Sort_log2PC - Data_Ordered$Unsort_log2PC), col=sgRNApvalue_Col, xlab = "Sorted sgRNA Rank", ylab="sgRNA LFC Sorted-vs-Unsorted", main=plotname, pch=20)
#plotting image of colormap for sgRNA pvalues
y2<-c(0,0.75,1.5,seq(from=2.0, to=5, len=17),6)
image(x,y2, z,col=redColRamp(20), ylab="-log10(sgRNA_pval)")
#Plotting Each Gene's SLIDER Score Against the Mean Difference in Rank of its sgRNAs When Comparing Sorted to Unsorted
z=matrix(1:20,nrow=1)
x=1
y=c(0,0.75,1.5,(max(-log10(GeneScores$pvalue))+1))
y=append(y,seq(from = 2.0, to = (max(-log10(GeneScores$pvalue))-1),len=17),after=3)
redColRamp <- colorRampPalette(c('black','red'))
Gene_pvalue_Col<- redColRamp(20)[as.numeric(cut(-log10(GeneScores$pvalue), breaks = y))]
plotname<-paste(StudyName,'\n SLIDER_Tstat vs Average Change in Rank of sgRNA for Each Gene Colored by pvalue')
plot(GeneScores$meanDiffRank, GeneScores$SLIDER_score, xlim=c(-0.6,0.6), ylim=c(-70,70), col=Gene_pvalue_Col, xlab = "Mean Difference in Rank (Sorted-vs-Unsorted)", ylab="SLIDER Score", main=plotname, pch=20)
#plotting colormap of Gene pvalue
image(x,y, z,col=redColRamp(20), ylab = "-log(Gene_pval)")
#Plotting Each Gene's -log10(pvalue) Against the Mean Change in Rank of its sgRNA from Comparing Sorted to Unsorted (Rank Based Volcano Plot)
z=matrix(1:20,nrow=1)
x=1
y1=seq(from = -35, to = 35,len=18)
minval<-(min(GeneScores$SLIDER_score)-1)
y=append(minval,y1,after=length(minval))
y=append(y,(max(GeneScores$SLIDER_score)+1),after=length(y))
redColRamp <- colorRampPalette(c('blue','black','red'))
SLIDER_T_Col<- redColRamp(20)[as.numeric(cut(GeneScores$SLIDER_score, breaks = y))]
plotname<-paste(StudyName,'\n Volcano Plot for Genes Assayed (by Rank), Colored by SLIDER Score')
plot(GeneScores$meanDiffRank, -log10(GeneScores$pvalue), col=SLIDER_T_Col, xlab = "Mean Change in Rank (Sorted-vs-Unsorted)", ylab="-log10(pvalue)", main=plotname, pch=20, xlim=c(-0.75,0.75), ylim=c(0,(1+(-log10(min(GeneScores$pvalue))))))
#Plotting colormap of SLIDER stats
y2<-append((min(y1)-5),y1,after=length(min(y1)-3))
y2<-append(y2,max(y2+5),after=length(y))
image(x,y2,z,col=redColRamp(20), ylab = "SLIDER score")
#Plotting Tdist Over Histogram of SLIDER Scores
plotname<-paste(StudyName,'\n t-Distirbution Over Histogram of SLIDER Scores')
hist(GeneScores$SLIDER_score, breaks=2000, xlim=c(-10,10), col=rgb(0,1,0,0.4), main=plotname, xlab='Statistic', prob=T, ylim=c(0,0.7))
curve(dt(x, df=degfree), col='darkgreen', lwd=2, add=TRUE, yaxt='n', n=1001)
legend('topright', c('SLIDER Score', 'Students t-Dist'),col=c('green','darkgreen'), bty='y',lty=c(1,1,1), cex=0.75)
#Generating Dot Plots of SLIDER Scores Ranked from Largest to Smallest and an Image of the Matching Colormap. Making Two plots, one with and one without gene name labels.
plotname<-paste(StudyName,'\n Ranked SLIDER Scores')
RankedGeneScores<-GeneScores[order(-GeneScores$SLIDER_score),]
z=matrix(1:20,nrow=1)
x=1
y=c(0,0.75,1.5,(max(-log10(RankedGeneScores$pvalue))+1))
y=append(y,seq(from = 2.0, to = (max(-log10(RankedGeneScores$pvalue))-1),len=17),after=3)
redColRamp <- colorRampPalette(c('black','red'))
RankedGene_pvalue_Col<- redColRamp(20)[as.numeric(cut(-log10(RankedGeneScores$pvalue), breaks = y))]
plot(RankedGeneScores$SLIDER_score, col=RankedGene_pvalue_Col, xlab="Rank by SLIDER score", ylab="SLIDER Score", main=plotname, pch=20, xlim=c(0,(2000+length(RankedGeneScores$SLIDER_score))))
plot(RankedGeneScores$SLIDER_score, col=RankedGene_pvalue_Col, xlab="Rank by SLIDER score", ylab="SLIDER Score", main=plotname, pch=20, xlim=c(0,(2000+length(RankedGeneScores$SLIDER_score))))
text(Indices<-seq(from=1, to=length(RankedGeneScores$SLIDER_score), by = 1), RankedGeneScores$SLIDER_score, labels = RankedGeneScores$GeneName, cex=0.6, pos=4)
image(x,y,z,col=redColRamp(20), ylab = "Gene -log10(pvalue)")
dev.off()