-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path04_calculate_regions.R
264 lines (209 loc) · 12.1 KB
/
04_calculate_regions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#### PREPARATION STEPS ####
## You can run this code as it is to process a small subset of proteins, or you can follow these next steps to analyze the entire dataset.
## 1. Make sure you have run all previous codes
## 2. Set the "testing" variable in the config below to FALSE, or run this code with the "-test F" argument
## 3. If you are running this code in Rstudio, set the "main_folder" variable in the config below to the folder containing this code
#### CONFIG ####
main_folder <- "." #When running in Rstudio, set this to the absolute path of the folder containing this code
testing <- TRUE #set this to FALSE when running the actual data
#### READ ARGUMENTS AND GET PATH (DO NOT CHANGE) ####
args <- commandArgs(TRUE)
if (length(args == 2)) {
if (args[1] == "-test") {
testing <- as.logical(args[2])
}
}
if (testing == TRUE) {
#For testing
project_folder <- sprintf("%s/test_data", main_folder)
extra_files_folder <- sprintf("%s/77_extra_files", main_folder)
} else {
#For running the actual data
project_folder <- sprintf("%s/chagastope_data", main_folder)
extra_files_folder <- sprintf("%s/77_extra_files", main_folder)
}
#### INTERNAL CONFIG (DO NOT CHANGE) ####
library(data.table)
chronic_peaks_data_file <- sprintf("%s/outputs/03_pools_antigenic_peaks/pools_peaks_cutoff4SD_2pep.tsv", project_folder)
sylvio_fasta_data_file <- sprintf("%s/sylvio-x10-tritrypdb-5.Dec.2016-no-pseudo_SIMPLIFIED_TABBED.tsv", extra_files_folder)
brener_fasta_data_file <- sprintf("%s/tcruzi-clbrener-tritrypdb-5.Dec.2016-no-pseudo_SIMPLIFIED_TABBED.tsv", extra_files_folder)
chronic_peaks_tag <- "analyzed_proteome_cutoff_4_SD_2_pep" #tag for the final file
chronic_peaks_types <- c("PO")
chronic_peaks_max_other_type_signal_ratio <- 0.2 #-1 means keep all
chronic_peaks_peptide_length <- 16
chronic_peaks_peptide_offset <- 1 #using offset 1 to get all the sequence
#min_peptide_length is the minimum peptide length to return, meaning that if a protein has a sequence of
#less than peptide_length, but more or equal than min_peptide_length, then it will be returned "full"
#(a shorter peptide)
chronic_peaks_min_peptide_length <- 12
chronic_peaks_border_peptide_amount <- 16
#Outputs
detailed_region_output_file <- sprintf("%s/outputs/04_antigenic_regions/peptide_region_data.tsv", project_folder)
summary_region_output_file <- sprintf("%s/outputs/04_antigenic_regions/summary_region_data.tsv", project_folder)
#### AUXILIAR FUNCTIONS ####
calculateRegions <- function(peaks_data,
fasta_data,
peaks_types,
peaks_max_other_type_signal_ratio,
peaks_peptide_length,
peaks_peptide_offset,
peaks_min_peptide_length,
peaks_border_peptide_amount,
peaks_tag) {
### Filter the peaks
#Keep only the peaks with 1 source (because the rest are combinations of those)
peaks_data <- peaks_data[!(grepl(",", source))]
#Keep only allowed types
peaks_data <- peaks_data[type %in% peaks_types]
#Keep only peaks with an acceptable level of other type signal
if (peaks_max_other_type_signal_ratio != -1) {
peaks_data <- peaks_data[other_type_best_ratio_original_peak_ratio <= peaks_max_other_type_signal_ratio]
}
#Remove extra info from the peaks and simplify the sources
peaks_data <- peaks_data[, .(protein, peak_start, peak_last_start)]
peaks_data <- unique(peaks_data)
### Parse data
#Filter the FASTA to keep only data from my peaks
sub_fasta_data <- fasta_data[protein %in% unique(peaks_data$protein)]
#Get all the peptides of all the proteins with at least one peak (they'll be filtered later on)
peptide_data <- splitFastaIntoPeptides(fasta_data = sub_fasta_data,
peptide_length = peaks_peptide_length,
peptide_offset = peaks_peptide_offset,
min_peptide_length = peaks_min_peptide_length)
#Add peak data information to the peptides and keep only the peptides in the peaks and borders
peaks_border_start_amount <- peaks_border_peptide_amount * peaks_peptide_offset
peptide_data <- merge(peptide_data,
peaks_data,
by = "protein",
all.x = T,
allow.cartesian = T)
peptide_data <- peptide_data[(start >= (peak_start - peaks_border_start_amount)) &
(start <= (peak_last_start + peaks_border_start_amount))]
#Add border info
peptide_data$is_border <- 1
peptide_data[(start >= peak_start) & (start <= peak_last_start), is_border := 0]
#Remove the peak information and simplify repeated peptides
#In this case if I have a peptide that is both border and not border it will be recorder as not border
peptide_data <- peptide_data[, .(is_border = as.numeric(all(as.logical(is_border)))), by = .(protein, start, peptide)]
# peptide_data <- unique(peptide_data[, .(protein, start, peptide, is_border)])
peptide_data <- peptide_data[, .(protein, start, peptide, is_border)]
#Set the tag to the chronic peaks
peptide_data$origin <- peaks_tag
peptide_data
}
splitFastaIntoPeptides <- function(fasta_data, peptide_length, peptide_offset, min_peptide_length) {
#min_peptide_length is the minimum peptide length to return, meaning that if a protein has a sequence of
#less than peptide_length, but more or equal than min_peptide_length, then it will be returned "full"
#(a shorter peptide)
#Add the length of the sequences
fasta_data[, seq_len := nchar(sequence)]
#Calculate the amount of peptides per protein based on the length and offset
fasta_data[, pep_amount := ceiling((seq_len - peptide_length + 1) / peptide_offset)]
#Save the shorter proteins to process later on, and remove them from the data
shorter_proteins <- fasta_data[(seq_len >= min_peptide_length) & (seq_len < peptide_length)]
fasta_data <- fasta_data[seq_len >= peptide_length]
if (fasta_data[, .N] > 0) {
#Replicate the data. Each sequence will be trimmed to one peptide
peptide_data <- data.table(protein = rep(fasta_data$protein, fasta_data$pep_amount),
sequence = rep(fasta_data$sequence, fasta_data$pep_amount))
#Calculate the start and end of each peptide
peptide_data[, start := .SD[,.I], by = protein]
#Since I'm getting only the peptides I'll use from the start, I don't have to filter out peptides
#but I have to fix the starts
peptide_data[, start := ((start - 1) * peptide_offset) + 1]
#Calculate the end
peptide_data[, end := start + peptide_length - 1]
#Trim the sequences into peptides
peptide_data[, peptide := substring(sequence, start, end)]
#Remove extra columns
peptide_data <- peptide_data[, -c("sequence")]
} else {
peptide_data <- data.table(protein = character(),
start = numeric(),
end = numeric(),
peptide = character())
}
#If necessary, add the shorter sequences
if (shorter_proteins[, .N] > 0) {
shorter_proteins$start <- 1
shorter_proteins[, end := start + nchar(sequence) - 1]
shorter_proteins <- shorter_proteins[, .(protein, start, end, peptide = sequence)]
peptide_data <- rbindlist(list(peptide_data, shorter_proteins))
}
peptide_data
}
#### CALCULATE REGIONS ####
## Load data
chronic_peaks_data <- fread(chronic_peaks_data_file, header = TRUE, sep = "\t", na.strings = NULL)
sylvio_fasta_data <- fread(sylvio_fasta_data_file, header = TRUE, sep = "\t", na.strings = NULL)
brener_fasta_data <- fread(brener_fasta_data_file, header = TRUE, sep = "\t", na.strings = NULL)
## Combine both fasta data
chronic_all_fasta <- rbindlist(list(sylvio_fasta_data, brener_fasta_data))
## Calculate regions
region_data <- calculateRegions(peaks_data = chronic_peaks_data,
fasta_data = chronic_all_fasta,
peaks_types = chronic_peaks_types,
peaks_max_other_type_signal_ratio = chronic_peaks_max_other_type_signal_ratio,
peaks_peptide_length = chronic_peaks_peptide_length,
peaks_peptide_offset = chronic_peaks_peptide_offset,
peaks_min_peptide_length = chronic_peaks_min_peptide_length,
peaks_border_peptide_amount = chronic_peaks_border_peptide_amount,
peaks_tag = chronic_peaks_tag)
## Calculate the region numbers
region_data <- region_data[order(protein, start)]
region_data[, last_protein := c("", protein[1:.N-1])]
region_data[, last_start := c(-1, start[1:.N-1])]
region_data$same_region <- 1
region_data[(protein != last_protein) | (start > last_start + chronic_peaks_peptide_offset), same_region := 0]
region_starts <- which(region_data$same_region == 0)
region_aux <- c()
for (i in 1:length(region_starts)) {
if (i == 1) {
start_aux <- region_starts[i]
} else {
end_aux <- region_starts[i] - 1
region_aux <- c(region_aux, rep(i - 1, times = end_aux - start_aux + 1))
start_aux <- region_starts[i]
}
}
end_aux <- region_data[, .N]
region_aux <- c(region_aux, rep(i, times = end_aux - start_aux + 1))
region_data$region <- region_aux
region_data <- region_data[, .(protein, start, region, is_border, peptide, origin)]
## Calculate the region sequence
region_data[, sequence_aux := substring(peptide, 16, 16)]
region_data[, relative_pos_aux := .SD[, .I], by = region]
region_data[relative_pos_aux == 1, sequence_aux := peptide]
region_summary <- region_data[, .(protein = .SD[1]$protein,
start = min(start),
last_start = max(start),
sequence = paste(sequence_aux, collapse = "")), by = region]
region_summary[, aa_amount := last_start - start + chronic_peaks_peptide_length]
## Calculate the region sequence without OUTSIDE borders
#Calculate the min and max start for the borders in each protein
border_info_aux <- region_data[is_border == 0, .(no_border_start = min(start),
no_border_last_start = max(start)), by = region]
#Add the data
no_border_region_data <- merge(region_data,
border_info_aux,
by = "region")
#Keep only data without the outside borders
no_border_region_data <- no_border_region_data[(start >= no_border_start) & (start <= no_border_last_start)]
#Calculate the rest
no_border_region_data[, sequence_aux := substring(peptide, 16, 16)]
no_border_region_data[, relative_pos_aux := .SD[, .I], by = region]
no_border_region_data[relative_pos_aux == 1, sequence_aux := peptide]
no_border_region_summary <- no_border_region_data[, .(protein = .SD[1]$protein,
no_border_start = min(start),
no_border_last_start = max(start),
no_border_sequence = paste(sequence_aux, collapse = "")), by = region]
no_border_region_summary[, no_border_aa_amount := no_border_last_start - no_border_start + chronic_peaks_peptide_length]
#Add to global summary
region_summary <- merge(region_summary,
no_border_region_summary,
by = c("region", "protein"))
#Remove extra cols
region_data <- region_data[, .(protein, start, region, is_border, peptide)]
## Save output
write.table(region_data, file = detailed_region_output_file, col.names = T, row.names = F, sep = "\t", quote = T)
write.table(region_summary, file = summary_region_output_file, col.names = T, row.names = F, sep = "\t", quote = T)