-
Notifications
You must be signed in to change notification settings - Fork 0
/
tile.R
75 lines (62 loc) · 2.55 KB
/
tile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#' Generate all overlapping peptide tiles from longer sequences
#'
#' Slides a window of `tile_length` residues along each input sequence, one
#' residue at a time, and returns every resulting sub-sequence. Sequences
#' shorter than `tile_length` (and `NA` entries) are dropped before tiling.
#'
#' @param input_seqs named character vector or AAStringSet of ORF amino acid
#'   sequences.
#' @param tile_length number of amino acids to include per tile. Must be a
#'   single positive number.
#' @param parallel logical. indicates whether to use parallel processing via
#'   `furrr::future_map()`. If the current `future` plan is sequential, a
#'   temporary multisession plan using all but one of the available cores is
#'   installed for the duration of the call and the previous plan is restored
#'   on exit. A non-sequential plan already set by the caller is used as is.
#' @param collapse_list logical. if true (default), return a single
#'   AAStringSet with tiles named `orfname1__1`, `orfname1__2`, ...,
#'   `orfname2__1`, ... (ORF name, two underscores, amino acid start
#'   position). if false, return a list of character vectors: each list
#'   element is named after its ORF and each character vector element is
#'   named with its amino acid start position.
#'
#' @return An AAStringSet when `collapse_list` is true (empty if no input
#'   sequence is at least `tile_length` long); otherwise a named list of
#'   named character vectors.
#'
#' @export
tile <- function(input_seqs,
                 tile_length,
                 parallel = TRUE,
                 collapse_list = TRUE) {
  if (!is.numeric(tile_length) || length(tile_length) != 1L ||
        is.na(tile_length) || tile_length < 1) {
    stop("`tile_length` must be a single positive number.", call. = FALSE)
  }

  # Work on a plain named character vector regardless of the input container.
  if (inherits(input_seqs, "AAStringSet")) {
    seq_names <- names(input_seqs)
    input_seqs <- as.character(input_seqs)
    names(input_seqs) <- seq_names
  }
  if (!is.character(input_seqs)) {
    stop(
      "`input_seqs` must be a character vector or an AAStringSet.",
      call. = FALSE
    )
  }

  # Remove missing entries and ORFs shorter than the tile length; neither can
  # produce a tile.
  long_enough <- !is.na(input_seqs) & nchar(input_seqs) >= tile_length
  input_seqs <- input_seqs[long_enough]

  if (parallel) {
    # `future::plan()` called without a strategy only reports the current
    # plan, so a strategy has to be named for workers to be started. Only do
    # that when the caller has not configured a parallel plan already, and
    # put the previous plan back when this function exits.
    if (inherits(future::plan(), "sequential")) {
      # Leave one core free, but never request fewer than one worker.
      n_workers <- max(1L, as.integer(future::availableCores()) - 1L)
      previous_plan <- future::plan(
        future::multisession,
        workers = n_workers
      )
      on.exit(future::plan(previous_plan), add = TRUE)
    }
    tile_list <- furrr::future_map(
      input_seqs,
      .tile_sequence,
      tile_length = tile_length
    )
  } else {
    tile_list <- lapply(input_seqs, .tile_sequence, tile_length = tile_length)
  }

  if (!collapse_list) {
    return(tile_list)
  }

  # `unlist()` joins ORF name and start position as "<orf>.<start>".
  flat_tiles <- unlist(tile_list)
  if (is.null(flat_tiles)) {
    # Nothing survived the length filter: return an empty set instead of
    # failing while assigning names to NULL.
    return(Biostrings::AAStringSet())
  }
  # Rewrite the trailing ".<start>" as "__<start>".
  names(flat_tiles) <- gsub("\\.([0-9]+)$", "__\\1", names(flat_tiles))
  Biostrings::AAStringSet(flat_tiles)
}

#' Tile a single sequence
#'
#' Returns every length-`tile_length` substring of the single string `x`,
#' named by 1-based start position. Uses base R only so it can be shipped to
#' parallel workers without extra dependencies.
#'
#' @param x a single character string at least `tile_length` characters long.
#' @param tile_length number of characters per tile.
#' @return named character vector of tiles.
#' @noRd
.tile_sequence <- function(x, tile_length) {
  starts <- seq_len(nchar(x) - tile_length + 1L)
  tiles <- substring(x, starts, starts + tile_length - 1L)
  names(tiles) <- as.character(starts)
  tiles
}