-
Notifications
You must be signed in to change notification settings - Fork 0
/
hiv_vb_variant_annotate_amino_acid_substitutions.R
231 lines (199 loc) · 9.61 KB
/
hiv_vb_variant_annotate_amino_acid_substitutions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
# Author: Chris Wymant
# Acknowledgment: I wrote this while funded by ERC Advanced Grant PBDR-339251
# and a Li Ka Shing Foundation grant, both awarded to Christophe Fraser.
# This script collects codon-level annotations for the HXB2 HIV genome and
# transfers them to those positions where two other amino acid sequences (for
# example in the present study, the consensus of the VB variant sequences and
# the consensus of other Dutch subtype-B sequences in BEEHIVE) disagree with
# each other.
# Abbreviations:
# AA = amino acid
# df = dataframe
# seq = sequence
# aln = alignment
library(tidyverse)
library(seqinr)
# WARNING: the command below deletes all objects in memory in your current R
# session. Convenient when repeatedly re-running this script.
rm(list = ls())
# INPUT ------------------------------------------------------------------------
# Set the working directory to where this code lives on your machine
# setwd()
# The per-gene amino acid alignments with the VB variant consensus, the
# consensus of other Dutch subtype-B in BEEHIVE, and HXB2, provided as
# supplementary data with the associated paper.
files_in_aln <- Sys.glob("VB_vs_DutchB_*.fasta")
file_in_hxb2_annotated_codons <- "data_external/hxb2_annotated_SpecialSites_ByCodon.csv"
file_out_vb_differences <- "VB_vs_Dutch_B_subsitutions_annotated.csv"
# MAIN -------------------------------------------------------------------------
stopifnot(file.exists(file_in_hxb2_annotated_codons))
stopifnot(length(files_in_aln) > 0L)
# Label alns by their gene
genes <- str_match(files_in_aln, "VB_vs_DutchB_([a-z]{3}).fasta")[, 2]
names(files_in_aln) <- genes
# Read in the HXB2 annotation file. Check that each gene-codon is unique.
df_hxb2 <- read_csv(file_in_hxb2_annotated_codons, col_types = cols(
codon_in_that_gene = col_integer()
)) %>%
rename(codon = codon_in_that_gene)
stopifnot(!anyDuplicated(df_hxb2 %>% select(gene, codon)))
# Iterate through genes, reading in that alignment, recording codons which
# differ between the VB variant and the consensus of other Dutch subtype B seqs
# in BEEHIVE, and transferring HXB2 annotations to those positions. Merge the
# results from all genes into a single df.
df_differences <- map(genes, function(gene_) {
cat("Analysing alignment for", gene_, "\n")
# Read in the aln, convert seqs to upper case
aln <- read.fasta(files_in_aln[[gene_]], seqtype = "AA") %>%
map(toupper)
# Check the three expected seq names are present
name_hxb2 <- "HXB2"
name_dutch_b <- "DutchSubtypeB"
name_lineage <- "VB"
stopifnot(name_hxb2 %in% names(aln))
stopifnot(name_dutch_b %in% names(aln))
stopifnot(name_lineage %in% names(aln))
seq_hxb2 <- aln[[name_hxb2]]
seq_dutch_b <- aln[[name_dutch_b]]
seq_lineage <- aln[[name_lineage]]
# Check all seqs have the same length
length_aln <- length(seq_hxb2)
stopifnot(length_aln == length(seq_dutch_b))
stopifnot(length_aln == length(seq_lineage))
# Check the HXB2 seq in the aln is what we expected
seq_hxb2_ungapped <- seq_hxb2[seq_hxb2 != "-"]
seq_hxb2_expected <- df_hxb2 %>%
filter(gene == gene_) %>%
arrange(codon) %>%
pull(HXB2_amino_acid)
stopifnot(identical(seq_hxb2_ungapped,
seq_hxb2_expected))
# Get the HXB2 position of aln positions where the lineage and dutch B seqs
# disagree
aln_pos_differences <- integer()
hxb2_pos_differences <- integer()
hxb2_pos_current <- 0L
for (pos in 1:length_aln) {
if (seq_hxb2[[pos]] != "-") hxb2_pos_current <- hxb2_pos_current + 1L
if (seq_dutch_b[[pos]] == seq_lineage[[pos]] ||
seq_dutch_b[[pos]] == "X" ||
seq_lineage[[pos]] == "X") {
next
}
aln_pos_differences[[length(aln_pos_differences) + 1L]] <- pos
hxb2_pos_differences[[length(hxb2_pos_differences) + 1L]] <- hxb2_pos_current
}
# Record the AAs and positions of disagreements
df_gene <- tibble(gene = gene_,
gene_position_hxb2 = hxb2_pos_differences,
aa_hxb2 = seq_hxb2[aln_pos_differences],
aa_DutchB_inBEEHIVE = seq_dutch_b[aln_pos_differences],
aa_vb = seq_lineage[aln_pos_differences])
stopifnot(all(! df_gene$aa_DutchB_inBEEHIVE == df_gene$aa_vb))
# Merge in annotation details
df_gene <- left_join(df_gene,
df_hxb2 %>% filter(gene == gene_) %>% select(-gene),
by = c("gene_position_hxb2" = "codon"))
stopifnot(all(df_gene$aa_hxb2 == df_gene$HXB2_amino_acid |
df_gene$aa_hxb2 == "-"))
df_gene %>% select(-HXB2_amino_acid)
}) %>%
bind_rows() %>%
arrange(gene)
# Categorise differences either substitutions or indels
df_differences <- df_differences %>%
mutate(difference_type = if_else(aa_DutchB_inBEEHIVE != "-" & aa_vb != "-",
"substitution", "indel"))
table(df_differences$difference_type)
# Restrict the set of CTL escapes to amino acids that the VB variant has, i.e.
# not just any amino acid at a site where the variant has some mutation.
max_num_escapes_per_codon <- df_differences %>%
pull(CTL_escape_details_for_q_less_than_1percent) %>%
str_count(";") %>%
max(na.rm = TRUE) + 1
df_differences_specific_escapes <- df_differences %>%
select(gene, gene_position_hxb2, aa_vb, CTL_escape_details_for_q_less_than_1percent) %>%
separate(CTL_escape_details_for_q_less_than_1percent,
into = paste0("Escape_", 1:max_num_escapes_per_codon),
sep = "; ", fill = "right") %>%
pivot_longer(starts_with("Escape_"), values_to = "escape") %>%
filter(!is.na(escape)) %>%
select(-name) %>%
filter(substr(escape, 1, 1) == aa_vb)
# Separate the mutations into adaptive (the amino acid is enriched in the
# presence of the noted HLA type, indicated by a positive log odds for escape)
# vs nonadaptive (the amino acid is depleted in the presence of the noted HLA
# type, indicated by a negative log odds for escape).
df_differences_specific_escapes <- df_differences_specific_escapes %>%
mutate(adaptive =
str_detect(escape, "and escape log odds [0-9]*\\.{0,1}[0-9]+ for HLA")) %>%
group_by(gene, gene_position_hxb2, aa_vb, adaptive) %>%
summarise(escapes = paste(escape, collapse = "; "), .groups = "drop") %>%
mutate(escapes_adaptive = if_else( adaptive, escapes, NA_character_),
escapes_nonadaptive = if_else(!adaptive, escapes, NA_character_)) %>%
select(-c("adaptive", "escapes")) %>%
group_by(gene, gene_position_hxb2, aa_vb) %>%
summarise(escapes_adaptive = paste(na.omit(escapes_adaptive), collapse = ""),
escapes_nonadaptive = paste(na.omit(escapes_nonadaptive), collapse = ""),
.groups = "drop") %>%
mutate(escapes_adaptive = if_else(escapes_adaptive == "", NA_character_,
escapes_adaptive),
escapes_nonadaptive = if_else(escapes_nonadaptive == "", NA_character_,
escapes_nonadaptive))
# Merge the VB-specific CTL mutations back into the table with all mutations
nrow_df_differences_before_merge <- nrow(df_differences)
df_differences <- left_join(df_differences, df_differences_specific_escapes,
by = c("gene", "gene_position_hxb2", "aa_vb"))
if (nrow(df_differences) != nrow_df_differences_before_merge) {
stop(paste0("Problem merging VB-specific CTL escape info into the df with all info"))
}
# Write output
write_csv(df_differences %>% select(gene,
gene_position_hxb2,
HXB2_base_positions,
aa_hxb2,
aa_DutchB_inBEEHIVE,
aa_vb,
difference_type,
DRM_Los_Alamos,
DRM_IAS,
Other_features,
CTL_escape_details_for_q_less_than_1percent,
escapes_adaptive,
escapes_nonadaptive) %>%
replace_na(list(DRM_Los_Alamos = "",
DRM_IAS = "",
Other_features = "",
CTL_escape_details_for_q_less_than_1percent = "",
escapes_adaptive = "",
escapes_nonadaptive = "")) %>%
rename(CTL_escape_all = CTL_escape_details_for_q_less_than_1percent,
CTL_escape_vb_adaptive = escapes_adaptive,
CTL_escape_vb_nonadaptive = escapes_nonadaptive),
file_out_vb_differences)
# Print some numbers of mutations of interest
num_escapes <- df_differences %>%
mutate(escapes_either = (!is.na(escapes_adaptive)) | (!is.na(escapes_nonadaptive))) %>%
pull(escapes_either) %>%
sum()
num_escapes
num_escapes_both_directions <- df_differences %>%
mutate(escapes_both = (!is.na(escapes_adaptive)) & (!is.na(escapes_nonadaptive))) %>%
pull(escapes_both) %>%
sum()
num_escapes_adaptive <- sum(!is.na(df_differences$escapes_adaptive))
num_escapes_adaptive
num_escapes_nonadaptive <- sum(!is.na(df_differences$escapes_nonadaptive))
num_escapes_nonadaptive
num_mutations_at_ctl_site <- sum(!is.na(df_differences$CTL_escape_details_for_q_less_than_1percent))
num_mutations_at_ctl_site
# What are the HLA types for which the VB variant has adaptations?
escapes_adaptive_hlas <- df_differences$escapes_adaptive %>%
na.omit() %>%
str_split("; ") %>%
unlist()
escapes_adaptive_hlas <- str_match(escapes_adaptive_hlas, pattern = "for HLA (.+)")[,2]
escapes_adaptive_hlas
table_ <- table(escapes_adaptive_hlas)
tibble(hla = names(table_),
count = table_)