-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
covid19.py
173 lines (145 loc) · 5.5 KB
/
covid19.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import streamlit as st
import pandas as pd
import altair as alt
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import pairwise2
"""
# Covid-19 Genome analysis
This is a simple analysis of SARS-COV-2 complete genome samples from Thailand and China. The data is downloaded from [NCBI](https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Protein&VirusLineage_ss=Severe%20acute%20respiratory%20syndrome%20coronavirus%202%20(SARS-CoV-2),%20taxid:2697049).
Tools
- BioPython
- Streamlit
"""
covid19th = SeqIO.read('./data/thailand/covid19-dna-complete.fasta', "fasta")
covid19ch = SeqIO.read('./data/china/covid19-dna-complete.fasta', "fasta")
covid_DNA_th = covid19th.seq
covid_DNA_ch = covid19ch.seq
dna_th_200 = covid_DNA_th[:200]
dna_ch_200 = covid_DNA_ch[:200]
checkbox1 = st.sidebar.checkbox('View full TH DNA sequence')
checkbox2 = st.sidebar.checkbox('View full CH DNA sequence')
if checkbox1:
dnaLength1 = covid_DNA_th
else:
dnaLength1 = dna_th_200
if checkbox2:
dnaLength2 = covid_DNA_ch
else:
dnaLength2 = dna_ch_200
'''
## DNA sequence
'''
sequence = st.text_area("SARS-CoV-2 Thailand DNA Seqeunce", dnaLength1, height=30)
sequence = st.text_area("SARS-CoV-2 China DNA Seqeunce", dnaLength2, height=30)
st.write(f'Thailand DNA Seqeunce consists of {len(covid19th)} genetic bases or letters.')
st.write(f'China DNA Seqeunce consists of {len(covid19ch)} genetic bases or letters.')
'''
## Nucleotides
### Nucleotides frequency
Count the nucleotides frequency in the DNA
'''
DNA_th = covid_DNA_th
DNA_ch = covid_DNA_ch
nucleotides_th={}
nucleotides_ch={}
# get nucleotides frequency
def getNucleotides(dna, nucleotides):
for n in dna:
if n in nucleotides:
nucleotides[n] += 1
else:
nucleotides[n] = 1
getNucleotides(DNA_th, nucleotides_th)
getNucleotides(DNA_ch, nucleotides_ch)
col1, col2 = st.beta_columns(2)
col1.write('Thailand')
df1 = pd.DataFrame.from_dict(nucleotides_th, orient='index')
df1 = df1.rename({0: 'Nucleotide Frequency'}, axis='columns')
df1.reset_index(inplace=True)
df1 = df1.rename(columns = {'index': 'Nucleotide'})
col1.dataframe(df1)
col2.write('China')
df2 = pd.DataFrame.from_dict(nucleotides_ch, orient='index')
df2 = df2.rename({0: 'Nucleotide Frequency'}, axis='columns')
df2.reset_index(inplace=True)
df2 = df2.rename(columns = {'index': 'Nucleotide'})
col2.dataframe(df2)
'''
### Nucleotide chart
'''
st.write('Thailand')
p1 = alt.Chart(df1).mark_bar().encode(
x=alt.X('Nucleotide', sort='ascending'),
y='Nucleotide Frequency',
tooltip=['Nucleotide','Nucleotide Frequency']
)
st.altair_chart(p1, use_container_width=True)
st.write('China')
p2 = alt.Chart(df2).mark_bar().encode(
x=alt.X('Nucleotide', sort='ascending'),
y='Nucleotide Frequency',
tooltip=['Nucleotide','Nucleotide Frequency']
)
st.altair_chart(p2, use_container_width=True)
'''
## mRNA
### Transcription
Basically the mRNA is a copy of our DNA.
However, in RNA, a base called uracil (U) replaces thymine (T) as the complementary
nucleotide to adenine (that's the only difference, T is replaced by U).
'''
covid_mRNA_th = covid_DNA_th.transcribe()
covid_mRNA_ch = covid_DNA_ch.transcribe()
sequence = st.text_area("Thailand mRNA Seqeunce", covid_mRNA_th[:100], height=25)
sequence = st.text_area("China mRNA Seqeunce", covid_mRNA_ch[:100], height=25)
'''
In order to see the difference, we align the covid-19 DNA and mRNA sequences.
We can see that the mRNA is an identical copy with the T base replaced by U.
In this example we show COVID-19 from Thailand.
'''
st.markdown(f'Covid-19 TH DNA: `{covid_DNA_th[:50]}`')
st.markdown(f'Covid-19 TH RNA: `{covid_mRNA_th[:50]}`')
'''
## Amino acids
### Translation
Translation is the process that takes the information passed from DNA as
messenger RNA and turns it into a series of amino acids.
'''
def pad_seq(sequence):
""" Pad sequence to multiple of 3 with N"""
remainder = len(sequence) % 3
return sequence if remainder == 0 else sequence + Seq('N' * (3 - remainder))
aa_th = pad_seq(covid_mRNA_th).translate()
aa_ch= pad_seq(covid_mRNA_ch).translate()
proteins_th = aa_th.split('*')
proteins_ch = aa_ch.split('*')
st.text_area("Thailand mRNA Translation", aa_th[:100], height=25)
st.text_area("China mRNA Translation", aa_ch[:100], height=25)
st.markdown(f"TH Covid-19's genome has {len(aa_th)} amino acids and {len(proteins_th)} amino acid chains")
st.markdown(f"CH Covid-19's genome has {len(aa_ch)} amino acids and {len(proteins_ch)} amino acid chains")
for i in proteins_th[:]:
if len(i) < 50:
proteins_th.remove(i)
for i in proteins_ch[:]:
if len(i) < 50:
proteins_ch.remove(i)
col1, col2 = st.beta_columns(2)
protein_clean_th = [str(i) for i in proteins_th]
df3 = pd.DataFrame({'amino_acids':protein_clean_th})
df3['count'] = df3['amino_acids'].str.len()
df3.head()
col1.write(df3)
col1.markdown(f'{len(proteins_th)} proteins with more than 50 amino acids from the Thailand covid-19 genome')
protein_clean_ch = [str(i) for i in proteins_ch]
d4 = pd.DataFrame({'amino_acids':protein_clean_ch})
d4['count'] = d4['amino_acids'].str.len()
d4.head()
col2.write(d4)
col2.markdown(f'{len(proteins_ch)} proteins with more than 50 amino acids from the China covid-19 genome')
# '''
# ## Comparing Thailand and China strain
# Using pairwise2 to compare sequence alignment in order to identify similarities.
# '''
# alignment_score = pairwise2.align.globalxx(covid_DNA_th, covid_DNA_ch, one_alignment_only=True, score_only=True)
# st.write('COV-TH/COV-CH Similarity (%):', alignment_score / len(covid_DNA_th) * 100)