Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
pgarrett-scripps committed Mar 27, 2024
1 parent f75f967 commit 1e25583
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 120 deletions.
136 changes: 73 additions & 63 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,7 @@

import pandas as pd
import streamlit as st
from peptacular.sequence import strip_modifications, calculate_sequence_length
from peptacular.digest import identify_cleavage_sites
from peptacular.mass import valid_mass_sequence
from peptacular.constants import AMINO_ACIDS
from peptacular.spans import calculate_span_coverage
import peptacular as pt
import matplotlib as mpl
import regex as reg

Expand All @@ -16,10 +12,14 @@
from util import generate_peptide_df, coverage_string, create_colorbar, generate_app_url, fetch_sequence_from_uniprot, \
make_clickable

# TODO: Add rt gradient input
# TODO: Add CCS prediction since ook0 is relative to mass spec?

st.set_page_config(page_title="proteincleaver", page_icon=":knife:", layout="wide")

# Parse query parameters
params = st.query_params
protein_id = params.get('protein_id', '')
query_peptide_sequence = params.get('protein_sequence', DEFAULT_PROTEIN_SEQUENCE)
query_proteases = params.get('proteases', ';'.join(DEFAULT_PROTEASES)).split(',')
query_custom_regex = params.get('custom_regex', '')
Expand All @@ -36,17 +36,18 @@
query_min_mz = float(params.get('min_mz', DEFAULT_MIN_MZ))
query_max_mz = float(params.get('max_mz', DEFAULT_MAX_MZ))
query_remove_non_proteotypic = params.get('remove_non_proteotypic', 'False').lower() == 'true'
query_n_term_static_mod = float(params.get('n_term_static_mod', 0.0))
query_c_term_static_mod = float(params.get('c_term_static_mod', 0.0))
query_n_term_static_mod = params.get('n_term_static_mod', '')
query_c_term_static_mod = params.get('c_term_static_mod', '')
query_num_static_mods = int(params.get('num_static_mods', DEFAULT_STATIC_MODS))
query_n_term_var_mod = float(params.get('n_term_var_mod', 0.0))
query_c_term_var_mod = float(params.get('c_term_var_mod', 0.0))
query_n_term_var_mod = params.get('n_term_var_mod', '')
query_c_term_var_mod = params.get('c_term_var_mod', '')
query_max_var_mods = int(params.get('max_var_mods', DEFAULT_MAX_VAR_MODS))
query_num_variable_mods = int(params.get('num_variable_mods', DEFAULT_VAR_MODS))
query_static_mods_str = params.get('static_mods', 'C:57.02146')
query_static_mods = [(s.split(':')[0], float(s.split(':')[1])) for s in query_static_mods_str.split(';') if s]
query_static_mods_str = params.get('static_mods', 'C:Carbamidomethyl')
query_static_mods = [(s.split(':')[0], ''.join(s.split(':')[1:])) for s in query_static_mods_str.split(';') if s]
query_variable_mods_str = params.get('variable_mods', '')
query_variable_mods = [(s.split(':')[0], float(s.split(':')[1])) for s in query_variable_mods_str.split(';') if s]
query_variable_mods = [(s.split(':')[0], ''.join(s.split(':')[1:])) for s in query_variable_mods_str.split(';') if s]
query_mod_mode = params.get('mod_mode', 'skip')

# CSS to inject contained in a string
hide_table_row_index = """
Expand All @@ -64,6 +65,9 @@
with st.sidebar:
st.title('Protein Cleaver 🔪')

st.caption('This app digests a protein sequence into peptides using specified proteases and settings. Now ProForma2.0 compliant!')
st.caption('Made with [peptacular](https://pypi.org/project/peptacular/)')

c1, c2 = st.columns([3, 1])
protein_id = c1.text_input(label='Protein accession number / identifier',
value='',
Expand All @@ -86,7 +90,7 @@

raw_sequence = st.text_area(label="Protein sequence",
value=raw_sequence,
help='An amino acid sequence to digest',
help='An amino acid sequence to digest. Any modifications will be preserved.',
max_chars=MAX_PROTEIN_INPUT_LENGTH,
height=200,
key=st.session_state['protein_sequence_key'])
Expand All @@ -101,17 +105,22 @@
st.error('Please enter a protein sequence')
st.stop()

stripped_protein_sequence = strip_modifications(protein_sequence)
protein_length = calculate_sequence_length(protein_sequence)
st.caption(f'Length: {protein_length}')
stripped_protein_sequence = pt.strip_mods(protein_sequence)
protein_length = pt.sequence_length(protein_sequence)

c1, c2 = st.columns(2)
c1.caption(f'Length: {protein_length}')

if protein_length > MAX_PROTEIN_LEN:
st.error(f'Protein sequence is too long. Please keep it under {MAX_PROTEIN_LEN} residues.')
st.stop()

# check if all residues are valid
if not valid_mass_sequence(protein_sequence):
st.error('Invalid amino acid sequence. Please check your input.')
try:
protein_mass = pt.mass(protein_sequence)
c2.caption(f'Mass: {protein_mass:.2f} Da')

except Exception as e:
st.error(f'Error calculating protein mass: {e}')
st.stop()

c1, c2 = st.columns(2)
Expand All @@ -120,7 +129,7 @@
help='The proteases to use for digestion',
default=query_proteases)

custom_regex = c2.text_input(label='(Additional) Custom protease',
custom_regex = c2.text_input(label='Protease regex',
value=query_custom_regex,
help='A custom regular expression to use for digestion. Will be used along with '
'selected proteases. For example a regex expression for trypsin would look like: '
Expand Down Expand Up @@ -231,16 +240,18 @@
value=query_remove_non_proteotypic,
help='Remove peptides that are not proteotypic')

mod_mode = st.radio('Modification Mode', pt.MOD_MODE_VALUES, horizontal=True, index=pt.MOD_MODE_VALUES.index(query_mod_mode))

with st.expander('Static Modifications'):

c1, c2 = st.columns(2)
n_term_static_mod = c1.number_input(label='N-term mod',
value=query_n_term_static_mod,
help='Apply a static modification to the N-terminus')
n_term_static_mod = c1.text_input(label='N-term mod',
value=str(query_n_term_static_mod),
help='Apply a static modification to the N-terminus')

c_term_static_mod = c2.number_input(label='C-term mod',
value=query_c_term_static_mod,
help='Apply a static modification to the C-terminus')
c_term_static_mod = c2.text_input(label='C-term mod',
value=str(query_c_term_static_mod),
help='Apply a static modification to the C-terminus')

num_static_mods = st.number_input(label='Number of unique static modifications',
min_value=MIN_STATIC_MODS,
Expand All @@ -259,18 +270,16 @@ def add_static_modification(r):
mod = query_static_mods[r][1] if r < len(query_static_mods) else 0.0

with grid[0]:
st.multiselect(label='Amino acids',
st.multiselect(label='Modified AAs',
key=f'static_mod_residue{r}',
options=list(AMINO_ACIDS),
options=list(pt.AMINO_ACIDS),
help='Select amino acids for which to apply the static modification',
default=aas)
with grid[1]:
st.number_input(label='Modification Mass (Da)',
step=0.00001,
st.text_input(label='Modification',
key=f'static_mod_mass{r}',
help='The mass of the modification (in daltons)',
value=mod,
format='%.5f')
help='The modification',
value=mod)


# Loop to create rows of input widgets
Expand All @@ -279,19 +288,19 @@ def add_static_modification(r):

static_mods = {}
for r in range(num_static_mods):
mod = "{:.5f}".format(st.session_state[f'static_mod_mass{r}'])
mod = st.session_state[f'static_mod_mass{r}']
for residue in st.session_state[f'static_mod_residue{r}']:
static_mods[residue] = mod

with st.expander('Variable Modifications'):

c1, c2 = st.columns(2)
n_term_var_mod = c1.number_input(label='N-term var mod',
value=query_n_term_var_mod,
help='Apply a variable modification to the N-terminus')
c_term_var_mod = c2.number_input(label='C-term var mod',
value=query_c_term_var_mod,
help='Apply a variable modification to the C-terminus')
n_term_var_mod = c1.text_input(label='N-term var mod',
value=query_n_term_var_mod,
help='Apply a variable modification to the N-terminus')
c_term_var_mod = c2.text_input(label='C-term var mod',
value=query_c_term_var_mod,
help='Apply a variable modification to the C-terminus')

max_var_mods = st.number_input(label='Max var mods',
min_value=MIN_MAX_VAR_MODS,
Expand All @@ -310,22 +319,22 @@ def add_static_modification(r):
# columns to lay out the inputs
grid = st.columns([3, 2])


def add_variable_modification(r):
aas = list(query_variable_mods[r][0]) if r < len(query_variable_mods) else []
mod = query_variable_mods[r][1] if r < len(query_variable_mods) else 0.0

with grid[0]:
st.multiselect(label='Amino acids',
st.multiselect(label='Modified AAs',
key=f'var_mod_residue{r}',
options=list(AMINO_ACIDS),
options=list(pt.AMINO_ACIDS),
help='Select amino acids for which to apply the variable modification',
default=aas)
with grid[1]:
st.number_input(label='Modification Mass (Da)',
step=0.00001, key=f'var_mod_mass{r}',
help='The mass of the modification (in daltons)',
value=mod,
format='%.5f')
st.text_input(label='Modification',
key=f'var_mod_mass{r}',
help='Themodification',
value=mod)


# Loop to create rows of input widgets
Expand All @@ -334,22 +343,22 @@ def add_variable_modification(r):

var_mods = {}
for r in range(num_variable_mods):
mod = "{:.5f}".format(st.session_state[f'var_mod_mass{r}'])
mod = st.session_state[f'var_mod_mass{r}']
for residue in st.session_state[f'var_mod_residue{r}']:
var_mods[residue] = mod

url = generate_app_url(protein_id, protein_sequence, proteases_selected, custom_regex, missed_cleavages, mass_type,
min_peptide_len, max_peptide_len, min_mass, max_mass, semi_enzymatic, infer_charge,
min_charge, max_charge, min_mz, max_mz, remove_non_proteotypic, n_term_static_mod,
c_term_static_mod, num_static_mods, n_term_var_mod, c_term_var_mod, max_var_mods,
num_variable_mods, static_mods, var_mods)
num_variable_mods, static_mods, var_mods, mod_mode)

sites = set()
for enzyme_regex in enzyme_regexes:
sites.update(identify_cleavage_sites(protein_sequence, enzyme_regex))
sites.update(pt.get_cleavage_sites(protein_sequence, enzyme_regex))
sites = sorted(list(sites))

#with st.expander('Edit Sites'):
# with st.expander('Edit Sites'):
# sites = st.multiselect(label="Sites",
# options=list(range(len(stripped_protein_sequence)+1)),
# help='The proteases to use for digestion',
Expand All @@ -359,7 +368,7 @@ def add_variable_modification(r):
df = generate_peptide_df(protein_sequence, sites, missed_cleavages, min_peptide_len, max_peptide_len,
semi_enzymatic, static_mods, min_mass, max_mass, is_mono, infer_charge, min_charge,
max_charge, min_mz, max_mz, var_mods, max_var_mods, n_term_static_mod, c_term_static_mod,
n_term_var_mod, c_term_var_mod, remove_non_proteotypic)
n_term_var_mod, c_term_var_mod, remove_non_proteotypic, mod_mode)

# Start the HTML string for the site indexes
site_indexes_html = '<span style="font-family: Courier New, monospace; font-size: 16px;">'
Expand Down Expand Up @@ -387,7 +396,7 @@ def add_variable_modification(r):
cmap = mpl.colormaps.get_cmap(CMAP)

spans = [(s, e, mc) for s, e, mc in df[['Start', 'End', 'MC']].values]
protein_cov_arr = calculate_span_coverage(spans, protein_length, accumulate=True)
protein_cov_arr = pt.calculate_span_coverage(spans, protein_length, accumulate=True)
protein_coverage = coverage_string(protein_cov_arr, stripped_protein_sequence, cmap)

# calculate protein coverage at different MC
Expand All @@ -396,7 +405,7 @@ def add_variable_modification(r):
for mc in mcs:
df_mc = df[df['MC'] <= mc]
spans = [(s, e, mc) for s, e, mc in df_mc[['Start', 'End', 'MC']].values]
cov = calculate_span_coverage(spans, protein_length)
cov = pt.calculate_span_coverage(spans, protein_length)
protein_cov_at_mcs.append(sum(cov) / len(cov) * 100)

# calculate protein coverage at different peptide lengths
Expand All @@ -405,7 +414,7 @@ def add_variable_modification(r):
for l in lens:
df_len = df[df['Len'] <= l]
spans = [(s, e, mc) for s, e, mc in df_len[['Start', 'End', 'MC']].values]
cov = calculate_span_coverage(spans, protein_length)
cov = pt.calculate_span_coverage(spans, protein_length)
protein_cov_at_lens.append(sum(cov) / len(cov) * 100)

# calculate protein coverage at different peptide Mass
Expand All @@ -414,7 +423,7 @@ def add_variable_modification(r):
for m in masses:
df_mass = df[df['NeutralMass'] <= m]
spans = [(s, e, mc) for s, e, mc in df_mass[['Start', 'End', 'MC']].values]
cov = calculate_span_coverage(spans, protein_length)
cov = pt.calculate_span_coverage(spans, protein_length)
protein_cov_at_mass.append(sum(cov) / len(cov) * 100)

st.write(f'##### [Analysis URL]({url}) (copy me and send to your friends!)')
Expand Down Expand Up @@ -443,14 +452,13 @@ def add_variable_modification(r):
use_container_width=True
)


with t2:
st.header('Cleavage & Coverage')

c1, c2 = st.columns(2)
c1.metric('Cleavage Sites', len(sites))

protein_cov_arr_bin = calculate_span_coverage(spans, protein_length, accumulate=False)
protein_cov_arr_bin = pt.calculate_span_coverage(spans, protein_length, accumulate=False)
protein_cov_perc = round(sum(protein_cov_arr_bin) / len(protein_cov_arr_bin) * 100, 2)
c2.metric('Protein Coverage', f'{protein_cov_perc}%')

Expand Down Expand Up @@ -485,16 +493,21 @@ def add_variable_modification(r):
motif_regex = c1.text_input('Motifs Regex', '(K)')

st.cache_data()


def get_motif_sites(motif_regex, stripped_protein_sequence):
motif_sites = list(reg.finditer(motif_regex, stripped_protein_sequence, overlapped=True))
return motif_sites


if motif_regex:
motif_sites = get_motif_sites(motif_regex, stripped_protein_sequence)


def count_motifs(row):
return sum([1 for site in motif_sites if row['Start'] <= site.start() < row['End']])


df['Motifs'] = df.apply(count_motifs, axis=1)
motif_cov_indexes = {i for site in motif_sites for i in range(site.start(), site.end())}

Expand All @@ -514,7 +527,6 @@ def count_motifs(row):
else:
motif_cov_array[i] = min(row[2], motif_cov_array[i])


min_moitifs = c2.number_input('Min Motifs', min_value=0, max_value=max(df['Motifs']), value=0)
max_motifs = c3.number_input('Max Motifs', min_value=0, max_value=max(df['Motifs']), value=max(df['Motifs']))
df = df[(df['Motifs'] >= min_moitifs) & (df['Motifs'] <= max_motifs)]
Expand All @@ -535,7 +547,6 @@ def count_motifs(row):

counter = Counter(df['Motifs'])


st.subheader('Motif Site Coverage', help='The color corresponds to the peptide with the fewest number of motif '
'matches (excluding 0 matches). Example: Lets assume that the first '
'site is covered by two peptides, the first with one match and the '
Expand All @@ -548,9 +559,9 @@ def count_motifs(row):
for i, (k, v) in enumerate(sorted(counter.items())):
df_tmp = df[df['Motifs'] == k]
tmp_spans = [(s, e, mc) for s, e, mc in df_tmp[['Start', 'End', 'MC']].values]
cov = calculate_span_coverage(tmp_spans, protein_length, accumulate=True)
cov = pt.calculate_span_coverage(tmp_spans, protein_length, accumulate=True)

cov_bin = calculate_span_coverage(tmp_spans, protein_length, accumulate=False)
cov_bin = pt.calculate_span_coverage(tmp_spans, protein_length, accumulate=False)

c1, c2 = st.columns(2)
c1.metric(f'Protein Coverage with {k} motif matches', f'{round(sum(cov_bin) / len(cov_bin) * 100, 2)}%')
Expand Down Expand Up @@ -631,4 +642,3 @@ def get_model_file_as_byte_stream(path):

st.subheader('Example Code')
st.code(MODEL_CODE)

1 change: 1 addition & 0 deletions constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,4 @@ def get_env_str(var_name, default):
VALID_PROTEASES = {k.replace(' ', '_'): v for k, v in VALID_PROTEASES.items()}

BASE_URL = get_env_str('BASE_URL', 'http://localhost:8501')

6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
pandas==2.1.3
streamlit==1.30.0
peptacular==1.2.0
streamlit==1.32.2
requests==2.31.0
matplotlib==3.7.2
numpy==1.25.1
xgboost==2.0.2
xgboost==2.0.2
peptacular @ git+https://github.com/pgarrett-scripps/peptacular@main
Loading

0 comments on commit 1e25583

Please sign in to comment.