docker and streamlit 1.30 update

pgarrett-scripps · Jan 19, 2024 · 0603c13 · 0603c13
1 parent d1e80b0
commit 0603c13
Show file tree

Hide file tree

Showing 5 changed files with 112 additions and 68 deletions.
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
@@ -0,0 +1,13 @@
+name: Publish Docker
+on: [push]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v3
+    - name: Publish to Registry
+      uses: elgohr/Publish-Docker-Github-Action@v4
+      with:
+        name: pgarrettscripps/protein-cleaver-streamlit
+        username: ${{ secrets.DOCKER_HUB_USERNAME }}
+        password: ${{ secrets.DOCKER_HUB_TOKEN }}
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,11 @@
+FROM python:3.11.7
+
+WORKDIR /usr/src/app
+
+COPY requirements.txt ./
+
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . .
+
+CMD streamlit run home.py --server.port 8501
diff --git a/app.py b/app.py
@@ -1,6 +1,5 @@
 import uuid
 from collections import Counter
-import random
 
 import pandas as pd
 import streamlit as st
@@ -14,39 +13,39 @@
 
 from constants import *
 from wiki import *
-from util import make_clickable, generate_peptide_df, coverage_string, create_colorbar, generate_app_url, \
-    fetch_sequence_from_uniprot
+from util import generate_peptide_df, coverage_string, create_colorbar, generate_app_url, fetch_sequence_from_uniprot, \
+    make_clickable
 
 st.set_page_config(page_title="proteincleaver", page_icon=":knife:", layout="wide")
 
 # Parse query parameters
-params = st.experimental_get_query_params()
-query_peptide_sequence = params.get('protein_sequence', [DEFAULT_PROTEIN_SEQUENCE])[0]
-query_proteases = params.get('proteases', [';'.join(DEFAULT_PROTEASES)])[0].split(',')
-query_custom_regex = params.get('custom_regex', [''])[0]
-query_missed_cleavages = int(params.get('missed_cleavages', [DEFAULT_MISSED_CLEAVAGES])[0])
-query_mass_type = params.get('mass_type', [DEFAULT_MASS_TYPE])[0]
-query_min_peptide_len = int(params.get('min_peptide_len', [DEFAULT_MIN_PEPTIDE_LEN])[0])
-query_max_peptide_len = int(params.get('max_peptide_len', [DEFAULT_MAX_PEPTIDE_LEN])[0])
-query_min_mass = float(params.get('min_mass', [DEFAULT_MIN_PEPTIDE_MASS])[0])
-query_max_mass = float(params.get('max_mass', [DEFAULT_MAX_PEPTIDE_MASS])[0])
-query_semi_enzymatic = params.get('semi_enzymatic', ['False'])[0].lower() == 'true'
-query_infer_charge = params.get('infer_charge', ['True'])[0].lower() == 'true'
-query_min_charge = int(params.get('min_charge', [DEFAULT_MIN_CHARGE])[0])
-query_max_charge = int(params.get('max_charge', [DEFAULT_MAX_CHARGE])[0])
-query_min_mz = float(params.get('min_mz', [DEFAULT_MIN_MZ])[0])
-query_max_mz = float(params.get('max_mz', [DEFAULT_MAX_MZ])[0])
-query_remove_non_proteotypic = params.get('remove_non_proteotypic', ['False'])[0].lower() == 'true'
-query_n_term_static_mod = float(params.get('n_term_static_mod', [0.0])[0])
-query_c_term_static_mod = float(params.get('c_term_static_mod', [0.0])[0])
-query_num_static_mods = int(params.get('num_static_mods', [DEFAULT_STATIC_MODS])[0])
-query_n_term_var_mod = float(params.get('n_term_var_mod', [0.0])[0])
-query_c_term_var_mod = float(params.get('c_term_var_mod', [0.0])[0])
-query_max_var_mods = int(params.get('max_var_mods', [DEFAULT_MAX_VAR_MODS])[0])
-query_num_variable_mods = int(params.get('num_variable_mods', [DEFAULT_VAR_MODS])[0])
-query_static_mods_str = params.get('static_mods', ['C:57.02146'])[0]
+# Read the settings encoded in the page URL. Each value read from st.query_params is
+# handled as a single string, so numeric settings are cast and boolean settings are
+# compared against 'true'; the defaults apply when a key is absent.
+params = st.query_params
+query_peptide_sequence = params.get('protein_sequence', DEFAULT_PROTEIN_SEQUENCE)
+# NOTE(review): the default is joined with ';' but the value is split on ',', so a
+# multi-entry DEFAULT_PROTEASES stays one unsplit string — confirm which separator
+# the URL generator writes and use it in both places.
+query_proteases = params.get('proteases', ';'.join(DEFAULT_PROTEASES)).split(',')
+query_custom_regex = params.get('custom_regex', '')
+query_missed_cleavages = int(params.get('missed_cleavages', DEFAULT_MISSED_CLEAVAGES))
+query_mass_type = params.get('mass_type', DEFAULT_MASS_TYPE)
+query_min_peptide_len = int(params.get('min_peptide_len', DEFAULT_MIN_PEPTIDE_LEN))
+query_max_peptide_len = int(params.get('max_peptide_len', DEFAULT_MAX_PEPTIDE_LEN))
+query_min_mass = float(params.get('min_mass', DEFAULT_MIN_PEPTIDE_MASS))
+query_max_mass = float(params.get('max_mass', DEFAULT_MAX_PEPTIDE_MASS))
+query_semi_enzymatic = params.get('semi_enzymatic', 'False').lower() == 'true'
+# NOTE(review): the default used here was 'True' before this change — confirm that
+# switching it to 'False' is intentional.
+query_infer_charge = params.get('infer_charge', 'False').lower() == 'true'
+query_min_charge = int(params.get('min_charge', DEFAULT_MIN_CHARGE))
+query_max_charge = int(params.get('max_charge', DEFAULT_MAX_CHARGE))
+query_min_mz = float(params.get('min_mz', DEFAULT_MIN_MZ))
+query_max_mz = float(params.get('max_mz', DEFAULT_MAX_MZ))
+query_remove_non_proteotypic = params.get('remove_non_proteotypic', 'False').lower() == 'true'
+query_n_term_static_mod = float(params.get('n_term_static_mod', 0.0))
+query_c_term_static_mod = float(params.get('c_term_static_mod', 0.0))
+query_num_static_mods = int(params.get('num_static_mods', DEFAULT_STATIC_MODS))
+query_n_term_var_mod = float(params.get('n_term_var_mod', 0.0))
+query_c_term_var_mod = float(params.get('c_term_var_mod', 0.0))
+query_max_var_mods = int(params.get('max_var_mods', DEFAULT_MAX_VAR_MODS))
+query_num_variable_mods = int(params.get('num_variable_mods', DEFAULT_VAR_MODS))
+query_static_mods_str = params.get('static_mods', 'C:57.02146')
 query_static_mods = [(s.split(':')[0], float(s.split(':')[1])) for s in query_static_mods_str.split(';') if s]
-query_variable_mods_str = params.get('variable_mods', [''])[0]
+query_variable_mods_str = params.get('variable_mods', '')
 query_variable_mods = [(s.split(':')[0], float(s.split(':')[1])) for s in query_variable_mods_str.split(';') if s]
 
 # CSS to inject contained in a string
@@ -124,7 +123,8 @@
     custom_regex = c2.text_input(label='(Additional) Custom protease',
                                  value=query_custom_regex,
                                  help='A custom regular expression to use for digestion. Will be used along with '
-                                      'selected proteases')
+                                      'selected proteases. For example a regex expression for trypsin would look like: '
+                                      '([KR])')
 
     c1, c2 = st.columns(2)
     missed_cleavages = c1.number_input(label='Max missed cleavages',
@@ -424,23 +424,25 @@ def add_variable_modification(r):
 
 with t1:
     st.header('Digestion Metrics')
-    c1, c2, c3, c4 = st.columns(4)
+    c1, c2, c3 = st.columns(3)
     c1.metric('Total Peptides', len(df))
     c2.metric('Semi Peptides', len(df[df['Semi']]))
     c3.metric('Enzymatic Peptides', len(df[~df['Semi']]))
-    c4.metric('Unique Peptides', len(df['Sequence'].unique()))
 
     st.subheader('Peptides')
-    clickable = st.checkbox('Peptide Fragmenter Links', value=False)
 
-    if clickable:
-        df_clickable = df.copy(deep=True)
-        df_clickable['Sequence'] = [make_clickable(peptide, mass_type) for peptide in
-                                    df_clickable['Sequence']]
-        st.caption('Click on a sequence to see the fragment ions!')
-        st.write(df_clickable.to_html(escape=False), unsafe_allow_html=True, use_container_width=True)
-    else:
-        st.dataframe(df, use_container_width=True)
+    df['Link'] = [make_clickable(peptide, mass_type) for peptide in df['Sequence']]
+
+    st.dataframe(
+        df,
+        column_config={
+            "Link": st.column_config.LinkColumn(
+                display_text="View Ions"),
+        },
+        hide_index=True,
+    )
+
+
 
 with t2:
     st.header('Cleavage & Coverage')
@@ -479,10 +481,16 @@ def add_variable_modification(r):
 
 with t3:
     st.header('Motif Analysis')
-    motif_regex = st.text_input('Motifs Regex', '(K)')
+    c1, c2, c3 = st.columns(3)
+    motif_regex = c1.text_input('Motifs Regex', '(K)')
 
-    if motif_regex:
+    # NOTE(review): this is a bare call, not a decorator — its result is discarded, so
+    # get_motif_sites below is NOT cached. `@st.cache_data` was presumably intended;
+    # before switching, confirm the returned match objects can be stored by the cache.
+    st.cache_data()
+    def get_motif_sites(motif_regex, stripped_protein_sequence):
+        # Collect every match of motif_regex in the sequence, overlapping matches included.
         motif_sites = list(reg.finditer(motif_regex, stripped_protein_sequence, overlapped=True))
+        return motif_sites
+
+    if motif_regex:
+        motif_sites = get_motif_sites(motif_regex, stripped_protein_sequence)
 
         def count_motifs(row):
             return sum([1 for site in motif_sites if row['Start'] <= site.start() < row['End']])
@@ -506,18 +514,28 @@ def count_motifs(row):
                         else:
                             motif_cov_array[i] = min(row[2], motif_cov_array[i])
 
+
+        # Both inputs are bounded by the largest per-peptide motif count; default=0 keeps
+        # max() from raising ValueError when the peptide table is empty.
+        motif_count_cap = max(df['Motifs'], default=0)
+        min_motifs = c2.number_input('Min Motifs', min_value=0, max_value=motif_count_cap, value=0)
+        max_motifs = c3.number_input('Max Motifs', min_value=0, max_value=motif_count_cap, value=motif_count_cap)
+        # Keep only the peptides whose motif count lies inside the selected range.
+        df = df[(df['Motifs'] >= min_motifs) & (df['Motifs'] <= max_motifs)]
+
         st.subheader('Peptides')
-        clickable2 = st.checkbox('Peptide Fragmenter Links', value=False, key=1)
-        if clickable2:
-            df_clickable = df.copy(deep=True)
-            df_clickable['Sequence'] = [make_clickable(peptide, mass_type) for peptide in df_clickable['Sequence']]
-            st.caption('Click on a sequence to see the fragment ions!')
-            st.write(df_clickable.to_html(escape=False), unsafe_allow_html=True, use_container_width=True)
-        else:
-            st.dataframe(df, use_container_width=True)
+
+        # Make the Link column the last column in the dataframe
+        df = df[[c for c in df if c not in ['Link']] + ['Link']]
+
+        st.dataframe(
+            df,
+            column_config={
+                "Link": st.column_config.LinkColumn(
+                    display_text="View Ions"),
+            },
+            hide_index=True,
+        )
 
         counter = Counter(df['Motifs'])
 
+
         st.subheader('Motif Site Coverage', help='The color corresponds to the peptide with the fewest number of motif '
                                                  'matches (excluding 0 matches). Example: Lets assume that the first '
                                                  'site is covered by two peptides, the first with one match and the '

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 pandas==2.1.3
-streamlit==1.28.2
-peptacular==1.0.1
+streamlit==1.30.0
+peptacular==1.2.0
 requests==2.31.0
 matplotlib==3.7.2
 numpy==1.25.1

diff --git a/util.py b/util.py
@@ -12,7 +12,7 @@
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 import matplotlib as mpl
-from peptacular.term import add_n_term_modification, add_c_term_modification
+from peptacular.term.modification import add_n_term_modification, add_c_term_modification
 
 from constants import LINK
 
@@ -23,13 +23,6 @@ def fetch_sequence_from_uniprot(accession_number):
     return response
 
 
-def make_clickable(sequence, mass_type):
-    # target _blank to open new window
-    # extract clickable text to display for your link
-    link = LINK + f'?sequence={sequence}&mass_type={mass_type}'
-    return f'<a target="_blank" href="{link}">{sequence}</a>'
-
-
 def generate_peptide_df(sequence: str, cleavage_sites: List, missed_cleavages: int, min_len: int,
                         max_len: int, semi_enzymatic: bool, static_mods: dict, min_mass: float, max_mass: float,
                         is_mono: bool, infer_charge: bool, min_charge: int, max_charge: int, min_mz: float,
@@ -60,7 +53,7 @@ def generate_peptide_df(sequence: str, cleavage_sites: List, missed_cleavages: i
     df = df[(df['Len'] >= min_len) & (df['Len'] <= max_len)]
 
     # Apply variable modifications to each sequence in the DataFrame
-    def apply_var_mods(sequence):
+    def apply_var_mods(sequence: str) -> str:
 
         var_seqs = apply_variable_modifications(sequence, var_mods, max_var_mods)
 
@@ -92,15 +85,17 @@ def apply_var_mods(sequence):
     # expand the sequence column into multiple rows (sequences are separated by ';')
     df = df.assign(Sequence=df.Sequence.str.split(';')).explode('Sequence')
 
-    def apply_static_mods(sequence):
-        # Update var_mods dictionary based on conditions
+    def apply_static_mods(sequence: str) -> str:
+
+        sequence = apply_static_modifications(sequence, static_mods)
+
         if n_term_static_mod:
-            var_mods.update({-1: n_term_static_mod})
+            sequence = add_n_term_modification(sequence, n_term_static_mod)
+
         if c_term_static_mod:
-            var_mods.update({calculate_sequence_length(sequence): c_term_static_mod})
+            sequence = add_c_term_modification(sequence, c_term_static_mod)
 
-        # Apply variable modifications and join them with ';'
-        return apply_static_modifications(sequence, static_mods)
+        return sequence
 
     df['Sequence'] = df['Sequence'].apply(apply_static_mods)
 
@@ -246,3 +241,10 @@ def generate_app_url(protein_id, protein_sequence, proteases, custom_regex, miss
     }
     query_string = '&'.join([f'{key}={value}' for key, value in params.items() if value is not None])
     return f'{base_url}?{query_string}'
+
+
+def make_clickable(sequence, mass_type):
+    """Return the URL that opens `sequence` in the app located at LINK.
+
+    Args:
+        sequence: Peptide sequence placed in the `sequence` query parameter.
+        mass_type: Value placed in the `mass_type` query parameter.
+
+    Returns:
+        The link as a plain URL string (no HTML markup).
+    """
+    # Imported locally so the function does not depend on the module's import block.
+    from urllib.parse import urlencode
+
+    # Percent-encode the values so characters such as '&', '#', '+' or spaces
+    # cannot truncate or corrupt the query string.
+    query = urlencode({'sequence': sequence, 'mass_type': mass_type})
+    return LINK + '?' + query