diff --git a/app.py b/app.py index 56bdbc8..fcf0f40 100644 --- a/app.py +++ b/app.py @@ -3,11 +3,7 @@ import pandas as pd import streamlit as st -from peptacular.sequence import strip_modifications, calculate_sequence_length -from peptacular.digest import identify_cleavage_sites -from peptacular.mass import valid_mass_sequence -from peptacular.constants import AMINO_ACIDS -from peptacular.spans import calculate_span_coverage +import peptacular as pt import matplotlib as mpl import regex as reg @@ -16,10 +12,14 @@ from util import generate_peptide_df, coverage_string, create_colorbar, generate_app_url, fetch_sequence_from_uniprot, \ make_clickable +# TODO: Add rt gradient input +# TODO: Add CCS prediction since ook0 is relative to mass spec? + st.set_page_config(page_title="proteincleaver", page_icon=":knife:", layout="wide") # Parse query parameters params = st.query_params +protein_id = params.get('protein_id', '') query_peptide_sequence = params.get('protein_sequence', DEFAULT_PROTEIN_SEQUENCE) query_proteases = params.get('proteases', ';'.join(DEFAULT_PROTEASES)).split(',') query_custom_regex = params.get('custom_regex', '') @@ -36,17 +36,18 @@ query_min_mz = float(params.get('min_mz', DEFAULT_MIN_MZ)) query_max_mz = float(params.get('max_mz', DEFAULT_MAX_MZ)) query_remove_non_proteotypic = params.get('remove_non_proteotypic', 'False').lower() == 'true' -query_n_term_static_mod = float(params.get('n_term_static_mod', 0.0)) -query_c_term_static_mod = float(params.get('c_term_static_mod', 0.0)) +query_n_term_static_mod = params.get('n_term_static_mod', '') +query_c_term_static_mod = params.get('c_term_static_mod', '') query_num_static_mods = int(params.get('num_static_mods', DEFAULT_STATIC_MODS)) -query_n_term_var_mod = float(params.get('n_term_var_mod', 0.0)) -query_c_term_var_mod = float(params.get('c_term_var_mod', 0.0)) +query_n_term_var_mod = params.get('n_term_var_mod', '') +query_c_term_var_mod = params.get('c_term_var_mod', '') query_max_var_mods = 
int(params.get('max_var_mods', DEFAULT_MAX_VAR_MODS)) query_num_variable_mods = int(params.get('num_variable_mods', DEFAULT_VAR_MODS)) -query_static_mods_str = params.get('static_mods', 'C:57.02146') -query_static_mods = [(s.split(':')[0], float(s.split(':')[1])) for s in query_static_mods_str.split(';') if s] +query_static_mods_str = params.get('static_mods', 'C:Carbamidomethyl') +query_static_mods = [(s.split(':')[0], ''.join(s.split(':')[1:])) for s in query_static_mods_str.split(';') if s] query_variable_mods_str = params.get('variable_mods', '') -query_variable_mods = [(s.split(':')[0], float(s.split(':')[1])) for s in query_variable_mods_str.split(';') if s] +query_variable_mods = [(s.split(':')[0], ''.join(s.split(':')[1:])) for s in query_variable_mods_str.split(';') if s] +query_mod_mode = params.get('mod_mode', 'skip') # CSS to inject contained in a string hide_table_row_index = """ @@ -64,6 +65,9 @@ with st.sidebar: st.title('Protein Cleaver 🔪') + st.caption('This app digests a protein sequence into peptides using specified proteases and settings. Now ProForma2.0 compliant!') + st.caption('Made with [peptacular](https://pypi.org/project/peptacular/)') + c1, c2 = st.columns([3, 1]) protein_id = c1.text_input(label='Protein accession number / identifier', value='', @@ -86,7 +90,7 @@ raw_sequence = st.text_area(label="Protein sequence", value=raw_sequence, - help='An amino acid sequence to digest', + help='An amino acid sequence to digest. 
Any modifications will be preserved.', max_chars=MAX_PROTEIN_INPUT_LENGTH, height=200, key=st.session_state['protein_sequence_key']) @@ -101,17 +105,22 @@ st.error('Please enter a protein sequence') st.stop() - stripped_protein_sequence = strip_modifications(protein_sequence) - protein_length = calculate_sequence_length(protein_sequence) - st.caption(f'Length: {protein_length}') + stripped_protein_sequence = pt.strip_mods(protein_sequence) + protein_length = pt.sequence_length(protein_sequence) + + c1, c2 = st.columns(2) + c1.caption(f'Length: {protein_length}') if protein_length > MAX_PROTEIN_LEN: st.error(f'Protein sequence is too long. Please keep it under {MAX_PROTEIN_LEN} residues.') st.stop() - # check if all residues are valid - if not valid_mass_sequence(protein_sequence): - st.error('Invalid amino acid sequence. Please check your input.') + try: + protein_mass = pt.mass(protein_sequence) + c2.caption(f'Mass: {protein_mass:.2f} Da') + + except Exception as e: + st.error(f'Error calculating protein mass: {e}') st.stop() c1, c2 = st.columns(2) @@ -120,7 +129,7 @@ help='The proteases to use for digestion', default=query_proteases) - custom_regex = c2.text_input(label='(Additional) Custom protease', + custom_regex = c2.text_input(label='Protease regex', value=query_custom_regex, help='A custom regular expression to use for digestion. Will be used along with ' 'selected proteases. 
For example a regex expression for trypsin would look like: ' @@ -231,16 +240,18 @@ value=query_remove_non_proteotypic, help='Remove peptides that are not proteotypic') + mod_mode = st.radio('Modification Mode', pt.MOD_MODE_VALUES, horizontal=True, index=pt.MOD_MODE_VALUES.index(query_mod_mode)) + with st.expander('Static Modifications'): c1, c2 = st.columns(2) - n_term_static_mod = c1.number_input(label='N-term mod', - value=query_n_term_static_mod, - help='Apply a static modification to the N-terminus') + n_term_static_mod = c1.text_input(label='N-term mod', + value=str(query_n_term_static_mod), + help='Apply a static modification to the N-terminus') - c_term_static_mod = c2.number_input(label='C-term mod', - value=query_c_term_static_mod, - help='Apply a static modification to the C-terminus') + c_term_static_mod = c2.text_input(label='C-term mod', + value=str(query_c_term_static_mod), + help='Apply a static modification to the C-terminus') num_static_mods = st.number_input(label='Number of unique static modifications', min_value=MIN_STATIC_MODS, @@ -259,18 +270,16 @@ def add_static_modification(r): mod = query_static_mods[r][1] if r < len(query_static_mods) else 0.0 with grid[0]: - st.multiselect(label='Amino acids', + st.multiselect(label='Modified AAs', key=f'static_mod_residue{r}', - options=list(AMINO_ACIDS), + options=list(pt.AMINO_ACIDS), help='Select amino acids for which to apply the static modification', default=aas) with grid[1]: - st.number_input(label='Modification Mass (Da)', - step=0.00001, + st.text_input(label='Modification', key=f'static_mod_mass{r}', - help='The mass of the modification (in daltons)', - value=mod, - format='%.5f') + help='The modification', + value=mod) # Loop to create rows of input widgets @@ -279,19 +288,19 @@ def add_static_modification(r): static_mods = {} for r in range(num_static_mods): - mod = "{:.5f}".format(st.session_state[f'static_mod_mass{r}']) + mod = st.session_state[f'static_mod_mass{r}'] for residue in 
st.session_state[f'static_mod_residue{r}']: static_mods[residue] = mod with st.expander('Variable Modifications'): c1, c2 = st.columns(2) - n_term_var_mod = c1.number_input(label='N-term var mod', - value=query_n_term_var_mod, - help='Apply a variable modification to the N-terminus') - c_term_var_mod = c2.number_input(label='C-term var mod', - value=query_c_term_var_mod, - help='Apply a variable modification to the C-terminus') + n_term_var_mod = c1.text_input(label='N-term var mod', + value=query_n_term_var_mod, + help='Apply a variable modification to the N-terminus') + c_term_var_mod = c2.text_input(label='C-term var mod', + value=query_c_term_var_mod, + help='Apply a variable modification to the C-terminus') max_var_mods = st.number_input(label='Max var mods', min_value=MIN_MAX_VAR_MODS, @@ -310,22 +319,22 @@ def add_static_modification(r): # columns to lay out the inputs grid = st.columns([3, 2]) + def add_variable_modification(r): aas = list(query_variable_mods[r][0]) if r < len(query_variable_mods) else [] mod = query_variable_mods[r][1] if r < len(query_variable_mods) else 0.0 with grid[0]: - st.multiselect(label='Amino acids', + st.multiselect(label='Modified AAs', key=f'var_mod_residue{r}', - options=list(AMINO_ACIDS), + options=list(pt.AMINO_ACIDS), help='Select amino acids for which to apply the variable modification', default=aas) with grid[1]: - st.number_input(label='Modification Mass (Da)', - step=0.00001, key=f'var_mod_mass{r}', - help='The mass of the modification (in daltons)', - value=mod, - format='%.5f') + st.text_input(label='Modification', + key=f'var_mod_mass{r}', + help='Themodification', + value=mod) # Loop to create rows of input widgets @@ -334,7 +343,7 @@ def add_variable_modification(r): var_mods = {} for r in range(num_variable_mods): - mod = "{:.5f}".format(st.session_state[f'var_mod_mass{r}']) + mod = st.session_state[f'var_mod_mass{r}'] for residue in st.session_state[f'var_mod_residue{r}']: var_mods[residue] = mod @@ -342,14 
+351,14 @@ def add_variable_modification(r): min_peptide_len, max_peptide_len, min_mass, max_mass, semi_enzymatic, infer_charge, min_charge, max_charge, min_mz, max_mz, remove_non_proteotypic, n_term_static_mod, c_term_static_mod, num_static_mods, n_term_var_mod, c_term_var_mod, max_var_mods, - num_variable_mods, static_mods, var_mods) + num_variable_mods, static_mods, var_mods, mod_mode) sites = set() for enzyme_regex in enzyme_regexes: - sites.update(identify_cleavage_sites(protein_sequence, enzyme_regex)) + sites.update(pt.get_cleavage_sites(protein_sequence, enzyme_regex)) sites = sorted(list(sites)) -#with st.expander('Edit Sites'): +# with st.expander('Edit Sites'): # sites = st.multiselect(label="Sites", # options=list(range(len(stripped_protein_sequence)+1)), # help='The proteases to use for digestion', @@ -359,7 +368,7 @@ def add_variable_modification(r): df = generate_peptide_df(protein_sequence, sites, missed_cleavages, min_peptide_len, max_peptide_len, semi_enzymatic, static_mods, min_mass, max_mass, is_mono, infer_charge, min_charge, max_charge, min_mz, max_mz, var_mods, max_var_mods, n_term_static_mod, c_term_static_mod, - n_term_var_mod, c_term_var_mod, remove_non_proteotypic) + n_term_var_mod, c_term_var_mod, remove_non_proteotypic, mod_mode) # Start the HTML string for the site indexes site_indexes_html = '' @@ -387,7 +396,7 @@ def add_variable_modification(r): cmap = mpl.colormaps.get_cmap(CMAP) spans = [(s, e, mc) for s, e, mc in df[['Start', 'End', 'MC']].values] -protein_cov_arr = calculate_span_coverage(spans, protein_length, accumulate=True) +protein_cov_arr = pt.calculate_span_coverage(spans, protein_length, accumulate=True) protein_coverage = coverage_string(protein_cov_arr, stripped_protein_sequence, cmap) # calculate protein coverage at different MC @@ -396,7 +405,7 @@ def add_variable_modification(r): for mc in mcs: df_mc = df[df['MC'] <= mc] spans = [(s, e, mc) for s, e, mc in df_mc[['Start', 'End', 'MC']].values] - cov = 
calculate_span_coverage(spans, protein_length) + cov = pt.calculate_span_coverage(spans, protein_length) protein_cov_at_mcs.append(sum(cov) / len(cov) * 100) # calculate protein coverage at different peptide lengths @@ -405,7 +414,7 @@ def add_variable_modification(r): for l in lens: df_len = df[df['Len'] <= l] spans = [(s, e, mc) for s, e, mc in df_len[['Start', 'End', 'MC']].values] - cov = calculate_span_coverage(spans, protein_length) + cov = pt.calculate_span_coverage(spans, protein_length) protein_cov_at_lens.append(sum(cov) / len(cov) * 100) # calculate protein coverage at different peptide Mass @@ -414,7 +423,7 @@ def add_variable_modification(r): for m in masses: df_mass = df[df['NeutralMass'] <= m] spans = [(s, e, mc) for s, e, mc in df_mass[['Start', 'End', 'MC']].values] - cov = calculate_span_coverage(spans, protein_length) + cov = pt.calculate_span_coverage(spans, protein_length) protein_cov_at_mass.append(sum(cov) / len(cov) * 100) st.write(f'##### [Analysis URL]({url}) (copy me and send to your friends!)') @@ -443,14 +452,13 @@ def add_variable_modification(r): use_container_width=True ) - with t2: st.header('Cleavage & Coverage') c1, c2 = st.columns(2) c1.metric('Cleavage Sites', len(sites)) - protein_cov_arr_bin = calculate_span_coverage(spans, protein_length, accumulate=False) + protein_cov_arr_bin = pt.calculate_span_coverage(spans, protein_length, accumulate=False) protein_cov_perc = round(sum(protein_cov_arr_bin) / len(protein_cov_arr_bin) * 100, 2) c2.metric('Protein Coverage', f'{protein_cov_perc}%') @@ -485,16 +493,21 @@ def add_variable_modification(r): motif_regex = c1.text_input('Motifs Regex', '(K)') st.cache_data() + + def get_motif_sites(motif_regex, stripped_protein_sequence): motif_sites = list(reg.finditer(motif_regex, stripped_protein_sequence, overlapped=True)) return motif_sites + if motif_regex: motif_sites = get_motif_sites(motif_regex, stripped_protein_sequence) + def count_motifs(row): return sum([1 for site in motif_sites 
if row['Start'] <= site.start() < row['End']]) + df['Motifs'] = df.apply(count_motifs, axis=1) motif_cov_indexes = {i for site in motif_sites for i in range(site.start(), site.end())} @@ -514,7 +527,6 @@ def count_motifs(row): else: motif_cov_array[i] = min(row[2], motif_cov_array[i]) - min_moitifs = c2.number_input('Min Motifs', min_value=0, max_value=max(df['Motifs']), value=0) max_motifs = c3.number_input('Max Motifs', min_value=0, max_value=max(df['Motifs']), value=max(df['Motifs'])) df = df[(df['Motifs'] >= min_moitifs) & (df['Motifs'] <= max_motifs)] @@ -535,7 +547,6 @@ def count_motifs(row): counter = Counter(df['Motifs']) - st.subheader('Motif Site Coverage', help='The color corresponds to the peptide with the fewest number of motif ' 'matches (excluding 0 matches). Example: Lets assume that the first ' 'site is covered by two peptides, the first with one match and the ' @@ -548,9 +559,9 @@ def count_motifs(row): for i, (k, v) in enumerate(sorted(counter.items())): df_tmp = df[df['Motifs'] == k] tmp_spans = [(s, e, mc) for s, e, mc in df_tmp[['Start', 'End', 'MC']].values] - cov = calculate_span_coverage(tmp_spans, protein_length, accumulate=True) + cov = pt.calculate_span_coverage(tmp_spans, protein_length, accumulate=True) - cov_bin = calculate_span_coverage(tmp_spans, protein_length, accumulate=False) + cov_bin = pt.calculate_span_coverage(tmp_spans, protein_length, accumulate=False) c1, c2 = st.columns(2) c1.metric(f'Protein Coverage with {k} motif matches', f'{round(sum(cov_bin) / len(cov_bin) * 100, 2)}%') @@ -631,4 +642,3 @@ def get_model_file_as_byte_stream(path): st.subheader('Example Code') st.code(MODEL_CODE) - diff --git a/constants.py b/constants.py index 2676297..46469de 100644 --- a/constants.py +++ b/constants.py @@ -82,3 +82,4 @@ def get_env_str(var_name, default): VALID_PROTEASES = {k.replace(' ', '_'): v for k, v in VALID_PROTEASES.items()} BASE_URL = get_env_str('BASE_URL', 'http://localhost:8501') + diff --git a/requirements.txt 
b/requirements.txt index 6683804..1babcba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ pandas==2.1.3 -streamlit==1.30.0 -peptacular==1.2.0 +streamlit==1.32.2 requests==2.31.0 matplotlib==3.7.2 numpy==1.25.1 -xgboost==2.0.2 \ No newline at end of file +xgboost==2.0.2 +peptacular @ git+https://github.com/pgarrett-scripps/peptacular@main \ No newline at end of file diff --git a/util.py b/util.py index c3215a0..784d934 100644 --- a/util.py +++ b/util.py @@ -5,21 +5,23 @@ import numpy as np import pandas as pd import requests -from peptacular.mass import calculate_mass -from peptacular.sequence import span_to_sequence, calculate_sequence_length, apply_static_modifications, \ - strip_modifications, apply_variable_modifications -from peptacular.spans import build_enzymatic_spans, build_semi_spans +import peptacular as pt import matplotlib.pyplot as plt import matplotlib.colors as mcolors import matplotlib as mpl -from peptacular.term.modification import add_n_term_modification, add_c_term_modification +from peptacular import InvalidModificationMassError +import streamlit as st from constants import LINK, BASE_URL def fetch_sequence_from_uniprot(accession_number): - url = f"https://www.uniprot.org/uniprot/{accession_number}.fasta" - response = requests.get(url) + url = f"http://www.uniprot.org/uniprot/{accession_number}.fasta" + try: + response = requests.get(url) + except Exception as e: + st.error(e) + st.stop() return response @@ -28,17 +30,26 @@ def generate_peptide_df(sequence: str, cleavage_sites: List, missed_cleavages: i is_mono: bool, infer_charge: bool, min_charge: int, max_charge: int, min_mz: float, max_mz: float, var_mods: dict, max_var_mods: int, n_term_static_mod: float, c_term_static_mod: float, n_term_var_mod: float, c_term_var_mod: float, - remove_non_proteotypic: bool): + remove_non_proteotypic: bool, mod_mode: str): + + n_term_static_mod = [n_term_static_mod] if n_term_static_mod else [] + c_term_static_mod = [c_term_static_mod] 
if c_term_static_mod else [] + n_term_var_mod = [n_term_var_mod] if n_term_var_mod else [] + c_term_var_mod = [c_term_var_mod] if c_term_var_mod else [] + + var_mods = {k: [v] for k, v in var_mods.items()} + static_mods = {k: [v] for k, v in static_mods.items()} + cleavage_sites = sorted(cleavage_sites) - spans = build_enzymatic_spans(calculate_sequence_length(sequence), cleavage_sites, missed_cleavages, 1, None) + spans = pt.build_enzymatic_spans(pt.sequence_length(sequence), cleavage_sites, missed_cleavages, 1, None) df = pd.DataFrame(spans, columns=['Start', 'End', 'MC']) - df['Sequence'] = [span_to_sequence(sequence, span) for span in spans] + df['Sequence'] = [pt.span_to_sequence(sequence, span) for span in spans] df['Semi'] = 0 if semi_enzymatic is True: - semi_spans = build_semi_spans(spans, min_len, max_len) + semi_spans = pt.build_semi_spans(spans, min_len, max_len) semi_df = pd.DataFrame(semi_spans, columns=['Start', 'End', 'MC']) - semi_df['Sequence'] = [span_to_sequence(sequence, span) for span in semi_spans] + semi_df['Sequence'] = [pt.span_to_sequence(sequence, span) for span in semi_spans] semi_df['Semi'] = 1 df = pd.concat([df, semi_df], ignore_index=True) @@ -55,29 +66,14 @@ def generate_peptide_df(sequence: str, cleavage_sites: List, missed_cleavages: i # Apply variable modifications to each sequence in the DataFrame def apply_var_mods(sequence: str) -> str: - var_seqs = apply_variable_modifications(sequence, var_mods, max_var_mods) - - if n_term_var_mod and c_term_var_mod: - n_term_seq = add_n_term_modification(sequence, n_term_var_mod) - c_term_seq = add_c_term_modification(sequence, c_term_var_mod) - n_c_term_seq = add_c_term_modification(n_term_seq, c_term_var_mod) - - n_term_seqs = apply_variable_modifications(n_term_seq, var_mods, max_var_mods) - c_term_seqs = apply_variable_modifications(c_term_seq, var_mods, max_var_mods) - n_c_term_seqs = apply_variable_modifications(n_c_term_seq, var_mods, max_var_mods) - - return 
';'.join(list(set(var_seqs + n_term_seqs + c_term_seqs + n_c_term_seqs))) + var_seqs = pt.apply_variable_mods(sequence=sequence, + internal_mods=var_mods, + max_mods=max_var_mods, + nterm_mods=n_term_var_mod, + cterm_mods=c_term_var_mod, + mode=mod_mode) - elif n_term_var_mod: - n_term_seq = add_n_term_modification(sequence, n_term_var_mod) - n_term_seqs = apply_variable_modifications(n_term_seq, var_mods, max_var_mods) - return ';'.join(list(set(var_seqs + n_term_seqs))) - elif c_term_var_mod: - c_term_seq = add_c_term_modification(sequence, c_term_var_mod) - c_term_seqs = apply_variable_modifications(c_term_seq, var_mods, max_var_mods) - return ';'.join(list(set(var_seqs + c_term_seqs))) - else: - return ';'.join(var_seqs) + return ';'.join(var_seqs) # Apply the apply_var_mods function to the 'Sequence' column df['Sequence'] = df['Sequence'].apply(apply_var_mods) @@ -86,24 +82,27 @@ def apply_var_mods(sequence: str) -> str: df = df.assign(Sequence=df.Sequence.str.split(';')).explode('Sequence') def apply_static_mods(sequence: str) -> str: - - sequence = apply_static_modifications(sequence, static_mods) - - if n_term_static_mod: - sequence = add_n_term_modification(sequence, n_term_static_mod) - - if c_term_static_mod: - sequence = add_c_term_modification(sequence, c_term_static_mod) - + sequence = pt.apply_static_mods(sequence=sequence, + internal_mods=static_mods, + nterm_mods=n_term_static_mod, + cterm_mods=c_term_static_mod, + mode=mod_mode) return sequence df['Sequence'] = df['Sequence'].apply(apply_static_mods) # drop duplicates - df['NeutralMass'] = [round(calculate_mass(sequence, monoisotopic=is_mono), 5) for sequence in df['Sequence']] + def calc_mass(sequence: str, monoisotopic: bool, precision: int) -> float: + try: + return pt.mass(sequence, monoisotopic=monoisotopic, precision=precision, ion_type='p') + except InvalidModificationMassError as e: + st.error(e) + st.stop() + + df['NeutralMass'] = df['Sequence'].apply(calc_mass, monoisotopic=is_mono, 
precision=5) df = df[(df['NeutralMass'] >= min_mass) & (df['NeutralMass'] <= max_mass)] - df['StrippedPeptide'] = df['Sequence'].apply(strip_modifications) + df['StrippedPeptide'] = df['Sequence'].apply(pt.strip_mods) df.sort_values(by=['MC'], inplace=True) df.drop_duplicates(subset=['Start', 'Sequence'], inplace=True) @@ -120,18 +119,18 @@ def apply_static_mods(sequence: str) -> str: return df rt_model = pickle.load(open("rt_model.pkl", "rb")) - df['RT'] = rt_model.predict(np.array([bin_aa_counts(strip_modifications(seq)) for seq in df['Sequence']])) + df['RT'] = rt_model.predict(np.array([bin_aa_counts(pt.strip_mods(seq)) for seq in df['Sequence']])) df['RT'] = df['RT'].round(3) if infer_charge: im_model = pickle.load(open("im_model.pkl", "rb")) df['IM'] = im_model.predict( - np.array([bin_aa_counts(strip_modifications(seq), c) for seq, c in df[['Sequence', 'Charge']].values])) + np.array([bin_aa_counts(pt.strip_mods(seq), c) for seq, c in df[['Sequence', 'Charge']].values])) df['IM'] = df['IM'].round(3) proteotypic_model = pickle.load(open("proteotypic_model.pkl", "rb")) df['Proteotypic'] = proteotypic_model.predict( - np.array([bin_aa_counts(strip_modifications(seq)) for seq in df['Sequence']])).astype(bool) + np.array([bin_aa_counts(pt.strip_mods(seq)) for seq in df['Sequence']])).astype(bool) if remove_non_proteotypic: df = df[df['Proteotypic']] @@ -197,7 +196,7 @@ def generate_app_url(protein_id, protein_sequence, proteases, custom_regex, miss min_peptide_len, max_peptide_len, min_mass, max_mass, semi_enzymatic, infer_charge, min_charge, max_charge, min_mz, max_mz, remove_non_proteotypic, n_term_static_mod, c_term_static_mod, num_static_mods, n_term_var_mod, c_term_var_mod, max_var_mods, - num_variable_mods, static_mods, variable_mods): + num_variable_mods, static_mods, variable_mods, mod_mode): # flip the dictionary static_mods_rev = {} @@ -236,7 +235,8 @@ def generate_app_url(protein_id, protein_sequence, proteases, custom_regex, miss 'max_var_mods': 
max_var_mods, 'num_variable_mods': num_variable_mods, 'static_mods': static_mod_str, - 'variable_mods': variable_mod_str + 'variable_mods': variable_mod_str, + 'mod_mode': mod_mode } query_string = '&'.join([f'{key}={value}' for key, value in params.items() if value is not None]) return f'{BASE_URL}?{query_string}' diff --git a/wiki.py b/wiki.py index 46ba1a0..8ff8b1c 100644 --- a/wiki.py +++ b/wiki.py @@ -18,7 +18,9 @@ conducive to binding the target protein. -For example, trypsin binds after Lysine (K) or Arginine (R). Let's use the first Arginine (R) in the following protein +For example, trypsin cleaves after (on the C-terminal side of) Lysine (K) or Arginine (R). Let's use the first Arginine + (R) in the following protein. + as an example: ``` Protein: H-W-P-R-A-T-G-A-K-Y-G-G-L @@ -119,15 +121,31 @@ ## Average vs. Monoisotopic Mass -In the context of mass spectrometry and proteomics, two important concepts related to the mass of atoms, ions, -molecules, or compounds are average mass and monoisotopic mass. +In the context of mass spectrometry and proteomics, there are two methodologies for calculating the mass of analytes: +Monoisotopic and Average. ### Average Mass The average mass, also known as the molecular weight or the molecular mass, is the weighted average of the masses of all isotopes of an element, taking into account their natural abundance. For instance, carbon (C) has two naturally occurring isotopes: C-12 and C-13. The average mass of carbon takes into account the masses and abundances of these -two isotopes. +two isotopes.
Average mass is calculated as the abundance-weighted sum of the isotope masses: + +``` +Isotope = 12C +Relative Atomic Mass = 12.0000000(00) +Isotopic Composition = 0.9893(8) +``` + +``` +Isotope = 13C +Relative Atomic Mass = 13.00335483507(23) +Isotopic Composition = 0.0107(8) +``` + +``` +Average Mass = (12.0000 * 0.9893) + (13.0034 * 0.0107) = 12.0107 Da +``` Similarly, when calculating the average mass of a peptide or a protein, the average masses of all the individual amino acids (which again, take into account the different isotopes of all the atoms in the amino acid) are summed.