Skip to content

Commit

Permalink
Merge pull request #40 from Jumitti/beta
Browse files Browse the repository at this point in the history
improvement and preparation for upgrade
  • Loading branch information
Jumitti authored Nov 7, 2023
2 parents 5297b93 + fbe81be commit 46b2061
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 72 deletions.
Binary file added .streamlit/TFinder_logo_site.ico
Binary file not shown.
22 changes: 1 addition & 21 deletions TFinder-v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,28 +117,8 @@

streamlit_analytics.start_tracking()

# Credit rating
# Credit
st.sidebar.image("img/TFinder_logo_site.png")
# try:
# with open("ratings.pkl", "rb") as file:
# ratings = pickle.load(file)
# except FileNotFoundError:
# ratings = []
# rating = st.sidebar.slider("Rate it 😊 (1-5 ⭐)", 1, 5, 5)
# colrate1, colrate2 = st.sidebar.columns(2)
# with colrate1:
# submit_button = st.button("Submit Rating")
# if submit_button:
# st.balloons()
# ratings.append(rating)
# with open("ratings.pkl", "wb") as file:
# pickle.dump(ratings, file)
# st.toast("Thank you for rating the application!", icon='😍')
# st.balloons()
# average_rating = sum(ratings) / len(ratings) if ratings else 0
# num_ratings = len(ratings)
# with colrate2:
# st.write(f"{average_rating:.2f} ⭐ ({num_ratings} votes)")

# Help
st.sidebar.title("Help")
Expand Down
107 changes: 71 additions & 36 deletions navigation/aio.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Any, List

import altair as alt
import pandas as pd
Expand All @@ -50,6 +49,34 @@
import pickle


def search_species_at_NCBI(query_species_name):
    """Query the NCBI Taxonomy browser for *query_species_name* and return
    a list of matching scientific names (empty when nothing matches)."""

    def _strip_markup(text):
        # Remove residual HTML tags (and the space preceding them) from a name.
        return re.sub(r' <[^>]+>', '', text)

    matches = []

    with st.spinner("Searching for species..."):
        url = (
            'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi'
            f'?name="{query_species_name}"&srchmode=3&filter=genome_filter'
        )

        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        result_list = soup.find("ul")
        taxonomy_id_tag = soup.find(text="Taxonomy ID:")

        if result_list:
            # Multiple candidates: each scientific name is wrapped in <strong>.
            for strong_tag in result_list.find_all("strong"):
                matches.append(_strip_markup(strong_tag.get_text()))

        elif taxonomy_id_tag:
            # Single exact hit: NCBI went straight to the taxon page,
            # so the query itself is the accepted name.
            matches.append(query_species_name)

    return matches


def email(excel_file, csv_file, txt_output, email_receiver, body, jaspar):
try:
current_date_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
Expand Down Expand Up @@ -135,8 +162,9 @@ def result_table_output(df):
y=alt.Y('Rel Score:Q', axis=alt.Axis(title='Relative Score'),
scale=alt.Scale(domain=[ystart, ystop])),
color=alt.condition(gene_region_selection, color_scale, alt.value('lightgray')),
tooltip=['Position'] + (['Rel Position'] if "Rel Position" in source else []) + ['Rel Score'] + (
['p-value'] if 'p-value' in source else []) + ['Sequence', 'Gene', 'Species', 'Region'],
tooltip=['Sequence', 'Position'] + (['Rel Position'] if "Rel Position" in source else []) + ['Rel Score'] + (
['p-value'] if 'p-value' in source else []) + (
['LCS', 'LCS length', 'LCS Rel Score'] if "LCS" in source else []) + ['Gene', 'Species', 'Region'],
opacity=alt.condition(gene_region_selection, alt.value(0.8), alt.value(0.2))
).transform_calculate(x=f'datum[{xcol_param.name}]').properties(width=600,
height=400).interactive().add_params(
Expand Down Expand Up @@ -165,24 +193,6 @@ def graph_threeD(df):
st.plotly_chart(fig, theme=None, use_container_width=True)


@st.cache_resource
def taxo():
    """Load the bundled per-letter species lists from disk.

    Cached by Streamlit so the pickle is deserialized only once per session.
    """
    with open("utils/species.pkl", "rb") as fh:
        return pickle.load(fh)


def search_taxo(searchterm: str) -> List[str]:
    """Return cached species names containing *searchterm* (case-insensitive).

    Species are pre-bucketed in the cached pickle by first letter under keys
    like ``species_a``, so only one bucket is scanned per query.

    Fixes vs. the previous version: an empty search term now returns no
    matches instead of raising ``IndexError`` on ``searchterm[0]``; the
    return annotation uses ``str`` rather than the builtin ``any``; the
    duplicated "Species retrieved" toast fires only once; and the query is
    lower-cased once instead of on every comparison.
    """
    if not searchterm:
        return []
    st.toast('Searching species... ⏳')
    species_lists = taxo()
    term_lower = searchterm.lower()
    # Bucket key is derived from the query's first (lower-cased) letter.
    species_set = species_lists.get(f"species_{term_lower[0]}", set())
    matches = [species for species in species_set if term_lower in species]
    st.toast('Species retrieved 😀')
    return matches


def aio_page():
st.subheader(':blue[Step 1] Promoter and Terminator Extractor')
colprom1, colprom2 = st.columns([0.8, 1.2], gap="small")
Expand Down Expand Up @@ -235,12 +245,18 @@ def aio_page():
["Human", "Mouse", "Rat", "Drosophila", "Zebrafish"], index=0,
label_visibility='collapsed')

# species = st_searchbox(
# search_species_at_NCBI,
# key="search_taxo", delay=0.25
# )

with col2:
all_variants = st.toggle('All variant')

# Upstream/Downstream Promoter
st.markdown("🔹 :blue[**Step 1.3**] Regulatory region:")
prom_term = st.radio("🔹 :blue[**Step 1.3**] Regulatory region:", ('Promoter', 'Terminator'),
horizontal=True,
label_visibility='collapsed')
if prom_term == 'Promoter':
st.markdown("🔹 :blue[**Step 1.4**] Upstream/downstream from the TSS (bp)")
Expand Down Expand Up @@ -284,7 +300,7 @@ def aio_page():
if not str(result_promoter_output).startswith('P'):
pbar.progress((i + 1) / len(gene_ids),
text=f'**:blue[Extract sequence... {gene_id}] ⚠️:red[PLEASE WAIT UNTIL END WITHOUT CHANGING ANYTHING]**')
st.toast(f'{prom_term} **{gene_id}** from **{species}** extracted', icon='🧬')
st.toast(f"{prom_term} **{gene_id}** from **{species}** extracted", icon='🧬')

result_promoter.append(result_promoter_output)
else:
Expand Down Expand Up @@ -723,6 +739,23 @@ def aio_page():
else:
calc_pvalue = None

# lcs = st.toggle('LCS')
# if lcs:
# max_variant_allowed = 1048576
# num_positions = len(next(iter(matrix.values())))
# total_sequences_pwm = 1
# for pos in range(num_positions):
# valid_probabilities = [matrix[nuc][pos] for nuc in matrix if matrix[nuc][pos] > 0]
# total_sequences_pwm *= len(valid_probabilities)
# if total_sequences_pwm > max_variant_allowed:
# st.error(
# f'Too many sequences. LCS not allowed for this PWM. Limit: {max_variant_allowed} | Total sequences : {total_sequences_pwm}')
# button = True
# else:
# button = False
# else:
# lcs = None

if tss_ge_input != 0:
tss_ge_distance = int(tss_ge_input)
else:
Expand Down Expand Up @@ -803,19 +836,21 @@ def aio_page():
mime="application/vnd.ms-excel", key='download-excel')
st.download_button(label="💾 Download table (.csv)", data=csv_file,
file_name=f"Results_TFinder_{current_date_time}.csv", mime="text/csv")
email_receiver = st.text_input('Send results by email ✉',
value='', placeholder='Send results by email ✉',
label_visibility="collapsed")
if st.button("Send ✉"):
if jaspar == 'PWM':
if matrix_type == 'With PWM':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
if matrix_type == 'With FASTA sequences':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{individual_motif}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
elif jaspar == 'JASPAR_ID':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nJASPAR_ID: {jaspar_id} | Transcription Factor name: {TF_name}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
else:
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{IUPAC}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
email(excel_file, csv_file, txt_output, email_receiver, body, jaspar)

if st.session_state["LOCAL"] == 'True':
email_receiver = st.text_input('Send results by email ✉',
value='', placeholder='Send results by email ✉',
label_visibility="collapsed")
if st.button("Send ✉"):
if jaspar == 'PWM':
if matrix_type == 'With PWM':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
if matrix_type == 'With FASTA sequences':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{individual_motif}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
elif jaspar == 'JASPAR_ID':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nJASPAR_ID: {jaspar_id} | Transcription Factor name: {TF_name}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
else:
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{IUPAC}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
email(excel_file, csv_file, txt_output, email_receiver, body, jaspar)
else:
st.error(f"No consensus sequence found with the specified threshold")
5 changes: 2 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Bio==1.5.9
logomaker==0.8
openpyxl==3.1.2
sendgrid==6.10.0
Pillow==9.5.0
pillow
stqdm==0.0.5
tqdm==4.66.1
streamlit-modal==0.1.0
Expand All @@ -47,5 +47,4 @@ altair==5.0.1
matplotlib==3.7.1
future==0.18.3
pyperclip==1.8.2
tabulate==0.9.0
streamlit-searchbox
tabulate==0.9.0
18 changes: 15 additions & 3 deletions streamlit_searchbox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
import streamlit as st
import streamlit.components.v1 as components

try:
from streamlit import rerun as rerun # type: ignore
except ImportError:
# conditional import for streamlit version <1.27
from streamlit import experimental_rerun as rerun # type: ignore


# point to build directory
parent_dir = os.path.dirname(os.path.abspath(__file__))
build_dir = os.path.join(parent_dir, "frontend/build")
Expand Down Expand Up @@ -68,7 +75,7 @@ def _process_search(
search_function: Callable[[str], List[Any]],
key: str,
searchterm: str,
rerun_on_update: bool,
rerun_on_update: bool
) -> None:
# nothing changed, avoid new search
if searchterm == st.session_state[key]["search"]:
Expand All @@ -84,7 +91,7 @@ def _process_search(
st.session_state[key]["options_py"] = _list_to_options_py(search_results)

if rerun_on_update:
st.rerun()
rerun()


@wrap_inactive_session
Expand All @@ -95,7 +102,7 @@ def st_searchbox(
default: Any = None,
default_options: List[Any] | None = None,
clear_on_submit: bool = False,
rerun_on_update: bool = True,
rerun_on_update: bool = True, delay: float = None,
key: str = "searchbox",
**kwargs,
) -> Any:
Expand All @@ -120,6 +127,7 @@ def st_searchbox(
Returns:
any: based on user selection
:param key:
:param delay:
:param rerun_on_update:
:param clear_on_submit:
:param default:
Expand All @@ -129,6 +137,10 @@ def st_searchbox(
:param default_options:
"""

# delay request (useful if you have a limit of request API)
if delay:
time.sleep(delay)

# key without prefix used by react component
key_react = f"{key}_react"

Expand Down
58 changes: 49 additions & 9 deletions tfinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,8 @@ def is_dna(dna_sequence):

@staticmethod
# Find with JASPAR and manual matrix
def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None):
def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None,
lcs=None):
if calc_pvalue is not None:
if calc_pvalue not in ["ATGCPreset", "ATGCProportion"]:
raise ValueError("Use 'ATGCPreset' or 'ATGCProportion'")
Expand All @@ -474,10 +475,11 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc

random_sequences = IMO.generate_ranseq(probabilities, seq_length, progress_bar, num_random_seqs)

if calc_pvalue == 'ATGCPreset':
random_scores = {}
matrix_random_scores = []
for matrix_name, matrix in matrices.items():
random_scores = {}
matrix_random_scores = []
LCS = {}
for matrix_name, matrix in matrices.items():
if calc_pvalue == 'ATGCPreset':
max_score = sum(max(matrix[base][i] for base in matrix.keys()) for i in range(seq_length))
min_score = sum(min(matrix[base][i] for base in matrix.keys()) for i in range(seq_length))
for random_sequence in random_sequences:
Expand All @@ -489,8 +491,11 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
normalized_random_score = (random_score - min_score) / (max_score - min_score)
matrix_random_scores.append(normalized_random_score)
progress_bar.update(1)
random_scores = np.array(matrix_random_scores)

random_scores = np.array(matrix_random_scores)
if lcs is not None:
generated_sequences = IMO.generate_sequences(matrix)
LCS[matrix_name] = generated_sequences

for name, dna_sequence, species, region in dna_sequences:
if calc_pvalue == 'ATGCProportion':
Expand Down Expand Up @@ -584,6 +589,28 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
tis_position = position - tss_ge_distance

if normalized_score >= threshold:

if lcs is not None:
seqint = sequence_with_context[3:-3]
best_lcs_continuous = ""
best_lcs_for_relscore = ""
sequences_for_matrix_name = LCS[matrix_name]

for seqref in sequences_for_matrix_name:
lcs_continuous, lcs_for_relscore = IMO.LCScontinuous(seqint, seqref)

if len(lcs_continuous) > len(best_lcs_continuous):
best_lcs_continuous = lcs_continuous
score_lcs_continuous = len(best_lcs_continuous)
best_lcs_for_relscore = lcs_for_relscore

lcs_rel_score = IMO.calculate_score(best_lcs_for_relscore, matrix)
if max_score == min_score:
lcs_normalized_score = lcs_rel_score / max_score
else:
lcs_normalized_score = (lcs_rel_score - min_score) / (
max_score - min_score)

if calc_pvalue is not None:
p_value = (random_scores >= normalized_score).sum() / len(random_scores)

Expand All @@ -594,20 +621,33 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
"{:.6f}".format(normalized_score).ljust(12)]
if calc_pvalue is not None:
row.append("{:.3e}".format(p_value).ljust(12))
if lcs is not None:
row += [best_lcs_continuous, str(score_lcs_continuous).ljust(15),
"{:.6f}".format(lcs_normalized_score).ljust(12)]
row += [strand, direction, name, species, region]
individual_motif_occurrences.append(row)

if len(individual_motif_occurrences) > 0:
if tss_ge_distance is not None:
individual_motif_occurrences.sort(key=lambda x: float(x[3]), reverse=True)
if tss_ge_distance is not None and calc_pvalue is not None and lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[3]), x[6], x[7]), reverse=True)
elif tss_ge_distance is not None and lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[3]), x[5], x[6]), reverse=True)
elif calc_pvalue is not None and lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[2]), x[5], x[6]), reverse=True)
elif lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[2]), x[4], x[5]), reverse=True)
elif tss_ge_distance is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[3])), reverse=True)
else:
individual_motif_occurrences.sort(key=lambda x: float(x[2]), reverse=True)
individual_motif_occurrences.sort(key=lambda x: (float(x[2])), reverse=True)
header = ["Position"]
if tss_ge_distance is not None:
header.append("Rel Position")
header += ["Sequence", "Rel Score"]
if calc_pvalue is not None:
header.append("p-value")
if lcs is not None:
header += ["LCS", 'LCS length', 'LCS Rel Score']
header += ["Strand", "Direction", "Gene", "Species", "Region"]
individual_motif_occurrences.insert(0, header)
else:
Expand Down

0 comments on commit 46b2061

Please sign in to comment.