Merge pull request #63 from Jumitti/beta

Reverse and complement as an option
Jumitti · Aug 7, 2024 · 71995a3
2 parents d4419af + f446bd9
commit 71995a3
Show file tree

Hide file tree

Showing 4 changed files with 115 additions and 34 deletions.
diff --git a/TFinder-v1.py b/TFinder-v1.py
@@ -126,7 +126,6 @@ def load_lottiefile(filepath: str):
 st.success("If the application does not work, here are other deployments:\n"
            f"   - TFinder on [Streamlit](https://streamlit.io/): [https://tfinder-ipmc.streamlit.app/](https://tfinder-ipmc.streamlit.app/)\n"
            f"   - TFinder on [Health Universe](https://www.healthuniverse.com/): [https://apps.healthuniverse.com/nhu-dxv-ktj](https://apps.healthuniverse.com/nhu-dxv-ktj)\n"
-           f"   - TFinder on [Ploomber](https://ploomber.io/): [https://rough-meadow-6083.ploomberapp.io/](https://rough-meadow-6083.ploomberapp.io/)\n"
            f"   - (BETA) TFinder: [https://tfinder-beta.streamlit.app/](https://tfinder-beta.streamlit.app/)\n")
 
 if chosen_tab == HOME:

diff --git a/navigation/aio.py b/navigation/aio.py
@@ -705,6 +705,17 @@ def aio_page():
         else:
             calc_pvalue = None
 
+    with BSFcol3:
+        st.markdown("🔹 :blue[**_Experimental_**] Analyse all directions", help='Directions: **original (+ →)**, **reverse-complement (- ←)**, reverse (+ ←), complement (- →)\n\n'
+                                                                               'Directions in bold are the default directions.')
+        alldirection = st.toggle('All directions')
+        if alldirection:
+            st.markdown(
+                '⚠️Analyzes in the reverse (+ ←) and complement (- →) directions are generally not suitable for studying TFBS.')
+            analyse = 4
+        else:
+            analyse = 2
+
     if tss_ge_input != 0:
         tss_ge_distance = int(tss_ge_input)
     else:
@@ -725,13 +736,13 @@ def aio_page():
         else:
             button = False
 
-    sequence_iteration = 2 * total_sequences_region_length
+    sequence_iteration = analyse * total_sequences_region_length
     num_random_seqs = 1000000
     if total_sequences <= 10:
         random_gen = total_sequences * num_random_seqs
     else:
         random_gen = num_random_seqs
-    random_score = random_gen * 2
+    random_score = random_gen * analyse
 
     if pvalue:
         iteration = sequence_iteration + random_gen + random_score
@@ -743,12 +754,12 @@ def aio_page():
                  use_container_width=True,
                  disabled=button):
         with stqdm(total=iteration,
-                   desc='**:blue[Extract sequence...] ⚠️:red[PLEASE WAIT UNTIL END WITHOUT CHANGING ANYTHING]**',
+                   desc='**:blue[Analyse sequence...] ⚠️:red[PLEASE WAIT UNTIL END WITHOUT CHANGING ANYTHING]**',
                    mininterval=0.1) as progress_bar:
             individual_motif_occurrences = IMO.individual_motif_finder(dna_sequences, threshold, matrix,
                                                                        progress_bar,
                                                                        calc_pvalue,
-                                                                       tss_ge_distance)
+                                                                       tss_ge_distance, alldirection)
         st.session_state['individual_motif_occurrences'] = individual_motif_occurrences
 
     st.divider()

diff --git a/requirements.txt b/requirements.txt
@@ -35,18 +35,24 @@
 # tabulate
 # weblogo
 
-altair
-deprecation
+altair~=5.3.0
+deprecation~=2.1.0
 future
 hydralit_components
-logomaker
-numpy
+logomaker~=0.8
+numpy~=1.26.4
 openpyxl
-pandas
+pandas~=2.2.2
 plotly
-requests
-stqdm
-streamlit==1.34
+requests~=2.32.3
+stqdm~=0.0.5
+streamlit~=1.34.0
 streamlit_analytics
 streamlit-lottie
-tqdm
+tqdm~=4.66.4
+bs4~=0.0.2
+beautifulsoup4~=4.12.3
+pillow~=10.3.0
+scipy
+matplotlib
+scikit-learn
diff --git a/tfinder/__init__.py b/tfinder/__init__.py
@@ -18,15 +18,22 @@
 # OUT OF OR IN CONNECTION WITH TFINDER OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
+import random
+import re
+import time
+import xml.etree.ElementTree as ET
+
+import altair as alt
 import logomaker
 import numpy as np
 import pandas as pd
-import random
 import requests
-import time
+import streamlit as st
 from bs4 import BeautifulSoup
-import re
-import xml.etree.ElementTree as ET
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.preprocessing import StandardScaler
 from tqdm import tqdm
 
 
@@ -407,8 +414,9 @@ def matrix_extraction(jaspar_id):
 
     @staticmethod
     # Transform JASPAR matrix
-    def transform_matrix(matrix):
-        # reversed_matrix = {base: list(reversed(scores)) for base, scores in matrix.items()}
+    def transform_matrix(matrix, alldirection):
+        if alldirection is True:
+            reversed_matrix = {base: list(reversed(scores)) for base, scores in matrix.items()}
         complement_matrix = {
             'A': matrix['T'],
             'C': matrix['G'],
@@ -417,17 +425,18 @@ def transform_matrix(matrix):
         }
         reversed_complement_matrix = {base: list(reversed(scores)) for base, scores in complement_matrix.items()}
 
-        # return {
-        #     '+ f': matrix,
-        #     '+ r': reversed_matrix,
-        #     '- f': complement_matrix,
-        #     '- r': reversed_complement_matrix
-        # }
-
-        return {
-            '+ f': matrix,
-            '- r': reversed_complement_matrix
-        }
+        if alldirection is True:
+            return {
+                '+ f': matrix,
+                '+ r': reversed_matrix,
+                '- f': complement_matrix,
+                '- r': reversed_complement_matrix
+            }
+        else:
+            return {
+                '+ f': matrix,
+                '- r': reversed_complement_matrix
+            }
 
     @staticmethod
     # Generate random sequences for p_value
@@ -475,14 +484,14 @@ def is_dna(dna_sequence):
 
     @staticmethod
     # Find with JASPAR and manual matrix
-    def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None):
+    def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None, alldirection=None):
         if calc_pvalue is not None:
             if calc_pvalue not in ["ATGCPreset", "ATGCProportion"]:
                 raise ValueError("Use 'ATGCPreset' or 'ATGCProportion'")
 
         individual_motif_occurrences = []
 
-        matrices = IMO.transform_matrix(matrix)
+        matrices = IMO.transform_matrix(matrix, alldirection)
 
         seq_length = len(matrices['+ f']['A'])
 
@@ -562,6 +571,55 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
 
                     random_scores = np.array(matrix_random_scores)
 
+                # p_val = []
+                # for score_pval in np.arange(0.50, 1, 0.001):
+                #     p_val_test = (random_scores >= score_pval).sum() / len(random_scores)
+                #     row = [score_pval, p_val_test]
+                #     p_val.append(row)
+                # headpval = ["Score_pvalue", "p_value"]
+                # p_val.insert(0, headpval)
+                #
+                # p_value_df = pd.DataFrame(p_val[1:], columns=p_val[0])
+                # st.dataframe(p_value_df)
+                #
+                # filtered_df = p_value_df[p_value_df['p_value'] > 0]
+                #
+                # scores = filtered_df['Score_pvalue'].values
+                # p_values = filtered_df['p_value'].values
+                #
+                # log_p_values = np.log(p_values)
+                # model = LinearRegression()
+                # model.fit(scores.reshape(-1, 1), log_p_values)
+                #
+                # fitted_log_values = model.predict(scores.reshape(-1, 1))
+                # fitted_values = np.exp(fitted_log_values)
+                # filtered_df['fitted'] = np.maximum(fitted_values, 0)
+                #
+                # # RMSE
+                # rmse = np.sqrt(mean_squared_error(p_values, filtered_df['fitted']))
+                # st.write(f"RMSE : {rmse}")
+                #
+                # # Coef
+                # coef = model.coef_[0]
+                # intercept = model.intercept_
+                #
+                # # Standard curve
+                # equation = f"y = exp({intercept:.3f} + {coef:.3f}*x)"
+                # st.write(f"Standard curve : {equation}")
+                #
+                # scatter = alt.Chart(filtered_df).mark_point().encode(
+                #     x='Score_pvalue',
+                #     y='p_value'
+                # )
+                #
+                # line = alt.Chart(filtered_df).mark_line(color='red').encode(
+                #     x='Score_pvalue',
+                #     y='fitted'
+                # ).interactive()
+                #
+                # chart = scatter + line
+                # st.altair_chart(chart)
+
                 for i in range(len(dna_sequence) - seq_length + 1):
                     seq = dna_sequence[i:i + seq_length]
                     score = IMO.calculate_score(seq, matrix)
@@ -609,13 +667,19 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
                             if calc_pvalue is not None:
                                 p_value = (random_scores >= normalized_score).sum() / len(random_scores)
 
+                                # def evaluate_exponential(x, coef, intercept):
+                                #     return np.exp(intercept + coef * x)
+                                #
+                                # y_value = evaluate_exponential(normalized_score, coef, intercept)
+
                             row = [position]
                             if tss_ge_distance is not None:
                                 row.append(tis_position)
                             row += [sequence_with_context,
                                     "{:.6f}".format(normalized_score).ljust(12)]
                             if calc_pvalue is not None:
-                                row.append("{:.3e}".format(p_value).ljust(12))
+                                row.append("{:.6e}".format(p_value))
+                                # row.append("{:.6e}".format(y_value))
                             row += [strand, direction, name, species, region]
                             individual_motif_occurrences.append(row)
 
@@ -634,6 +698,7 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
             header += ["Sequence", "Rel Score"]
             if calc_pvalue is not None:
                 header.append("p-value")
+                # header.append("p-value_pred")
             header += ["Strand", "Direction", "Gene", "Species", "Region"]
             individual_motif_occurrences.insert(0, header)
         else: