Skip to content

Commit

Permalink
Merge pull request #63 from Jumitti/beta
Browse files — browse the repository at this point in the history
Reverse and complement as an option
  • Loading branch information
Jumitti authored Aug 7, 2024
2 parents d4419af + f446bd9 commit 71995a3
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 34 deletions.
1 change: 0 additions & 1 deletion TFinder-v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ def load_lottiefile(filepath: str):
st.success("If the application does not work, here are other deployments:\n"
f" - TFinder on [Streamlit](https://streamlit.io/): [https://tfinder-ipmc.streamlit.app/](https://tfinder-ipmc.streamlit.app/)\n"
f" - TFinder on [Health Universe](https://www.healthuniverse.com/): [https://apps.healthuniverse.com/nhu-dxv-ktj](https://apps.healthuniverse.com/nhu-dxv-ktj)\n"
f" - TFinder on [Ploomber](https://ploomber.io/): [https://rough-meadow-6083.ploomberapp.io/](https://rough-meadow-6083.ploomberapp.io/)\n"
f" - (BETA) TFinder: [https://tfinder-beta.streamlit.app/](https://tfinder-beta.streamlit.app/)\n")

if chosen_tab == HOME:
Expand Down
19 changes: 15 additions & 4 deletions navigation/aio.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,17 @@ def aio_page():
else:
calc_pvalue = None

with BSFcol3:
st.markdown("🔹 :blue[**_Experimental_**] Analyse all directions", help='Directions: **original (+ →)**, **reverse-complement (- ←)**, reverse (+ ←), complement (- →)\n\n'
'Directions in bold are the default directions.')
alldirection = st.toggle('All directions')
if alldirection:
st.markdown(
'⚠️Analyzes in the reverse (+ ←) and complement (- →) directions are generally not suitable for studying TFBS.')
analyse = 4
else:
analyse = 2

if tss_ge_input != 0:
tss_ge_distance = int(tss_ge_input)
else:
Expand All @@ -725,13 +736,13 @@ def aio_page():
else:
button = False

sequence_iteration = 2 * total_sequences_region_length
sequence_iteration = analyse * total_sequences_region_length
num_random_seqs = 1000000
if total_sequences <= 10:
random_gen = total_sequences * num_random_seqs
else:
random_gen = num_random_seqs
random_score = random_gen * 2
random_score = random_gen * analyse

if pvalue:
iteration = sequence_iteration + random_gen + random_score
Expand All @@ -743,12 +754,12 @@ def aio_page():
use_container_width=True,
disabled=button):
with stqdm(total=iteration,
desc='**:blue[Extract sequence...] ⚠️:red[PLEASE WAIT UNTIL END WITHOUT CHANGING ANYTHING]**',
desc='**:blue[Analyse sequence...] ⚠️:red[PLEASE WAIT UNTIL END WITHOUT CHANGING ANYTHING]**',
mininterval=0.1) as progress_bar:
individual_motif_occurrences = IMO.individual_motif_finder(dna_sequences, threshold, matrix,
progress_bar,
calc_pvalue,
tss_ge_distance)
tss_ge_distance, alldirection)
st.session_state['individual_motif_occurrences'] = individual_motif_occurrences

st.divider()
Expand Down
24 changes: 15 additions & 9 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,24 @@
# tabulate
# weblogo

altair
deprecation
altair~=5.3.0
deprecation~=2.1.0
future
hydralit_components
logomaker
numpy
logomaker~=0.8
numpy~=1.26.4
openpyxl
pandas
pandas~=2.2.2
plotly
requests
stqdm
streamlit==1.34
requests~=2.32.3
stqdm~=0.0.5
streamlit~=1.34.0
streamlit_analytics
streamlit-lottie
tqdm
tqdm~=4.66.4
bs4~=0.0.2
beautifulsoup4~=4.12.3
pillow~=10.3.0
scipy
matplotlib
scikit-learn
105 changes: 85 additions & 20 deletions tfinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,22 @@
# OUT OF OR IN CONNECTION WITH TFINDER OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

import random
import re
import time
import xml.etree.ElementTree as ET

import altair as alt
import logomaker
import numpy as np
import pandas as pd
import random
import requests
import time
import streamlit as st
from bs4 import BeautifulSoup
import re
import xml.etree.ElementTree as ET
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm


Expand Down Expand Up @@ -407,8 +414,9 @@ def matrix_extraction(jaspar_id):

@staticmethod
# Transform JASPAR matrix
def transform_matrix(matrix):
# reversed_matrix = {base: list(reversed(scores)) for base, scores in matrix.items()}
def transform_matrix(matrix, alldirection):
if alldirection is True:
reversed_matrix = {base: list(reversed(scores)) for base, scores in matrix.items()}
complement_matrix = {
'A': matrix['T'],
'C': matrix['G'],
Expand All @@ -417,17 +425,18 @@ def transform_matrix(matrix):
}
reversed_complement_matrix = {base: list(reversed(scores)) for base, scores in complement_matrix.items()}

# return {
# '+ f': matrix,
# '+ r': reversed_matrix,
# '- f': complement_matrix,
# '- r': reversed_complement_matrix
# }

return {
'+ f': matrix,
'- r': reversed_complement_matrix
}
if alldirection is True:
return {
'+ f': matrix,
'+ r': reversed_matrix,
'- f': complement_matrix,
'- r': reversed_complement_matrix
}
else:
return {
'+ f': matrix,
'- r': reversed_complement_matrix
}

@staticmethod
# Generate random sequences for p_value
Expand Down Expand Up @@ -475,14 +484,14 @@ def is_dna(dna_sequence):

@staticmethod
# Find with JASPAR and manual matrix
def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None):
def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None, alldirection=None):
if calc_pvalue is not None:
if calc_pvalue not in ["ATGCPreset", "ATGCProportion"]:
raise ValueError("Use 'ATGCPreset' or 'ATGCProportion'")

individual_motif_occurrences = []

matrices = IMO.transform_matrix(matrix)
matrices = IMO.transform_matrix(matrix, alldirection)

seq_length = len(matrices['+ f']['A'])

Expand Down Expand Up @@ -562,6 +571,55 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc

random_scores = np.array(matrix_random_scores)

# p_val = []
# for score_pval in np.arange(0.50, 1, 0.001):
# p_val_test = (random_scores >= score_pval).sum() / len(random_scores)
# row = [score_pval, p_val_test]
# p_val.append(row)
# headpval = ["Score_pvalue", "p_value"]
# p_val.insert(0, headpval)
#
# p_value_df = pd.DataFrame(p_val[1:], columns=p_val[0])
# st.dataframe(p_value_df)
#
# filtered_df = p_value_df[p_value_df['p_value'] > 0]
#
# scores = filtered_df['Score_pvalue'].values
# p_values = filtered_df['p_value'].values
#
# log_p_values = np.log(p_values)
# model = LinearRegression()
# model.fit(scores.reshape(-1, 1), log_p_values)
#
# fitted_log_values = model.predict(scores.reshape(-1, 1))
# fitted_values = np.exp(fitted_log_values)
# filtered_df['fitted'] = np.maximum(fitted_values, 0)
#
# # RMSE
# rmse = np.sqrt(mean_squared_error(p_values, filtered_df['fitted']))
# st.write(f"RMSE : {rmse}")
#
# # Coef
# coef = model.coef_[0]
# intercept = model.intercept_
#
# # Standard curve
# equation = f"y = exp({intercept:.3f} + {coef:.3f}*x)"
# st.write(f"Standard curve : {equation}")
#
# scatter = alt.Chart(filtered_df).mark_point().encode(
# x='Score_pvalue',
# y='p_value'
# )
#
# line = alt.Chart(filtered_df).mark_line(color='red').encode(
# x='Score_pvalue',
# y='fitted'
# ).interactive()
#
# chart = scatter + line
# st.altair_chart(chart)

for i in range(len(dna_sequence) - seq_length + 1):
seq = dna_sequence[i:i + seq_length]
score = IMO.calculate_score(seq, matrix)
Expand Down Expand Up @@ -609,13 +667,19 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
if calc_pvalue is not None:
p_value = (random_scores >= normalized_score).sum() / len(random_scores)

# def evaluate_exponential(x, coef, intercept):
# return np.exp(intercept + coef * x)
#
# y_value = evaluate_exponential(normalized_score, coef, intercept)

row = [position]
if tss_ge_distance is not None:
row.append(tis_position)
row += [sequence_with_context,
"{:.6f}".format(normalized_score).ljust(12)]
if calc_pvalue is not None:
row.append("{:.3e}".format(p_value).ljust(12))
row.append("{:.6e}".format(p_value))
# row.append("{:.6e}".format(y_value))
row += [strand, direction, name, species, region]
individual_motif_occurrences.append(row)

Expand All @@ -634,6 +698,7 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
header += ["Sequence", "Rel Score"]
if calc_pvalue is not None:
header.append("p-value")
# header.append("p-value_pred")
header += ["Strand", "Direction", "Gene", "Species", "Region"]
individual_motif_occurrences.insert(0, header)
else:
Expand Down

0 comments on commit 71995a3

Please sign in to comment.