Skip to content

Commit

Permalink
Merge pull request #40 from Jumitti/beta
Browse files Browse the repository at this point in the history
improvement and preparation for upgrade
  • Loading branch information
Jumitti authored Nov 7, 2023
2 parents 5297b93 + fbe81be commit 46b2061
Show file tree
Hide file tree
Showing 6 changed files with 138 additions and 72 deletions.
Binary file added .streamlit/TFinder_logo_site.ico
Binary file not shown.
22 changes: 1 addition & 21 deletions TFinder-v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,28 +117,8 @@

streamlit_analytics.start_tracking()

# Credit rating
# Credit
st.sidebar.image("img/TFinder_logo_site.png")
# try:
# with open("ratings.pkl", "rb") as file:
# ratings = pickle.load(file)
# except FileNotFoundError:
# ratings = []
# rating = st.sidebar.slider("Rate it 😊 (1-5 ⭐)", 1, 5, 5)
# colrate1, colrate2 = st.sidebar.columns(2)
# with colrate1:
# submit_button = st.button("Submit Rating")
# if submit_button:
# st.balloons()
# ratings.append(rating)
# with open("ratings.pkl", "wb") as file:
# pickle.dump(ratings, file)
# st.toast("Thank you for rating the application!", icon='😍')
# st.balloons()
# average_rating = sum(ratings) / len(ratings) if ratings else 0
# num_ratings = len(ratings)
# with colrate2:
# st.write(f"{average_rating:.2f} ⭐ ({num_ratings} votes)")

# Help
st.sidebar.title("Help")
Expand Down
107 changes: 71 additions & 36 deletions navigation/aio.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import Any, List

import altair as alt
import pandas as pd
Expand All @@ -50,6 +49,34 @@
import pickle


def search_species_at_NCBI(query_species_name):
    """Query the NCBI Taxonomy browser for *query_species_name* and return
    a list of matching scientific names (empty when nothing matches)."""

    def _strip_markup(text):
        # Remove residual HTML tags (and the space preceding them) from a name.
        return re.sub(r' <[^>]+>', '', text)

    matches = []

    with st.spinner("Searching for species..."):
        url = (
            'https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi'
            f'?name="{query_species_name}"&srchmode=3&filter=genome_filter'
        )

        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        result_list = soup.find("ul")
        taxonomy_id_tag = soup.find(text="Taxonomy ID:")

        if result_list:
            # Multiple candidates: each scientific name is wrapped in <strong>.
            for strong_tag in result_list.find_all("strong"):
                matches.append(_strip_markup(strong_tag.get_text()))

        elif taxonomy_id_tag:
            # Single exact hit: NCBI went straight to the taxon page,
            # so the query itself is the accepted name.
            matches.append(query_species_name)

    return matches


def email(excel_file, csv_file, txt_output, email_receiver, body, jaspar):
try:
current_date_time = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
Expand Down Expand Up @@ -135,8 +162,9 @@ def result_table_output(df):
y=alt.Y('Rel Score:Q', axis=alt.Axis(title='Relative Score'),
scale=alt.Scale(domain=[ystart, ystop])),
color=alt.condition(gene_region_selection, color_scale, alt.value('lightgray')),
tooltip=['Position'] + (['Rel Position'] if "Rel Position" in source else []) + ['Rel Score'] + (
['p-value'] if 'p-value' in source else []) + ['Sequence', 'Gene', 'Species', 'Region'],
tooltip=['Sequence', 'Position'] + (['Rel Position'] if "Rel Position" in source else []) + ['Rel Score'] + (
['p-value'] if 'p-value' in source else []) + (
['LCS', 'LCS length', 'LCS Rel Score'] if "LCS" in source else []) + ['Gene', 'Species', 'Region'],
opacity=alt.condition(gene_region_selection, alt.value(0.8), alt.value(0.2))
).transform_calculate(x=f'datum[{xcol_param.name}]').properties(width=600,
height=400).interactive().add_params(
Expand Down Expand Up @@ -165,24 +193,6 @@ def graph_threeD(df):
st.plotly_chart(fig, theme=None, use_container_width=True)


@st.cache_resource
def taxo():
    """Load the bundled per-letter species lists from disk.

    Cached by Streamlit so the pickle is deserialized only once per session.
    """
    with open("utils/species.pkl", "rb") as fh:
        return pickle.load(fh)


def search_taxo(searchterm: str) -> List[str]:
    """Return cached species names containing *searchterm* (case-insensitive).

    Species are pre-bucketed in the cached pickle by first letter under keys
    like ``species_a``, so only one bucket is scanned per query.

    Fixes vs. the previous version: an empty search term now returns no
    matches instead of raising ``IndexError`` on ``searchterm[0]``; the
    return annotation uses ``str`` rather than the builtin ``any``; the
    duplicated "Species retrieved" toast fires only once; and the query is
    lower-cased once instead of on every comparison.
    """
    if not searchterm:
        return []
    st.toast('Searching species... ⏳')
    species_lists = taxo()
    term_lower = searchterm.lower()
    # Bucket key is derived from the query's first (lower-cased) letter.
    species_set = species_lists.get(f"species_{term_lower[0]}", set())
    matches = [species for species in species_set if term_lower in species]
    st.toast('Species retrieved 😀')
    return matches


def aio_page():
st.subheader(':blue[Step 1] Promoter and Terminator Extractor')
colprom1, colprom2 = st.columns([0.8, 1.2], gap="small")
Expand Down Expand Up @@ -235,12 +245,18 @@ def aio_page():
["Human", "Mouse", "Rat", "Drosophila", "Zebrafish"], index=0,
label_visibility='collapsed')

# species = st_searchbox(
# search_species_at_NCBI,
# key="search_taxo", delay=0.25
# )

with col2:
all_variants = st.toggle('All variant')

# Upstream/Downstream Promoter
st.markdown("🔹 :blue[**Step 1.3**] Regulatory region:")
prom_term = st.radio("🔹 :blue[**Step 1.3**] Regulatory region:", ('Promoter', 'Terminator'),
horizontal=True,
label_visibility='collapsed')
if prom_term == 'Promoter':
st.markdown("🔹 :blue[**Step 1.4**] Upstream/downstream from the TSS (bp)")
Expand Down Expand Up @@ -284,7 +300,7 @@ def aio_page():
if not str(result_promoter_output).startswith('P'):
pbar.progress((i + 1) / len(gene_ids),
text=f'**:blue[Extract sequence... {gene_id}] ⚠️:red[PLEASE WAIT UNTIL END WITHOUT CHANGING ANYTHING]**')
st.toast(f'{prom_term} **{gene_id}** from **{species}** extracted', icon='🧬')
st.toast(f"{prom_term} **{gene_id}** from **{species}** extracted", icon='🧬')

result_promoter.append(result_promoter_output)
else:
Expand Down Expand Up @@ -723,6 +739,23 @@ def aio_page():
else:
calc_pvalue = None

# lcs = st.toggle('LCS')
# if lcs:
# max_variant_allowed = 1048576
# num_positions = len(next(iter(matrix.values())))
# total_sequences_pwm = 1
# for pos in range(num_positions):
# valid_probabilities = [matrix[nuc][pos] for nuc in matrix if matrix[nuc][pos] > 0]
# total_sequences_pwm *= len(valid_probabilities)
# if total_sequences_pwm > max_variant_allowed:
# st.error(
# f'Too many sequences. LCS not allowed for this PWM. Limit: {max_variant_allowed} | Total sequences : {total_sequences_pwm}')
# button = True
# else:
# button = False
# else:
# lcs = None

if tss_ge_input != 0:
tss_ge_distance = int(tss_ge_input)
else:
Expand Down Expand Up @@ -803,19 +836,21 @@ def aio_page():
mime="application/vnd.ms-excel", key='download-excel')
st.download_button(label="💾 Download table (.csv)", data=csv_file,
file_name=f"Results_TFinder_{current_date_time}.csv", mime="text/csv")
email_receiver = st.text_input('Send results by email ✉',
value='', placeholder='Send results by email ✉',
label_visibility="collapsed")
if st.button("Send ✉"):
if jaspar == 'PWM':
if matrix_type == 'With PWM':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
if matrix_type == 'With FASTA sequences':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{individual_motif}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
elif jaspar == 'JASPAR_ID':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nJASPAR_ID: {jaspar_id} | Transcription Factor name: {TF_name}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
else:
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{IUPAC}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
email(excel_file, csv_file, txt_output, email_receiver, body, jaspar)

if st.session_state["LOCAL"] == 'True':
email_receiver = st.text_input('Send results by email ✉',
value='', placeholder='Send results by email ✉',
label_visibility="collapsed")
if st.button("Send ✉"):
if jaspar == 'PWM':
if matrix_type == 'With PWM':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
if matrix_type == 'With FASTA sequences':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{individual_motif}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
elif jaspar == 'JASPAR_ID':
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nJASPAR_ID: {jaspar_id} | Transcription Factor name: {TF_name}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
else:
body = f"Hello 🧬\n\nResults obtained with TFinder.\n\nResponsive Elements:\n{IUPAC}\n\nPosition Weight Matrix:\n{matrix_text}\n\nThis email also includes the sequences used in FASTA format and an Excel table of results.\n\nFor all requests/information, please refer to the 'Contact' tab on the TFinder website. We would be happy to answer all your questions.\n\nBest regards\nTFinder Team 🔎🧬"
email(excel_file, csv_file, txt_output, email_receiver, body, jaspar)
else:
st.error(f"No consensus sequence found with the specified threshold")
5 changes: 2 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Bio==1.5.9
logomaker==0.8
openpyxl==3.1.2
sendgrid==6.10.0
Pillow==9.5.0
pillow
stqdm==0.0.5
tqdm==4.66.1
streamlit-modal==0.1.0
Expand All @@ -47,5 +47,4 @@ altair==5.0.1
matplotlib==3.7.1
future==0.18.3
pyperclip==1.8.2
tabulate==0.9.0
streamlit-searchbox
tabulate==0.9.0
18 changes: 15 additions & 3 deletions streamlit_searchbox/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,13 @@
import streamlit as st
import streamlit.components.v1 as components

try:
from streamlit import rerun as rerun # type: ignore
except ImportError:
# conditional import for streamlit version <1.27
from streamlit import experimental_rerun as rerun # type: ignore


# point to build directory
parent_dir = os.path.dirname(os.path.abspath(__file__))
build_dir = os.path.join(parent_dir, "frontend/build")
Expand Down Expand Up @@ -68,7 +75,7 @@ def _process_search(
search_function: Callable[[str], List[Any]],
key: str,
searchterm: str,
rerun_on_update: bool,
rerun_on_update: bool
) -> None:
# nothing changed, avoid new search
if searchterm == st.session_state[key]["search"]:
Expand All @@ -84,7 +91,7 @@ def _process_search(
st.session_state[key]["options_py"] = _list_to_options_py(search_results)

if rerun_on_update:
st.rerun()
rerun()


@wrap_inactive_session
Expand All @@ -95,7 +102,7 @@ def st_searchbox(
default: Any = None,
default_options: List[Any] | None = None,
clear_on_submit: bool = False,
rerun_on_update: bool = True,
rerun_on_update: bool = True, delay: float = None,
key: str = "searchbox",
**kwargs,
) -> Any:
Expand All @@ -120,6 +127,7 @@ def st_searchbox(
Returns:
any: based on user selection
:param key:
:param delay:
:param rerun_on_update:
:param clear_on_submit:
:param default:
Expand All @@ -129,6 +137,10 @@ def st_searchbox(
:param default_options:
"""

# delay request (useful if you have a limit of request API)
if delay:
time.sleep(delay)

# key without prefix used by react component
key_react = f"{key}_react"

Expand Down
58 changes: 49 additions & 9 deletions tfinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,7 +451,8 @@ def is_dna(dna_sequence):

@staticmethod
# Find with JASPAR and manual matrix
def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None):
def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc_pvalue=None, tss_ge_distance=None,
lcs=None):
if calc_pvalue is not None:
if calc_pvalue not in ["ATGCPreset", "ATGCProportion"]:
raise ValueError("Use 'ATGCPreset' or 'ATGCProportion'")
Expand All @@ -474,10 +475,11 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc

random_sequences = IMO.generate_ranseq(probabilities, seq_length, progress_bar, num_random_seqs)

if calc_pvalue == 'ATGCPreset':
random_scores = {}
matrix_random_scores = []
for matrix_name, matrix in matrices.items():
random_scores = {}
matrix_random_scores = []
LCS = {}
for matrix_name, matrix in matrices.items():
if calc_pvalue == 'ATGCPreset':
max_score = sum(max(matrix[base][i] for base in matrix.keys()) for i in range(seq_length))
min_score = sum(min(matrix[base][i] for base in matrix.keys()) for i in range(seq_length))
for random_sequence in random_sequences:
Expand All @@ -489,8 +491,11 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
normalized_random_score = (random_score - min_score) / (max_score - min_score)
matrix_random_scores.append(normalized_random_score)
progress_bar.update(1)
random_scores = np.array(matrix_random_scores)

random_scores = np.array(matrix_random_scores)
if lcs is not None:
generated_sequences = IMO.generate_sequences(matrix)
LCS[matrix_name] = generated_sequences

for name, dna_sequence, species, region in dna_sequences:
if calc_pvalue == 'ATGCProportion':
Expand Down Expand Up @@ -584,6 +589,28 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
tis_position = position - tss_ge_distance

if normalized_score >= threshold:

if lcs is not None:
seqint = sequence_with_context[3:-3]
best_lcs_continuous = ""
best_lcs_for_relscore = ""
sequences_for_matrix_name = LCS[matrix_name]

for seqref in sequences_for_matrix_name:
lcs_continuous, lcs_for_relscore = IMO.LCScontinuous(seqint, seqref)

if len(lcs_continuous) > len(best_lcs_continuous):
best_lcs_continuous = lcs_continuous
score_lcs_continuous = len(best_lcs_continuous)
best_lcs_for_relscore = lcs_for_relscore

lcs_rel_score = IMO.calculate_score(best_lcs_for_relscore, matrix)
if max_score == min_score:
lcs_normalized_score = lcs_rel_score / max_score
else:
lcs_normalized_score = (lcs_rel_score - min_score) / (
max_score - min_score)

if calc_pvalue is not None:
p_value = (random_scores >= normalized_score).sum() / len(random_scores)

Expand All @@ -594,20 +621,33 @@ def individual_motif_finder(dna_sequences, threshold, matrix, progress_bar, calc
"{:.6f}".format(normalized_score).ljust(12)]
if calc_pvalue is not None:
row.append("{:.3e}".format(p_value).ljust(12))
if lcs is not None:
row += [best_lcs_continuous, str(score_lcs_continuous).ljust(15),
"{:.6f}".format(lcs_normalized_score).ljust(12)]
row += [strand, direction, name, species, region]
individual_motif_occurrences.append(row)

if len(individual_motif_occurrences) > 0:
if tss_ge_distance is not None:
individual_motif_occurrences.sort(key=lambda x: float(x[3]), reverse=True)
if tss_ge_distance is not None and calc_pvalue is not None and lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[3]), x[6], x[7]), reverse=True)
elif tss_ge_distance is not None and lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[3]), x[5], x[6]), reverse=True)
elif calc_pvalue is not None and lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[2]), x[5], x[6]), reverse=True)
elif lcs is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[2]), x[4], x[5]), reverse=True)
elif tss_ge_distance is not None:
individual_motif_occurrences.sort(key=lambda x: (float(x[3])), reverse=True)
else:
individual_motif_occurrences.sort(key=lambda x: float(x[2]), reverse=True)
individual_motif_occurrences.sort(key=lambda x: (float(x[2])), reverse=True)
header = ["Position"]
if tss_ge_distance is not None:
header.append("Rel Position")
header += ["Sequence", "Rel Score"]
if calc_pvalue is not None:
header.append("p-value")
if lcs is not None:
header += ["LCS", 'LCS length', 'LCS Rel Score']
header += ["Strand", "Direction", "Gene", "Species", "Region"]
individual_motif_occurrences.insert(0, header)
else:
Expand Down

0 comments on commit 46b2061

Please sign in to comment.