diff --git a/pages/4_T-SNE_for_smiles_app3.py b/pages/4_T-SNE_for_smiles_app3.py new file mode 100644 index 0000000..6f82a36 --- /dev/null +++ b/pages/4_T-SNE_for_smiles_app3.py @@ -0,0 +1,138 @@ +import streamlit as st +import pandas as pd +import numpy as np +from rdkit import Chem +from rdkit.Chem import Descriptors, Draw +from sklearn.manifold import TSNE +from bokeh.plotting import figure +from bokeh.models import ColumnDataSource, HoverTool, CustomJS +from bokeh.palettes import Category10 +from bokeh.transform import factor_cmap +import io +import base64 + +# Streamlit app title +st.title("t-SNE Plot of Molecules") +st.markdown("*This Streamlit app* allows user to upload **any** :blue-background[CSV file] containing a column named ***SMILES*** and a label column named ***Type***. It generates a t-SNE plot (https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding) which shows the chemical space calculated on a selection of most common RDKIT 2D descriptors. Plot colors are dependent on ***Type*** column indicating the groups.") + +def plot_molecule(smiles): + try: + mol = Chem.MolFromSmiles(str(smiles)) + if mol is not None: + img = Draw.MolToImage(mol, size=(200, 200)) + return img + else: + return None + except: + return None + +def get_molecule_image_src(smiles): + img = plot_molecule(smiles) + if img: + buffered = io.BytesIO() + img.save(buffered, format="PNG") + img_str = base64.b64encode(buffered.getvalue()).decode() + return f"data:image/png;base64,{img_str}" + return "" + +# File uploader +uploaded_file = st.file_uploader("Choose a CSV file", type="csv") + +if uploaded_file is not None: + # Read the CSV file + df = pd.read_csv(uploaded_file) + + # Check if required columns exist + if 'SMILES' not in df.columns or 'Type' not in df.columns: + st.error("The CSV file must contain 'SMILES' and 'Type' columns.") + else: + # Continue with the analysis + st.success("File uploaded successfully. Generating t-SNE plot...") + + # Convert SMILES to RDKit molecules + mols = [Chem.MolFromSmiles(smiles) for smiles in df['SMILES']] + + # Convert SMILES column to string if it exists + if 'SMILES' in df.columns: + df['SMILES'] = df['SMILES'].astype(str) + df['molecule_img'] = df['SMILES'].apply(get_molecule_image_src) + + # Calculate 2D RDKit descriptors + desc_names = ['MolWt', 'FractionCSP3', 'HeavyAtomCount', 'NumAliphaticCarbocycles', + 'NumAliphaticHeterocycles', 'NumAliphaticRings', 'NumAromaticCarbocycles', + 'NumAromaticHeterocycles', 'NumAromaticRings', 'NumHAcceptors', + 'NumHDonors', 'NumHeteroatoms', 'NumRotatableBonds', 'NumSaturatedCarbocycles', + 'NumSaturatedHeterocycles', 'NumSaturatedRings', 'RingCount', + 'MolLogP', 'MolMR'] + + desc_functions = [(name, getattr(Descriptors, name)) for name in desc_names] + calc = lambda m: [func(m) for _, func in desc_functions] + descriptors = [calc(mol) for mol in mols] + descriptors_df = pd.DataFrame(descriptors, columns=desc_names) + + # Perform t-SNE + tsne = TSNE(n_components=2, random_state=42, perplexity=30, n_iter=500) + tsne_results = tsne.fit_transform(descriptors_df) + + # Create a Bokeh ColumnDataSource + source = ColumnDataSource(data=dict( + x=tsne_results[:, 0], + y=tsne_results[:, 1], + smiles=df['SMILES'], + type=df['Type'], + molecule_img=df['molecule_img'] + )) + + # Create Bokeh figure + p = figure(width=800, height=600, title="t-SNE Plot of 2D RDKit Descriptors") + + # Create a color mapper + colors = Category10[10][:len(df['Type'].unique())] + color_mapper = factor_cmap('type', palette=colors, factors=df['Type'].unique()) + + # Create the scatter plot + scatter = p.scatter('x', 'y', source=source, color=color_mapper, legend_field='type', size=10, alpha=0.8) + + # Create tooltip HTML + tooltip_html = """ +