diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index 24229126..b89fd887 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -1,16 +1,16 @@ -from random import random -import pandas as pd -import sklearn +import itertools import logging + import numpy as np +import pandas as pd +import sklearn import sklearn.ensemble import sklearn.impute -from alphastats.utils import ignore_warning -from sklearn.experimental import enable_iterative_imputer -import itertools - import streamlit as st +from sklearn.experimental import enable_iterative_imputer +from alphastats.utils import ignore_warning + class Preprocess: def _remove_sampels(self, sample_list: list): @@ -31,9 +31,14 @@ def preprocess_print_info(self): print(pd.DataFrame(self.preprocessing_info.items())) def _remove_na_values(self, cut_off): - if self.preprocessing_info.get("Missing values were removed") and self.preprocessing_info.get("Data completeness cut-off") == cut_off: + if ( + self.preprocessing_info.get("Missing values were removed") + and self.preprocessing_info.get("Data completeness cut-off") == cut_off + ): logging.info("Missing values have already been filtered.") - st.warning("Missing values have already been filtered. To apply another cutoff, reset preprocessing.") + st.warning( + "Missing values have already been filtered. To apply another cutoff, reset preprocessing." + ) return cut = 1 - cut_off @@ -59,25 +64,25 @@ def _remove_na_values(self, cut_off): self.preprocessing_info.update( { - "Number of removed ProteinGroups due to data completeness cutoff": num_proteins - self.mat.shape[1], + "Number of removed ProteinGroups due to data completeness cutoff": num_proteins + - self.mat.shape[1], "Missing values were removed": True, "Data completeness cut-off": cut_off, } ) - def _filter(self): if len(self.filter_columns) == 0: logging.info("No columns to filter.") return - if self.preprocessing_info.get("Contaminations have been removed") == True: + if self.preprocessing_info.get("Contaminations have been removed"): logging.info("Contaminatons have already been filtered.") return #  print column names with contamination protein_groups_to_remove = self.rawinput[ - (self.rawinput[self.filter_columns] == True).any(axis=1) + self.rawinput[self.filter_columns].any(axis=1) ][self.index_column].tolist() protein_groups_to_remove = list( @@ -186,10 +191,11 @@ def _linear_normalization(self, array): @ignore_warning(UserWarning) @ignore_warning(RuntimeWarning) def _normalization(self, method: str): - if method == "zscore": scaler = sklearn.preprocessing.StandardScaler() - normalized_array = scaler.fit_transform(self.mat.values.transpose()).transpose() + normalized_array = scaler.fit_transform( + self.mat.values.transpose() + ).transpose() elif method == "quantile": qt = sklearn.preprocessing.QuantileTransformer(random_state=0) @@ -268,7 +274,6 @@ def batch_correction(self, batch: str): Args: batch (str): column name in the metadata describing the different batches """ - import combat from combat.pycombat import pycombat data = self.mat.transpose() diff --git a/alphastats/gui/pages/02_Import Data.py b/alphastats/gui/pages/02_Import Data.py index a700f428..1e6322b1 100644 --- a/alphastats/gui/pages/02_Import Data.py +++ b/alphastats/gui/pages/02_Import Data.py @@ -1,26 +1,30 @@ -import streamlit as st -import sys -import os import io +import os + +import streamlit as st try: - from alphastats.gui.utils.ui_helper import sidebar_info - from alphastats.gui.utils.analysis_helper import * + from alphastats.DataSet import DataSet + from alphastats.gui.utils.analysis_helper import ( + get_sample_names_from_software_file, + read_uploaded_file_into_df, + ) from alphastats.gui.utils.software_options import software_options + from alphastats.gui.utils.ui_helper import sidebar_info from alphastats.loader.MaxQuantLoader import MaxQuantLoader - from alphastats.DataSet import DataSet except ModuleNotFoundError: from utils.ui_helper import sidebar_info - from utils.analysis_helper import * + from utils.analysis_helper import ( + get_sample_names_from_software_file, + read_uploaded_file_into_df, + ) from utils.software_options import software_options from alphastats import MaxQuantLoader from alphastats import DataSet - import pandas as pd import plotly.express as px - from streamlit.runtime import get_instance from streamlit.runtime.scriptrunner.script_run_context import get_script_run_ctx @@ -56,7 +60,7 @@ def check_software_file(df, software): if software == "MaxQuant": expected_columns = ["Protein IDs", "Reverse", "Potential contaminant"] - if (set(expected_columns).issubset(set(df.columns.to_list()))) == False: + if not set(expected_columns).issubset(set(df.columns.to_list())): st.error( "This is not a valid MaxQuant file. Please check:" "http://www.coxdocs.org/doku.php?id=maxquant:table:proteingrouptable" @@ -71,7 +75,7 @@ def check_software_file(df, software): "Protein.Group", ] - if (set(expected_columns).issubset(set(df.columns.to_list()))) == False: + if not set(expected_columns).issubset(set(df.columns.to_list())): st.error("This is not a valid DIA-NN file.") elif software == "Spectronaut": @@ -79,12 +83,12 @@ def check_software_file(df, software): "PG.ProteinGroups", ] - if (set(expected_columns).issubset(set(df.columns.to_list()))) == False: + if not set(expected_columns).issubset(set(df.columns.to_list())): st.error("This is not a valid Spectronaut file.") elif software == "FragPipe": expected_columns = ["Protein"] - if (set(expected_columns).issubset(set(df.columns.to_list()))) == False: + if not set(expected_columns).issubset(set(df.columns.to_list())): st.error( "This is not a valid FragPipe file. Please check:" "https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html#combined_proteintsv" @@ -145,7 +149,6 @@ def select_sample_column_metadata(df, software): for col in df.columns.to_list(): if bool(set(samples_proteomics_data) & set(df[col].to_list())): - print("comparing lengths", len(samples_proteomics_data), len(df[col].to_list())) valid_sample_columns.append(col) if len(valid_sample_columns) == 0: @@ -155,16 +158,18 @@ def select_sample_column_metadata(df, software): ) st.write( - f"Select column that contains sample IDs matching the sample names described " + "Select column that contains sample IDs matching the sample names described " + f"in {software_options.get(software).get('import_file')}" ) with st.form("sample_column"): st.selectbox("Sample Column", options=valid_sample_columns, key="sample_column") submitted = st.form_submit_button("Create DataSet") - + if submitted: - if len(df[st.session_state.sample_column].to_list()) != len(df[st.session_state.sample_column].unique()): + if len(df[st.session_state.sample_column].to_list()) != len( + df[st.session_state.sample_column].unique() + ): st.error("Sample names have to be unique.") st.stop() return True @@ -212,8 +217,6 @@ def create_metadata_file(): with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer: # Write each dataframe to a different worksheet. metadata.to_excel(writer, sheet_name="Sheet1", index=False) - # Close the Pandas Excel writer and output the Excel file to the buffer - # writer.close() st.download_button( label="Download metadata template as Excel", @@ -249,14 +252,8 @@ def upload_metadatafile(software): sample_column=st.session_state.sample_column, ) st.session_state["metadata_columns"] = metadatafile_df.columns.to_list() - # if len(st.session_state["dataset"].metadata[self.sample].tolist()) != len(self.metadata[self.sample].unique()): - # st.error("Sample names have to be unique.") - - load_options() - # display_loaded_dataset() - if st.session_state.loader is not None: create_metadata_file() st.write( @@ -272,8 +269,6 @@ def upload_metadatafile(software): load_options() - # display_loaded_dataset() - def load_sample_data(): _this_file = os.path.abspath(__file__) @@ -319,7 +314,6 @@ def import_data(): options=options, key="software", ) - session_state_empty = False if st.session_state.software != "" - from streamlit.runtime import get_instance from streamlit.runtime.scriptrunner.script_run_context import get_script_run_ctx user_session_id = get_script_run_ctx().session_id @@ -379,8 +372,6 @@ def empty_session_state(): sidebar_info() -# import_data() - if "dataset" not in st.session_state: st.markdown("### Import Proteomics Data") @@ -389,7 +380,6 @@ def empty_session_state(): "Create a DataSet with the output of your proteomics software package and the corresponding metadata (optional). " ) - # import_data() import_data() if "dataset" in st.session_state: diff --git a/alphastats/gui/pages/03_Preprocessing.py b/alphastats/gui/pages/03_Preprocessing.py index f55374c5..8a927ccb 100644 --- a/alphastats/gui/pages/03_Preprocessing.py +++ b/alphastats/gui/pages/03_Preprocessing.py @@ -12,7 +12,7 @@ def preprocessing(): st.markdown( "Before analyzing your data, consider normalizing and imputing your data as well as the removal of contaminants. " - + "A more detailed description about the preprocessing methods can be found in the AlphaPeptStats " + + "A more detailed description about the preprocessing methods can be found in the AlphaPeptStats " + "[documentation](https://alphapeptstats.readthedocs.io/en/main/data_preprocessing.html)." ) @@ -30,23 +30,17 @@ def preprocessing(): ) remove_samples = st.multiselect( - "Remove samples from analysis", - options=st.session_state.dataset.metadata[ - st.session_state.dataset.sample - ].to_list(), + "Remove samples from analysis", + options=st.session_state.dataset.metadata[st.session_state.dataset.sample].to_list() ) data_completeness = st.number_input( f"Data completeness across samples cut-off \n(0.7 -> protein has to be detected in at least 70% of the samples)", - value=0.0, - min_value=0.0, - max_value=1.0, - step=0.1, + value=0, min_value=0, max_value=1 ) log2_transform = st.selectbox( - "Log2-transform dataset", - options=[True, False], + "Log2-transform dataset", options=[True, False], ) normalization = st.selectbox( @@ -62,64 +56,57 @@ def preprocessing(): if submitted: if len(remove_samples) == 0: remove_samples = None - + st.session_state.dataset.preprocess( remove_contaminations=remove_contaminations, log2_transform=log2_transform, - remove_samples=remove_samples, + remove_samples = remove_samples, data_completeness=data_completeness, subset=subset, normalization=normalization, imputation=imputation, ) - - st.session_state["preprocessing_info"] = st.session_state.dataset.preprocessing_info - - - if submitted or "preprocessing_info" in st.session_state: - st.info( + preprocessing = st.session_state.dataset.preprocessing_info + st.info( "Data has been processed. " + datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") ) - st.dataframe( - pd.DataFrame.from_dict(st.session_state["preprocessing_info"], orient="index").astype(str), - use_container_width=True, - ) - with c2: - - if submitted: - st.markdown("**Intensity Distribution after preprocessing per sample**") - fig_processed = st.session_state.dataset.plot_sampledistribution() - st.plotly_chart( - fig_processed.update_layout(plot_bgcolor="white"), - use_container_width=True, - ) - - else: - st.markdown("**Intensity Distribution per sample**") - fig_none_processed = st.session_state.dataset.plot_sampledistribution() - st.plotly_chart( - fig_none_processed.update_layout(plot_bgcolor="white"), + st.dataframe( + pd.DataFrame.from_dict(preprocessing, orient="index").astype(str), use_container_width=True, ) - c1, c2 = st.columns(2) - with c1: + st.markdown("#### Batch correction: correct for technical bias") with st.form("Batch correction: correct for technical bias"): batch = st.selectbox( - "Batch", options=st.session_state.dataset.metadata.columns.to_list() + "Batch", + options= st.session_state.dataset.metadata.columns.to_list() ) submit_batch_correction = st.form_submit_button("Submit") - + if submit_batch_correction: - st.session_state.dataset.batch_correction(batch=batch) + st.session_state.dataset.batch_correction( + batch=batch + ) st.info( "Data has been processed. " + datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") ) + with c2: + + if submitted: + st.markdown("**Intensity Distribution after preprocessing per sample**") + fig_processed = st.session_state.dataset.plot_sampledistribution() + st.plotly_chart(fig_processed.update_layout(plot_bgcolor="white"), use_container_width=True) + + else: + st.markdown("**Intensity Distribution per sample**") + fig_none_processed = st.session_state.dataset.plot_sampledistribution() + st.plotly_chart(fig_none_processed.update_layout(plot_bgcolor="white"), use_container_width=True) + reset_steps = st.button("Reset all Preprocessing steps") @@ -129,12 +116,14 @@ def preprocessing(): def reset_preprocessing(): st.session_state.dataset.create_matrix() + preprocessing = st.session_state.dataset.preprocessing_info st.info( "Data has been reset. " + datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S") ) - st.session_state["preprocessing_info"] = st.session_state.dataset.preprocessing_info - # reset the page - st.rerun() + st.dataframe( + pd.DataFrame.from_dict(preprocessing, orient="index").astype(str), + use_container_width=True, + ) def main_preprocessing(): @@ -156,4 +145,4 @@ def main_preprocessing(): def plot_intensity_distribution(): st.selectbox( "Sample", options=st.session_state.dataset.metadata["sample"].to_list() - ) + ) \ No newline at end of file diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index 559e6161..4aef9607 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -270,9 +270,9 @@ def test_preprocess_normalize_zscore(self): self.obj.preprocess(log2_transform=False, normalization="zscore") expected_mat = pd.DataFrame( { - "a": [-1.33630621, 1.06904497, 0.26726124], - "b": [1.41421356, -0.70710678, -0.70710678], - "c": [-1.38873015, 0.9258201, 0.46291005], + "a": [-0.162221, -0.508001, -0.707107], + "b": [1.297771, -0.889001, -0.707107], + "c": [-1.135550, 1.397001, 1.414214], } ) pd._testing.assert_frame_equal(self.obj.mat, expected_mat) @@ -282,7 +282,9 @@ def test_preprocess_normalize_quantile(self): # Quantile Normalization self.obj.preprocess(log2_transform=False, normalization="quantile") expected_mat = pd.DataFrame( - {"a": [0.0, 1.0, 0.5], "b": [1.0, 0.0, 0.0], "c": [0.0, 1.0, 0.5]} + {"a": [0.5, 0.5, 0.0], + "b": [1.0, 0.0, 0.0], + "c": [0.0, 1.0, 1.0]} ) pd._testing.assert_frame_equal(self.obj.mat, expected_mat) @@ -306,9 +308,9 @@ def test_preprocess_normalize_vst(self): self.obj.preprocess(log2_transform=False, normalization="vst") expected_mat = pd.DataFrame( { - "a": [-1.307734, 1.120100, 0.187634], - "b": [ 1.414214, -0.707107, -0.707107], - "c": [-1.360307, 1.015077, 0.345230], + "a": [-0.009526, -0.236399, -0.707107], + "b": [ 1.229480, -1.089313, -0.707107], + "c": [-1.219954, 1.325712, 1.414214], } ) pd._testing.assert_frame_equal(self.obj.mat.round(2), expected_mat.round(2)) @@ -507,7 +509,7 @@ def test_plot_intenstity_subgroup_significance_warning(self, mock): ) plot_dict = plot.to_plotly_json() self.assertEqual(len(plot_dict.get("data")), 5) - mock.assert_called_once() + self.assertEqual(mock.call_count, 2) def test_anova_with_tukey(self): # with first 100 protein ids @@ -577,8 +579,8 @@ def test_plot_volcano_sam(self): ) # fdr lines get drawn - line_1 = plot.to_plotly_json()["data"][3].get("line").get("shape") - line_2 = plot.to_plotly_json()["data"][4].get("line").get("shape") + line_1 = plot.to_plotly_json()["data"][-2].get("line").get("shape") + line_2 = plot.to_plotly_json()["data"][-1].get("line").get("shape") self.assertEqual(line_1, "spline") self.assertEqual(line_2, "spline") @@ -739,10 +741,10 @@ def test_plot_samplehistograms(self): self.assertEqual(312, len(fig["data"])) def test_batch_correction(self): - self.obj.preprocess(subset=True, imputation="knn", normalization="quantile") + self.obj.preprocess(subset=True, imputation="knn", normalization="linear") self.obj.batch_correction(batch="batch_artifical_added") first_value = self.obj.mat.values[0, 0] - self.assertAlmostEqual(0.0111, first_value, places=2) + self.assertAlmostEqual(-0.00555, first_value, places=3) def test_multicova_analysis_invalid_covariates(self): self.obj.preprocess(imputation="knn", normalization="zscore", subset=True)