Commit
CHORE: ruff
Mikhail Lebedev committed May 17, 2024
1 parent 18ff81b commit 1c64535
Showing 4 changed files with 95 additions and 109 deletions.
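
The two diffs shown in full below are mechanical lint and format fixes: imports are sorted and deduplicated, unused imports and commented-out code are dropped, `== True`/`== False` comparisons are rewritten as truthiness checks (pycodestyle E712), f-strings without placeholders become plain strings (F541), and long lines are wrapped. A changeset like this is typically produced by running `ruff check --fix` followed by `ruff format` over the package; the exact ruff version and configuration used for this commit are not recorded in this view.
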
37 changes: 21 additions & 16 deletions alphastats/DataSet_Preprocess.py
@@ -1,16 +1,16 @@
-from random import random
-import pandas as pd
-import sklearn
import itertools
import logging

import numpy as np
+import pandas as pd
+import sklearn
import sklearn.ensemble
import sklearn.impute
-from alphastats.utils import ignore_warning
-from sklearn.experimental import enable_iterative_imputer
-import itertools

import streamlit as st

+from sklearn.experimental import enable_iterative_imputer
+from alphastats.utils import ignore_warning


class Preprocess:
def _remove_sampels(self, sample_list: list):
@@ -31,9 +31,14 @@ def preprocess_print_info(self):
print(pd.DataFrame(self.preprocessing_info.items()))

def _remove_na_values(self, cut_off):
-        if self.preprocessing_info.get("Missing values were removed") and self.preprocessing_info.get("Data completeness cut-off") == cut_off:
+        if (
+            self.preprocessing_info.get("Missing values were removed")
+            and self.preprocessing_info.get("Data completeness cut-off") == cut_off
+        ):
            logging.info("Missing values have already been filtered.")
-            st.warning("Missing values have already been filtered. To apply another cutoff, reset preprocessing.")
+            st.warning(
+                "Missing values have already been filtered. To apply another cutoff, reset preprocessing."
+            )
return
cut = 1 - cut_off

@@ -59,25 +64,25 @@ def _remove_na_values(self, cut_off):

        self.preprocessing_info.update(
            {
-                "Number of removed ProteinGroups due to data completeness cutoff": num_proteins - self.mat.shape[1],
+                "Number of removed ProteinGroups due to data completeness cutoff": num_proteins
+                - self.mat.shape[1],
                "Missing values were removed": True,
                "Data completeness cut-off": cut_off,
            }
        )

-
def _filter(self):
if len(self.filter_columns) == 0:
logging.info("No columns to filter.")
return

-        if self.preprocessing_info.get("Contaminations have been removed") == True:
+        if self.preprocessing_info.get("Contaminations have been removed"):
logging.info("Contaminatons have already been filtered.")
return

#  print column names with contamination
protein_groups_to_remove = self.rawinput[
-            (self.rawinput[self.filter_columns] == True).any(axis=1)
+            self.rawinput[self.filter_columns].any(axis=1)
][self.index_column].tolist()

protein_groups_to_remove = list(
@@ -186,10 +191,11 @@ def _linear_normalization(self, array):
@ignore_warning(UserWarning)
@ignore_warning(RuntimeWarning)
    def _normalization(self, method: str):
-
if method == "zscore":
scaler = sklearn.preprocessing.StandardScaler()
-            normalized_array = scaler.fit_transform(self.mat.values.transpose()).transpose()
+            normalized_array = scaler.fit_transform(
+                self.mat.values.transpose()
+            ).transpose()

elif method == "quantile":
qt = sklearn.preprocessing.QuantileTransformer(random_state=0)
@@ -268,7 +274,6 @@ def batch_correction(self, batch: str):
Args:
batch (str): column name in the metadata describing the different batches
"""
-        import combat
from combat.pycombat import pycombat

data = self.mat.transpose()
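
For context on the last hunk: batch_correction keeps only the pycombat import it actually uses. Below is a minimal sketch of the call pattern the method relies on, assuming the combat package from PyPI; the protein IDs, sample names, and batch labels are invented for illustration, and the features-as-rows orientation mirrors the data = self.mat.transpose() line above.

import pandas as pd
from combat.pycombat import pycombat

# Proteins as rows, samples as columns - the orientation pycombat expects.
data = pd.DataFrame(
    {
        "sample_1": [1.0, 2.0, 3.0],  # hypothetical intensities
        "sample_2": [1.2, 2.1, 2.9],
        "sample_3": [0.9, 1.8, 3.2],
        "sample_4": [1.1, 2.2, 3.1],
    },
    index=["proteinA", "proteinB", "proteinC"],
)
batch = [0, 0, 1, 1]  # one batch label per sample (column)

# Returns a DataFrame of the same shape with batch effects adjusted.
corrected = pycombat(data, batch)
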
58 changes: 24 additions & 34 deletions alphastats/gui/pages/02_Import Data.py
@@ -1,26 +1,30 @@
-import streamlit as st
-import sys
-import os
import io
+import os
+
+import streamlit as st

try:
-    from alphastats.gui.utils.ui_helper import sidebar_info
-    from alphastats.gui.utils.analysis_helper import *
-    from alphastats.DataSet import DataSet
+    from alphastats.gui.utils.analysis_helper import (
+        get_sample_names_from_software_file,
+        read_uploaded_file_into_df,
+    )
    from alphastats.gui.utils.software_options import software_options
+    from alphastats.gui.utils.ui_helper import sidebar_info
    from alphastats.loader.MaxQuantLoader import MaxQuantLoader
+    from alphastats.DataSet import DataSet

except ModuleNotFoundError:
    from utils.ui_helper import sidebar_info
-    from utils.analysis_helper import *
+    from utils.analysis_helper import (
+        get_sample_names_from_software_file,
+        read_uploaded_file_into_df,
+    )
    from utils.software_options import software_options
    from alphastats import MaxQuantLoader
    from alphastats import DataSet

import pandas as pd
import plotly.express as px

from streamlit.runtime import get_instance
from streamlit.runtime.scriptrunner.script_run_context import get_script_run_ctx

@@ -56,7 +60,7 @@ def check_software_file(df, software):

if software == "MaxQuant":
expected_columns = ["Protein IDs", "Reverse", "Potential contaminant"]
-        if (set(expected_columns).issubset(set(df.columns.to_list()))) == False:
+        if not set(expected_columns).issubset(set(df.columns.to_list())):
st.error(
"This is not a valid MaxQuant file. Please check:"
"http://www.coxdocs.org/doku.php?id=maxquant:table:proteingrouptable"
@@ -71,20 +75,20 @@ def check_software_file(df, software):
"Protein.Group",
]

-        if (set(expected_columns).issubset(set(df.columns.to_list()))) == False:
+        if not set(expected_columns).issubset(set(df.columns.to_list())):
st.error("This is not a valid DIA-NN file.")

elif software == "Spectronaut":
expected_columns = [
"PG.ProteinGroups",
]

-        if (set(expected_columns).issubset(set(df.columns.to_list()))) == False:
+        if not set(expected_columns).issubset(set(df.columns.to_list())):
st.error("This is not a valid Spectronaut file.")

elif software == "FragPipe":
expected_columns = ["Protein"]
-        if (set(expected_columns).issubset(set(df.columns.to_list()))) == False:
+        if not set(expected_columns).issubset(set(df.columns.to_list())):
st.error(
"This is not a valid FragPipe file. Please check:"
"https://fragpipe.nesvilab.org/docs/tutorial_fragpipe_outputs.html#combined_proteintsv"
@@ -145,7 +149,6 @@ def select_sample_column_metadata(df, software):

for col in df.columns.to_list():
if bool(set(samples_proteomics_data) & set(df[col].to_list())):
-            print("comparing lengths", len(samples_proteomics_data), len(df[col].to_list()))
valid_sample_columns.append(col)

if len(valid_sample_columns) == 0:
@@ -155,16 +158,18 @@ def select_sample_column_metadata(df, software):
)

    st.write(
-        f"Select column that contains sample IDs matching the sample names described "
+        "Select column that contains sample IDs matching the sample names described "
+ f"in {software_options.get(software).get('import_file')}"
)

with st.form("sample_column"):
st.selectbox("Sample Column", options=valid_sample_columns, key="sample_column")
submitted = st.form_submit_button("Create DataSet")

if submitted:
-            if len(df[st.session_state.sample_column].to_list()) != len(df[st.session_state.sample_column].unique()):
+            if len(df[st.session_state.sample_column].to_list()) != len(
+                df[st.session_state.sample_column].unique()
+            ):
st.error("Sample names have to be unique.")
st.stop()
return True
@@ -212,8 +217,6 @@ def create_metadata_file():
with pd.ExcelWriter(buffer, engine="xlsxwriter") as writer:
# Write each dataframe to a different worksheet.
        metadata.to_excel(writer, sheet_name="Sheet1", index=False)
-        # Close the Pandas Excel writer and output the Excel file to the buffer
-        # writer.close()

st.download_button(
label="Download metadata template as Excel",
@@ -249,14 +252,8 @@ def upload_metadatafile(software):
sample_column=st.session_state.sample_column,
)
            st.session_state["metadata_columns"] = metadatafile_df.columns.to_list()
-            # if len(st.session_state["dataset"].metadata[self.sample].tolist()) != len(self.metadata[self.sample].unique()):
-            #     st.error("Sample names have to be unique.")
-

    load_options()

-    # display_loaded_dataset()
-
if st.session_state.loader is not None:
create_metadata_file()
st.write(
@@ -272,8 +269,6 @@ def upload_metadatafile(software):

load_options()

-    # display_loaded_dataset()
-

def load_sample_data():
_this_file = os.path.abspath(__file__)
@@ -319,7 +314,6 @@ def import_data():
options=options,
key="software",
)
-    session_state_empty = False

if st.session_state.software != "<select>":
upload_softwarefile(software=st.session_state.software)
@@ -336,10 +330,10 @@ def display_loaded_dataset():
st.markdown(f"*Preview:* Raw data from {st.session_state.dataset.software}")
st.dataframe(st.session_state.dataset.rawinput.head(5))

-    st.markdown(f"*Preview:* Metadata")
+    st.markdown("*Preview:* Metadata")
st.dataframe(st.session_state.dataset.metadata.head(5))

-    st.markdown(f"*Preview:* Matrix")
+    st.markdown("*Preview:* Matrix")

df = pd.DataFrame(
st.session_state.dataset.mat.values,
@@ -370,7 +364,6 @@ def empty_session_state():
st.empty()
st.session_state["software"] = "<select>"

-from streamlit.runtime import get_instance
from streamlit.runtime.scriptrunner.script_run_context import get_script_run_ctx

user_session_id = get_script_run_ctx().session_id
@@ -379,8 +372,6 @@ def empty_session_state():

sidebar_info()

-# import_data()
-

if "dataset" not in st.session_state:
st.markdown("### Import Proteomics Data")
@@ -389,7 +380,6 @@ def empty_session_state():
"Create a DataSet with the output of your proteomics software package and the corresponding metadata (optional). "
)

-# import_data()
import_data()

if "dataset" in st.session_state:
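
Most of the edits in 02_Import Data.py follow one pattern: explicit == False comparisons replaced by not. A minimal sketch of the rule being applied, reusing the MaxQuant column names from the diff above; the empty toy DataFrame is invented for illustration.

import pandas as pd

df = pd.DataFrame(columns=["Protein IDs", "Reverse", "Potential contaminant"])
expected_columns = ["Protein IDs", "Reverse", "Potential contaminant"]

# Before (flagged by ruff as E712, comparison to False):
#     if (set(expected_columns).issubset(set(df.columns.to_list()))) == False: ...
# After - same behavior, expressed idiomatically:
if not set(expected_columns).issubset(set(df.columns.to_list())):
    print("This is not a valid MaxQuant file.")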