diff --git a/2_ProteinFucciPsuedotime.py b/2_ProteinFucciPsuedotime.py index ca225fe..dc3e99c 100644 --- a/2_ProteinFucciPsuedotime.py +++ b/2_ProteinFucciPsuedotime.py @@ -16,300 +16,315 @@ import pandas as pd import numpy as np import scipy +import argparse -# Make PDF text readable -plt.rcParams["pdf.fonttype"] = 42 -plt.rcParams["ps.fonttype"] = 42 -plt.rcParams["savefig.dpi"] = 300 - -#%% Read in the protein data -import_dict = Loaders.load_protein_fucci_pseudotime() -u_plate, well_plate, well_plate_imgnb, well_plate_imgnb_objnb, u_well_plates = ( - import_dict["u_plate"], - import_dict["well_plate"], - import_dict["well_plate_imgnb"], - import_dict["well_plate_imgnb_objnb"], - import_dict["u_well_plates"], -) -ab_nuc, ab_cyto, ab_cell, mt_cell = ( - import_dict["ab_nuc"], - import_dict["ab_cyto"], - import_dict["ab_cell"], - import_dict["mt_cell"], -) -area_cell, area_nuc = import_dict["area_cell"], import_dict["area_nuc"] -wp_ensg, wp_ab = import_dict["wp_ensg"], import_dict["wp_ab"] -green_fucci, red_fucci = import_dict["green_fucci"], import_dict["red_fucci"] -log_green_fucci_zeroc, log_red_fucci_zeroc = ( - import_dict["log_green_fucci_zeroc"], - import_dict["log_red_fucci_zeroc"], -) -log_green_fucci_zeroc_rescale, log_red_fucci_zeroc_rescale = ( - import_dict["log_green_fucci_zeroc_rescale"], - import_dict["log_red_fucci_zeroc_rescale"], -) -wp_comp_kruskal_gaussccd_adj, wp_pass_kruskal_gaussccd_bh_comp = ( - import_dict["wp_comp_kruskal_gaussccd_adj"], - import_dict["wp_pass_kruskal_gaussccd_bh_comp"], -) -fucci_data = import_dict["fucci_data"] -wp_iscell, wp_isnuc, wp_iscyto = ( - import_dict["wp_iscell"], - import_dict["wp_isnuc"], - import_dict["wp_iscyto"], -) -curr_wp_phases, mockbulk_phases = ( - import_dict["curr_wp_phases"], - import_dict["mockbulk_phases"], -) - -#%% -# Idea: Calculate the polar coordinates and other stuff -# Exec: Devin's calculations -# Output: fucci plot with polar coordinates -pseudotime_result = FucciPseudotime.pseudotime_protein( - fucci_data, - ab_nuc, - ab_cyto, - ab_cell, - mt_cell, - area_cell, - area_nuc, - well_plate, - well_plate_imgnb, - well_plate_imgnb_objnb, - log_red_fucci_zeroc_rescale, - log_green_fucci_zeroc_rescale, - mockbulk_phases, -) -pol_sort_well_plate, pol_sort_norm_rev, pol_sort_well_plate_imgnb, pol_sort_well_plate_imgnb_objnb, pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_ab_cell, pol_sort_mt_cell, pol_sort_area_cell, pol_sort_area_nuc, pol_sort_fred, pol_sort_fgreen, pol_sort_mockbulk_phases = ( - pseudotime_result -) - -#%% Calculate measures of variance of protein abundance in single cells -# Idea: Calculate measures of variance, and show them in plots -# Execution: Now that we already have the data filtered for variability, this is just descriptive. -# Output: scatters of antibody vs microtubule variances by different measures of variaibility - -# toggle for using log-transformed intensities -# we decided to use natural intensities -use_log = False -calculate_variaton_result = ProteinVariability.calculate_variation( - use_log, - u_well_plates, - wp_iscell, - wp_isnuc, - wp_iscyto, - pol_sort_well_plate, - pol_sort_ab_cell, - pol_sort_ab_nuc, - pol_sort_ab_cyto, - pol_sort_mt_cell, - pol_sort_well_plate_imgnb, -) -mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell, gini_cell, cv_cell, var_mt, gini_mt, cv_mt = ( - calculate_variaton_result -) - -# Compare variances for protein and microtubules, the internal control for each image -removeThese = pd.read_csv("input/ProteinData/ReplicatesToRemove.txt", header=None)[0] # make these independent samples for one-sided Kruskal-Wallis tests -utils.general_boxplot( - (var_comp[~np.isin(u_well_plates, removeThese)], var_mt[~np.isin(u_well_plates, removeThese)]), - ("Protein", "Microtubules"), - "", - "Variance", - "", - False, - f"figures/ProteinMicrotubuleVariances.pdf", -) -utils.general_boxplot( - (cv_comp[~np.isin(u_well_plates, removeThese)], gini_mt[~np.isin(u_well_plates, removeThese)]), - ("Protein", "Microtubules"), - "", - "CV", - "", - False, - f"figures/ProteinMicrotubuleCVs.pdf", -) -utils.general_boxplot( - (gini_comp[~np.isin(u_well_plates, removeThese)], gini_mt[~np.isin(u_well_plates, removeThese)]), - ("Protein", "Microtubules"), - "", - "Gini", - "", - False, - f"figures/ProteinMicrotubuleGinis.pdf", -) -p_varProt_varMt = 2*scipy.stats.kruskal(var_comp[~np.isin(u_well_plates, removeThese)], var_mt[~np.isin(u_well_plates, removeThese)])[1] -p_cvProt_cvMt = 2*scipy.stats.kruskal(cv_comp[~np.isin(u_well_plates, removeThese)], cv_mt[~np.isin(u_well_plates, removeThese)])[1] -p_giniProt_giniMt = 2*scipy.stats.kruskal(gini_comp[~np.isin(u_well_plates, removeThese)], gini_mt[~np.isin(u_well_plates, removeThese)])[1] -print( - f"{p_varProt_varMt}: p-value for difference between protein and microtubule variances" -) -print( - f"{p_cvProt_cvMt}: p-value for difference between protein and microtubule CVs" -) -print( - f"{p_giniProt_giniMt}: p-value for difference between protein and microtubule Gini indices" -) - -#%% Gaussian clustering to identify biomodal intensity distributions -bimodal_results = ProteinBimodality.identify_bimodal_intensity_distributions( - u_well_plates, - wp_ensg, - pol_sort_well_plate, - pol_sort_norm_rev, - pol_sort_ab_cell, - pol_sort_ab_nuc, - pol_sort_ab_cyto, - pol_sort_mt_cell, - wp_iscell, - wp_isnuc, - wp_iscyto, -) -wp_isbimodal_fcpadj_pass, wp_bimodal_cluster_idxs, wp_isbimodal_generally, wp_bimodal_fcmaxmin = ( - bimodal_results -) - -#%% Determine cell cycle dependence for each protein -use_log_ccd = False -do_remove_outliers = True -alphaa = 0.05 - -# Determine cell cycle dependence for proteins -ccd_results = ProteinCellCycleDependence.cell_cycle_dependence_protein( - u_well_plates, - wp_ensg, - wp_ab, - use_log_ccd, - do_remove_outliers, - pol_sort_well_plate, - pol_sort_norm_rev, - pol_sort_ab_cell, - pol_sort_ab_nuc, - pol_sort_ab_cyto, - pol_sort_mt_cell, - pol_sort_fred, - pol_sort_fgreen, - pol_sort_mockbulk_phases, - pol_sort_area_cell, - pol_sort_area_nuc, - pol_sort_well_plate_imgnb, - wp_iscell, - wp_isnuc, - wp_iscyto, - wp_isbimodal_fcpadj_pass, - wp_bimodal_cluster_idxs, - wp_comp_kruskal_gaussccd_adj, -) -wp_comp_ccd_difffromrng, mean_diff_from_rng_mt, wp_comp_ccd_clust1, wp_comp_ccd_clust2, wp_ccd_unibimodal, wp_comp_ccd_gauss, perc_var_comp, mean_diff_from_rng, wp_comp_eq_percvar_adj, mean_diff_from_rng_clust1, wp_comp_eq_percvar_adj_clust1, mean_diff_from_rng_clust2, wp_comp_eq_percvar_adj_clust2, mvavgs_x, mvavgs_comp, curr_pols, curr_ab_norms, mvperc_comps, curr_freds, curr_fgreens, curr_mockbulk_phases, curr_area_cell, curr_ab_nuc, curr_well_plate_imgnb, folder = ( - ccd_results -) - -# Move the temporal average plots to more informative places -ProteinCellCycleDependence.copy_mvavg_plots_protein( - folder, - wp_ensg, - wp_comp_ccd_difffromrng, - wp_isbimodal_fcpadj_pass, - wp_comp_ccd_clust1, - wp_comp_ccd_clust2, - wp_ccd_unibimodal, - wp_comp_ccd_gauss, -) -ProteinCellCycleDependence.global_plots_protein( - alphaa, - u_well_plates, - wp_ccd_unibimodal, - perc_var_comp, - mean_mean_comp, - gini_comp, - cv_comp, - mean_diff_from_rng, - wp_comp_eq_percvar_adj, - wp_comp_kruskal_gaussccd_adj, -) - -# Analyze the CCD results and save them -ccd_comp, nonccd_comp, bioccd = ProteinCellCycleDependence.analyze_ccd_variation_protein( - folder, - u_well_plates, - wp_ensg, - wp_ab, - wp_iscell, - wp_isnuc, - wp_iscyto, - wp_comp_ccd_difffromrng, - wp_comp_ccd_clust1, - wp_comp_ccd_clust2, - var_comp, - gini_comp, - perc_var_comp, - mean_diff_from_rng, - wp_comp_kruskal_gaussccd_adj, - wp_comp_eq_percvar_adj, - mean_diff_from_rng_clust1, - wp_comp_eq_percvar_adj_clust1, - mean_diff_from_rng_clust2, - wp_comp_eq_percvar_adj_clust2, - wp_isbimodal_fcpadj_pass, - wp_isbimodal_generally, - wp_ccd_unibimodal, - wp_bimodal_fcmaxmin, - wp_comp_ccd_gauss, -) - -# Make a dataframe for plotting on the HPA website -ProteinCellCycleDependence.make_plotting_dataframe( - wp_ensg, - wp_ab, - u_well_plates, - wp_iscell, - wp_iscyto, - wp_isnuc, - ccd_comp, - bioccd, - curr_pols, - curr_ab_norms, - curr_freds, - curr_fgreens, - curr_mockbulk_phases, - mvavgs_x, - mvavgs_comp, - mvperc_comps, - gini_comp, - perc_var_comp, -) - -# Perform comparison to LASSO for finding CCD proteins -ProteinCellCycleDependence.compare_to_lasso_analysis( - u_well_plates, - pol_sort_norm_rev, - pol_sort_well_plate, - pol_sort_ab_cell, - pol_sort_ab_nuc, - pol_sort_ab_cyto, - pol_sort_mt_cell, - pol_sort_fred, - pol_sort_fgreen, - wp_iscell, - wp_isnuc, - wp_iscyto, - wp_ensg, - ccd_comp, -) - -# Generate UMAPs to illustrate cutoffs and stability -ProteinCellCycleDependence.generate_protein_umaps( - u_well_plates, - pol_sort_norm_rev, - pol_sort_well_plate, - pol_sort_ab_cell, - pol_sort_ab_nuc, - pol_sort_ab_cyto, - pol_sort_mt_cell, - wp_iscell, - wp_isnuc, - wp_iscyto, - mean_diff_from_rng, -) +if __name__ == "__main__": + description = "Single cell proteogenomic analysis script -- protein cell cycle dependence" + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + "--quicker", action="store_true", help="Skip some plotting and analyses, namely for HPA releases." + ) + args = parser.parse_args() + doAllPlotsAndAnalyses = not args.quicker + + # Make PDF text readable + plt.rcParams["pdf.fonttype"] = 42 + plt.rcParams["ps.fonttype"] = 42 + plt.rcParams["savefig.dpi"] = 300 + + #%% Read in the protein data + import_dict = Loaders.load_protein_fucci_pseudotime() + u_plate, well_plate, well_plate_imgnb, well_plate_imgnb_objnb, u_well_plates = ( + import_dict["u_plate"], + import_dict["well_plate"], + import_dict["well_plate_imgnb"], + import_dict["well_plate_imgnb_objnb"], + import_dict["u_well_plates"], + ) + ab_nuc, ab_cyto, ab_cell, mt_cell = ( + import_dict["ab_nuc"], + import_dict["ab_cyto"], + import_dict["ab_cell"], + import_dict["mt_cell"], + ) + area_cell, area_nuc = import_dict["area_cell"], import_dict["area_nuc"] + wp_ensg, wp_ab = import_dict["wp_ensg"], import_dict["wp_ab"] + green_fucci, red_fucci = import_dict["green_fucci"], import_dict["red_fucci"] + log_green_fucci_zeroc, log_red_fucci_zeroc = ( + import_dict["log_green_fucci_zeroc"], + import_dict["log_red_fucci_zeroc"], + ) + log_green_fucci_zeroc_rescale, log_red_fucci_zeroc_rescale = ( + import_dict["log_green_fucci_zeroc_rescale"], + import_dict["log_red_fucci_zeroc_rescale"], + ) + wp_comp_kruskal_gaussccd_adj, wp_pass_kruskal_gaussccd_bh_comp = ( + import_dict["wp_comp_kruskal_gaussccd_adj"], + import_dict["wp_pass_kruskal_gaussccd_bh_comp"], + ) + fucci_data = import_dict["fucci_data"] + wp_iscell, wp_isnuc, wp_iscyto = ( + import_dict["wp_iscell"], + import_dict["wp_isnuc"], + import_dict["wp_iscyto"], + ) + curr_wp_phases, mockbulk_phases = ( + import_dict["curr_wp_phases"], + import_dict["mockbulk_phases"], + ) + + #%% + # Idea: Calculate the polar coordinates and other stuff + # Exec: Devin's calculations + # Output: fucci plot with polar coordinates + pseudotime_result = FucciPseudotime.pseudotime_protein( + fucci_data, + ab_nuc, + ab_cyto, + ab_cell, + mt_cell, + area_cell, + area_nuc, + well_plate, + well_plate_imgnb, + well_plate_imgnb_objnb, + log_red_fucci_zeroc_rescale, + log_green_fucci_zeroc_rescale, + mockbulk_phases, + ) + pol_sort_well_plate, pol_sort_norm_rev, pol_sort_well_plate_imgnb, pol_sort_well_plate_imgnb_objnb, pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_ab_cell, pol_sort_mt_cell, pol_sort_area_cell, pol_sort_area_nuc, pol_sort_fred, pol_sort_fgreen, pol_sort_mockbulk_phases = ( + pseudotime_result + ) + + #%% Calculate measures of variance of protein abundance in single cells + # Idea: Calculate measures of variance, and show them in plots + # Execution: Now that we already have the data filtered for variability, this is just descriptive. + # Output: scatters of antibody vs microtubule variances by different measures of variaibility + + # toggle for using log-transformed intensities + # we decided to use natural intensities + use_log = False + calculate_variaton_result = ProteinVariability.calculate_variation( + use_log, + u_well_plates, + wp_iscell, + wp_isnuc, + wp_iscyto, + pol_sort_well_plate, + pol_sort_ab_cell, + pol_sort_ab_nuc, + pol_sort_ab_cyto, + pol_sort_mt_cell, + pol_sort_well_plate_imgnb, + ) + mean_mean_comp, var_comp, gini_comp, cv_comp, var_cell, gini_cell, cv_cell, var_mt, gini_mt, cv_mt = ( + calculate_variaton_result + ) + + # Compare variances for protein and microtubules, the internal control for each image + if doAllPlotsAndAnalyses: + removeThese = pd.read_csv("input/ProteinData/ReplicatesToRemove.txt", header=None)[0] # make these independent samples for one-sided Kruskal-Wallis tests + utils.general_boxplot( + (var_comp[~np.isin(u_well_plates, removeThese)], var_mt[~np.isin(u_well_plates, removeThese)]), + ("Protein", "Microtubules"), + "", + "Variance", + "", + False, + f"figures/ProteinMicrotubuleVariances.pdf", + ) + utils.general_boxplot( + (cv_comp[~np.isin(u_well_plates, removeThese)], gini_mt[~np.isin(u_well_plates, removeThese)]), + ("Protein", "Microtubules"), + "", + "CV", + "", + False, + f"figures/ProteinMicrotubuleCVs.pdf", + ) + utils.general_boxplot( + (gini_comp[~np.isin(u_well_plates, removeThese)], gini_mt[~np.isin(u_well_plates, removeThese)]), + ("Protein", "Microtubules"), + "", + "Gini", + "", + False, + f"figures/ProteinMicrotubuleGinis.pdf", + ) + p_varProt_varMt = 2*scipy.stats.kruskal(var_comp[~np.isin(u_well_plates, removeThese)], var_mt[~np.isin(u_well_plates, removeThese)])[1] + p_cvProt_cvMt = 2*scipy.stats.kruskal(cv_comp[~np.isin(u_well_plates, removeThese)], cv_mt[~np.isin(u_well_plates, removeThese)])[1] + p_giniProt_giniMt = 2*scipy.stats.kruskal(gini_comp[~np.isin(u_well_plates, removeThese)], gini_mt[~np.isin(u_well_plates, removeThese)])[1] + print( + f"{p_varProt_varMt}: p-value for difference between protein and microtubule variances" + ) + print( + f"{p_cvProt_cvMt}: p-value for difference between protein and microtubule CVs" + ) + print( + f"{p_giniProt_giniMt}: p-value for difference between protein and microtubule Gini indices" + ) + + #%% Gaussian clustering to identify biomodal intensity distributions + bimodal_results = ProteinBimodality.identify_bimodal_intensity_distributions( + u_well_plates, + wp_ensg, + pol_sort_well_plate, + pol_sort_norm_rev, + pol_sort_ab_cell, + pol_sort_ab_nuc, + pol_sort_ab_cyto, + pol_sort_mt_cell, + wp_iscell, + wp_isnuc, + wp_iscyto, + doAllPlotsAndAnalyses + ) + wp_isbimodal_fcpadj_pass, wp_bimodal_cluster_idxs, wp_isbimodal_generally, wp_bimodal_fcmaxmin = ( + bimodal_results + ) + + #%% Determine cell cycle dependence for each protein + use_log_ccd = False + do_remove_outliers = True + alphaa = 0.05 + + # Determine cell cycle dependence for proteins + ccd_results = ProteinCellCycleDependence.cell_cycle_dependence_protein( + u_well_plates, + wp_ensg, + wp_ab, + use_log_ccd, + do_remove_outliers, + pol_sort_well_plate, + pol_sort_norm_rev, + pol_sort_ab_cell, + pol_sort_ab_nuc, + pol_sort_ab_cyto, + pol_sort_mt_cell, + pol_sort_fred, + pol_sort_fgreen, + pol_sort_mockbulk_phases, + pol_sort_area_cell, + pol_sort_area_nuc, + pol_sort_well_plate_imgnb, + wp_iscell, + wp_isnuc, + wp_iscyto, + wp_isbimodal_fcpadj_pass, + wp_bimodal_cluster_idxs, + wp_comp_kruskal_gaussccd_adj, + doAllPlotsAndAnalyses + ) + wp_comp_ccd_difffromrng, mean_diff_from_rng_mt, wp_comp_ccd_clust1, wp_comp_ccd_clust2, wp_ccd_unibimodal, wp_comp_ccd_gauss, perc_var_comp, mean_diff_from_rng, wp_comp_eq_percvar_adj, mean_diff_from_rng_clust1, wp_comp_eq_percvar_adj_clust1, mean_diff_from_rng_clust2, wp_comp_eq_percvar_adj_clust2, mvavgs_x, mvavgs_comp, curr_pols, curr_ab_norms, mvperc_comps, curr_freds, curr_fgreens, curr_mockbulk_phases, curr_area_cell, curr_ab_nuc, curr_well_plate_imgnb, folder = ( + ccd_results + ) + + # Move the temporal average plots to more informative places + if doAllPlotsAndAnalyses: + ProteinCellCycleDependence.copy_mvavg_plots_protein( + folder, + wp_ensg, + wp_comp_ccd_difffromrng, + wp_isbimodal_fcpadj_pass, + wp_comp_ccd_clust1, + wp_comp_ccd_clust2, + wp_ccd_unibimodal, + wp_comp_ccd_gauss, + ) + ProteinCellCycleDependence.global_plots_protein( + alphaa, + u_well_plates, + wp_ccd_unibimodal, + perc_var_comp, + mean_mean_comp, + gini_comp, + cv_comp, + mean_diff_from_rng, + wp_comp_eq_percvar_adj, + wp_comp_kruskal_gaussccd_adj, + ) + + # Analyze the CCD results and save them + ccd_comp, nonccd_comp, bioccd = ProteinCellCycleDependence.analyze_ccd_variation_protein( + folder, + u_well_plates, + wp_ensg, + wp_ab, + wp_iscell, + wp_isnuc, + wp_iscyto, + wp_comp_ccd_difffromrng, + wp_comp_ccd_clust1, + wp_comp_ccd_clust2, + var_comp, + gini_comp, + perc_var_comp, + mean_diff_from_rng, + wp_comp_kruskal_gaussccd_adj, + wp_comp_eq_percvar_adj, + mean_diff_from_rng_clust1, + wp_comp_eq_percvar_adj_clust1, + mean_diff_from_rng_clust2, + wp_comp_eq_percvar_adj_clust2, + wp_isbimodal_fcpadj_pass, + wp_isbimodal_generally, + wp_ccd_unibimodal, + wp_bimodal_fcmaxmin, + wp_comp_ccd_gauss, + ) + + # Make a dataframe for plotting on the HPA website + ProteinCellCycleDependence.make_plotting_dataframe( + wp_ensg, + wp_ab, + u_well_plates, + wp_iscell, + wp_iscyto, + wp_isnuc, + ccd_comp, + bioccd, + curr_pols, + curr_ab_norms, + curr_freds, + curr_fgreens, + curr_mockbulk_phases, + mvavgs_x, + mvavgs_comp, + mvperc_comps, + gini_comp, + perc_var_comp, + ) + + # Perform comparison to LASSO for finding CCD proteins + if doAllPlotsAndAnalyses: + ProteinCellCycleDependence.compare_to_lasso_analysis( + u_well_plates, + pol_sort_norm_rev, + pol_sort_well_plate, + pol_sort_ab_cell, + pol_sort_ab_nuc, + pol_sort_ab_cyto, + pol_sort_mt_cell, + pol_sort_fred, + pol_sort_fgreen, + wp_iscell, + wp_isnuc, + wp_iscyto, + wp_ensg, + ccd_comp, + ) + + # Generate UMAPs to illustrate cutoffs and stability + ProteinCellCycleDependence.generate_protein_umaps( + u_well_plates, + pol_sort_norm_rev, + pol_sort_well_plate, + pol_sort_ab_cell, + pol_sort_ab_nuc, + pol_sort_ab_cyto, + pol_sort_mt_cell, + wp_iscell, + wp_isnuc, + wp_iscyto, + mean_diff_from_rng, + ) diff --git a/3_RNAFucciPseudotime.py b/3_RNAFucciPseudotime.py index 9b48e0c..c1e1cfe 100644 --- a/3_RNAFucciPseudotime.py +++ b/3_RNAFucciPseudotime.py @@ -18,188 +18,197 @@ import matplotlib.pyplot as plt import shutil import os - -# Make PDF text readable -plt.rcParams["pdf.fonttype"] = 42 -plt.rcParams["ps.fonttype"] = 42 -plt.rcParams["savefig.dpi"] = 300 - -bioccd = np.genfromtxt( - "input/ProteinData/BiologicallyDefinedCCD.txt", dtype="str" -) # from mitotic structures -wp_ensg = np.load("output/pickles/wp_ensg.npy", allow_pickle=True) -ccd_comp = np.load("output/pickles/ccd_comp.npy", allow_pickle=True) -nonccd_comp = np.load("output/pickles/nonccd_comp.npy", allow_pickle=True) -u_plates = ["355","356","357"] - -#%% Check for new RNA inputs -newinputsfolder = "newinputs/RNAData/" -if os.path.exists(newinputsfolder): - for file in os.listdir(newinputsfolder): - filepath=f"{newinputsfolder}{file}" - targetfile=f"input/RNAData/{os.path.basename(file)}" - targetfilepc=f"{targetfile}.protein_coding.csv" - if os.path.exists(targetfile): os.remove(targetfile) - if os.path.exists(targetfilepc): os.remove(targetfilepc) - shutil.copyfile(filepath, targetfile) - -#%% Convert FACS intensities for FUCCI markers to pseudotime using the same polar coordinate methods as for protein -# Idea: Use the polar coordinate pseudotime calculations to calculate the pseudotime for each cell -# Execution: Adapt Devin's code for the cells sorted for RNA-Seq -# Output: Make log-log fucci intensity plots for the cells analyzed by RNA-Seq; Plot of all fucci pseudotimes; table of pseudotimes for each cell -adata, phases_filt = RNADataPreparation.read_counts_and_phases( - "Counts", False, "protein_coding", u_plates -) -adata = RNADataPreparation.zero_center_fucci(adata) -FucciPseudotime.pseudotime_rna(adata) - -#%% Single cell RNA-Seq data preparation and general analysis -RNADataPreparation.general_plots(u_plates) -RNADataPreparation.analyze_noncycling_cells(u_plates) -adata, phasesfilt = RNADataPreparation.qc_filtering( - adata, do_log_normalize=True, do_remove_blob=False -) -adata = RNADataPreparation.zero_center_fucci(adata) -RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "BeforeRemovingNoncycling") - -#%% Idea: Similar to mock-bulk analysis for proteins, we can evaluate each gene bundled by phase across cells -# Execution: Make boxplots of RNA expression by phase -# Output: boxplots for each gene -valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding" -adata, phases = RNADataPreparation.read_counts_and_phases( - valuetype, use_spikeins, biotype_to_use, u_plates -) -adata, phasesfilt = RNADataPreparation.qc_filtering( - adata, do_log_normalize=True, do_remove_blob=True -) -adata = RNADataPreparation.zero_center_fucci(adata) -RNADataPreparation.plot_markers_vs_reads(adata) -RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "AfterRemovingNoncycling") -g1 = adata.obs["phase"] == "G1" -s = adata.obs["phase"] == "S-ph" -g2 = adata.obs["phase"] == "G2M" -do_make_boxplots = False -if do_make_boxplots: - for iii, ensg in enumerate(adata.var_names): - maxtpm = np.max( - np.concatenate((adata.X[g1, iii], adata.X[s, iii], adata.X[g2, iii])) +import argparse + +if __name__ == "__main__": + description = "Single cell proteogenomic analysis script -- protein cell cycle dependence" + parser = argparse.ArgumentParser(description=description) + parser.add_argument( + "--quicker", action="store_true", help="Skip some plotting and analyses, namely for HPA releases." + ) + args = parser.parse_args() + doAllPlotsAndAnalyses = not args.quicker + + # Make PDF text readable + plt.rcParams["pdf.fonttype"] = 42 + plt.rcParams["ps.fonttype"] = 42 + plt.rcParams["savefig.dpi"] = 300 + + bioccd = np.genfromtxt( + "input/ProteinData/BiologicallyDefinedCCD.txt", dtype="str" + ) # from mitotic structures + wp_ensg = np.load("output/pickles/wp_ensg.npy", allow_pickle=True) + ccd_comp = np.load("output/pickles/ccd_comp.npy", allow_pickle=True) + nonccd_comp = np.load("output/pickles/nonccd_comp.npy", allow_pickle=True) + u_plates = ["355","356","357"] + + #%% Convert FACS intensities for FUCCI markers to pseudotime using the same polar coordinate methods as for protein + # Idea: Use the polar coordinate pseudotime calculations to calculate the pseudotime for each cell + # Execution: Adapt Devin's code for the cells sorted for RNA-Seq + # Output: Make log-log fucci intensity plots for the cells analyzed by RNA-Seq; Plot of all fucci pseudotimes; table of pseudotimes for each cell + adata, phases_filt = RNADataPreparation.read_counts_and_phases( + "Counts", False, "protein_coding", u_plates + ) + adata = RNADataPreparation.zero_center_fucci(adata) + FucciPseudotime.pseudotime_rna(adata) + + #%% Single cell RNA-Seq data preparation and general analysis + if doAllPlotsAndAnalyses: + RNADataPreparation.general_plots(u_plates) + RNADataPreparation.analyze_noncycling_cells(u_plates) + adata, phasesfilt = RNADataPreparation.qc_filtering( + adata, do_log_normalize=True, do_remove_blob=False ) - RNACellCycleDependence.boxplot_result( - adata.X[g1, iii] / maxtpm, - adata.X[s, iii] / maxtpm, - adata.X[g2, iii] / maxtpm, - "figures/RNABoxplotByPhase", - ensg, + adata = RNADataPreparation.zero_center_fucci(adata) + RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "BeforeRemovingNoncycling") + + #%% Idea: Similar to mock-bulk analysis for proteins, we can evaluate each gene bundled by phase across cells + # Execution: Make boxplots of RNA expression by phase + # Output: boxplots for each gene + valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding" + adata, phases = RNADataPreparation.read_counts_and_phases( + valuetype, use_spikeins, biotype_to_use, u_plates + ) + adata, phasesfilt = RNADataPreparation.qc_filtering( + adata, do_log_normalize=True, do_remove_blob=True + ) + adata = RNADataPreparation.zero_center_fucci(adata) + + if doAllPlotsAndAnalyses: + RNADataPreparation.plot_markers_vs_reads(adata) + RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "AfterRemovingNoncycling") + g1 = adata.obs["phase"] == "G1" + s = adata.obs["phase"] == "S-ph" + g2 = adata.obs["phase"] == "G2M" + do_make_boxplots = False + if do_make_boxplots: + for iii, ensg in enumerate(adata.var_names): + maxtpm = np.max( + np.concatenate((adata.X[g1, iii], adata.X[s, iii], adata.X[g2, iii])) + ) + RNACellCycleDependence.boxplot_result( + adata.X[g1, iii] / maxtpm, + adata.X[s, iii] / maxtpm, + adata.X[g2, iii] / maxtpm, + "figures/RNABoxplotByPhase", + ensg, + ) + + #%% Idea: Display general RNA expression patterns in single cells using UMAP dimensionality reduction, and display with FUCCI pseudotime overlayed + if doAllPlotsAndAnalyses: + FucciPseudotime.pseudotime_umap(adata) # Generate a UMAP with the pseudotime overlayed + + # We can also show that the cycle pattern remains when the curated CCD genes or CCD proteins are removed, + # demonstrating that there's still valuable information about cell cycling beyond what was called CCD + RNADataPreparation.demonstrate_umap_cycle_without_ccd(adata) + + # Read in the curated CCD genes / CCD proteins from the present work / Non-CCD genes from the present work; filter for genes that weren't filtered in QC of RNA-Seq + ccd_regev_filtered, ccd_filtered, nonccd_filtered = utils.ccd_gene_lists(adata) + adata_ccdprotein, adata_nonccdprotein, adata_regevccdgenes = RNADataPreparation.is_ccd( + adata, wp_ensg, ccd_comp, nonccd_comp, bioccd, ccd_regev_filtered + ) + + # Generate plots with expression of genes overlayed + expression_data = adata.X + normalized_exp_data = (expression_data.T / np.max(expression_data, axis=0)[:, None]).T + + # Log-log FUCCI plot with RNA expression overlayed + if doAllPlotsAndAnalyses: + RNACellCycleDependence.plot_expression_facs( + adata, + wp_ensg[np.isin(wp_ensg, adata.var_names)], + normalized_exp_data, + "figures/GeneExpressionFucci", + ) + + # Cluster the expression into phases and analyze it that way + bulk_phase_tests = RNACellCycleDependence.analyze_ccd_variation_by_phase_rna( + adata, normalized_exp_data, biotype_to_use + ) + # RNACellCycleDependence.plot_expression_boxplots(adata, wp_ensg[np.isin(wp_ensg, adata.var_names)], bulk_phase_tests, "figures/GeneExpressionBoxplots") + # RNACellCycleDependence.plot_expression_umap(adata, wp_ensg[np.isin(wp_ensg, adata.var_names)], "figures/GeneExpressionUmap") + + #%% Moving average calculations and randomization analysis for RNA + rna_ccd_analysis_results = RNACellCycleDependence.analyze_ccd_variation_by_mvavg_rna( + adata, + wp_ensg, + ccd_comp, + bioccd, + adata_nonccdprotein, + adata_regevccdgenes, + biotype_to_use, + make_mvavg_plots_isoforms=doAllPlotsAndAnalyses, + ) + percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff, eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages, mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs = ( + rna_ccd_analysis_results + ) + + RNACellCycleDependence.make_plotting_dataframe( + adata, + ccdtranscript, + wp_ensg, + bioccd, + norm_exp_sort[np.argsort(fucci_time_inds), :], + mvavg_xvals, + moving_averages, + mvpercs, + ) + + if doAllPlotsAndAnalyses: + RNACellCycleDependence.plot_umap_ccd_cutoffs(adata, mean_diff_from_rng) + RNACellCycleDependence.figures_ccd_analysis_rna( + adata, + percent_ccd_variance, + mean_diff_from_rng, + pass_meandiff, + eq_percvar_adj, + wp_ensg, + ccd_comp, + ccd_regev_filtered, + ) + RNACellCycleDependence.plot_overall_and_ccd_variances( + adata, + biotype_to_use, + total_gini, + percent_ccd_variance, + pass_meandiff, + adata_ccdprotein, + adata_nonccdprotein, + adata_regevccdgenes, + ) + RNACellCycleDependence.compare_to_lasso_analysis(adata, ccdtranscript) + RNACellCycleDependence.analyze_cnv_calls(adata, ccdtranscript) + + #%% Moving average calculations and randomization analysis for the spike-in internal controls + if doAllPlotsAndAnalyses: + adata_spikeins, phases_spikeins = RNADataPreparation.read_counts_and_phases( + valuetype, use_spike_ins=True, biotype_to_use="", u_plates=u_plates + ) + sc.pp.filter_genes(adata_spikeins, min_cells=100) + print(f"data shape after filtering: {adata_spikeins.X.shape}") + + RNACellCycleDependence.ccd_analysis_of_spikeins(adata_spikeins, perms) + + #%% Analyze RNA velocity + valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding" + adata, phases = RNADataPreparation.read_counts_and_phases( + valuetype, use_spikeins, biotype_to_use, u_plates, load_velocities=True + ) + adata_blobless, phasesfilt = RNADataPreparation.qc_filtering( + adata, do_log_normalize=True, do_remove_blob=False + ) + RNAVelocity.analyze_rna_velocity(adata_blobless, mean_diff_from_rng, doAllPlotsAndAnalyses) + + #%% Analyze isoforms + if doAllPlotsAndAnalyses: + adata_isoform, ccdtranscript_isoform = RNACellCycleDependence.analyze_isoforms( + adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates, make_mvavg_plots_isoforms=doAllPlotsAndAnalyses + ) + RNACellCycleDependence.compare_genes_to_isoforms( + adata, + ccdprotein, + ccdtranscript, + adata_nonccdprotein, + adata_isoform, + ccdtranscript_isoform, ) - -#%% Idea: Display general RNA expression patterns in single cells using UMAP dimensionality reduction, and display with FUCCI pseudotime overlayed -FucciPseudotime.pseudotime_umap(adata) # Generate a UMAP with the pseudotime overlayed - -# We can also show that the cycle pattern remains when the curated CCD genes or CCD proteins are removed, -# demonstrating that there's still valuable information about cell cycling beyond what was called CCD -RNADataPreparation.demonstrate_umap_cycle_without_ccd(adata) - -# Read in the curated CCD genes / CCD proteins from the present work / Non-CCD genes from the present work; filter for genes that weren't filtered in QC of RNA-Seq -ccd_regev_filtered, ccd_filtered, nonccd_filtered = utils.ccd_gene_lists(adata) -adata_ccdprotein, adata_nonccdprotein, adata_regevccdgenes = RNADataPreparation.is_ccd( - adata, wp_ensg, ccd_comp, nonccd_comp, bioccd, ccd_regev_filtered -) - -# Generate plots with expression of genes overlayed -expression_data = adata.X -normalized_exp_data = (expression_data.T / np.max(expression_data, axis=0)[:, None]).T - -# Log-log FUCCI plot with RNA expression overlayed -RNACellCycleDependence.plot_expression_facs( - adata, - wp_ensg[np.isin(wp_ensg, adata.var_names)], - normalized_exp_data, - "figures/GeneExpressionFucci", -) - -# Cluster the expression into phases and analyze it that way -bulk_phase_tests = RNACellCycleDependence.analyze_ccd_variation_by_phase_rna( - adata, normalized_exp_data, biotype_to_use -) -# RNACellCycleDependence.plot_expression_boxplots(adata, wp_ensg[np.isin(wp_ensg, adata.var_names)], bulk_phase_tests, "figures/GeneExpressionBoxplots") -# RNACellCycleDependence.plot_expression_umap(adata, wp_ensg[np.isin(wp_ensg, adata.var_names)], "figures/GeneExpressionUmap") - -#%% Moving average calculations and randomization analysis for RNA -rna_ccd_analysis_results = RNACellCycleDependence.analyze_ccd_variation_by_mvavg_rna( - adata, - wp_ensg, - ccd_comp, - bioccd, - adata_nonccdprotein, - adata_regevccdgenes, - biotype_to_use, -) -percent_ccd_variance, total_gini, mean_diff_from_rng, pass_meandiff, eq_percvar_adj, fucci_time_inds, norm_exp_sort, moving_averages, mvavg_xvals, perms, ccdtranscript, ccdprotein, mvpercs = ( - rna_ccd_analysis_results -) - -RNACellCycleDependence.plot_umap_ccd_cutoffs(adata, mean_diff_from_rng) -RNACellCycleDependence.figures_ccd_analysis_rna( - adata, - percent_ccd_variance, - mean_diff_from_rng, - pass_meandiff, - eq_percvar_adj, - wp_ensg, - ccd_comp, - ccd_regev_filtered, -) -RNACellCycleDependence.plot_overall_and_ccd_variances( - adata, - biotype_to_use, - total_gini, - percent_ccd_variance, - pass_meandiff, - adata_ccdprotein, - adata_nonccdprotein, - adata_regevccdgenes, -) -RNACellCycleDependence.make_plotting_dataframe( - adata, - ccdtranscript, - wp_ensg, - bioccd, - norm_exp_sort[np.argsort(fucci_time_inds), :], - mvavg_xvals, - moving_averages, - mvpercs, -) -RNACellCycleDependence.compare_to_lasso_analysis(adata, ccdtranscript) -RNACellCycleDependence.analyze_cnv_calls(adata, ccdtranscript) - -#%% Moving average calculations and randomization analysis for the spike-in internal controls -adata_spikeins, phases_spikeins = RNADataPreparation.read_counts_and_phases( - valuetype, use_spike_ins=True, biotype_to_use="", u_plates=u_plates -) -sc.pp.filter_genes(adata_spikeins, min_cells=100) -print(f"data shape after filtering: {adata_spikeins.X.shape}") - -RNACellCycleDependence.ccd_analysis_of_spikeins(adata_spikeins, perms) - -#%% Analyze RNA velocity -valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding" -adata, phases = RNADataPreparation.read_counts_and_phases( - valuetype, use_spikeins, biotype_to_use, u_plates, load_velocities=True -) -adata_blobless, phasesfilt = RNADataPreparation.qc_filtering( - adata, do_log_normalize=True, do_remove_blob=False -) -RNAVelocity.analyze_rna_velocity(adata_blobless, mean_diff_from_rng) - -#%% Analyze isoforms -adata_isoform, ccdtranscript_isoform = RNACellCycleDependence.analyze_isoforms( - adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates -) -RNACellCycleDependence.compare_genes_to_isoforms( - adata, - ccdprotein, - ccdtranscript, - adata_nonccdprotein, - adata_isoform, - ccdtranscript_isoform, -) diff --git a/SingleCellProteogenomics/ProteinBimodality.py b/SingleCellProteogenomics/ProteinBimodality.py index a36076a..ffe9136 100644 --- a/SingleCellProteogenomics/ProteinBimodality.py +++ b/SingleCellProteogenomics/ProteinBimodality.py @@ -17,7 +17,7 @@ def identify_bimodal_intensity_distributions(u_well_plates, wp_ensg, pol_sort_well_plate, pol_sort_norm_rev, pol_sort_ab_cell, pol_sort_ab_nuc, pol_sort_ab_cyto, pol_sort_mt_cell, - wp_iscell, wp_isnuc, wp_iscyto): + wp_iscell, wp_isnuc, wp_iscyto, do_plotting): ''' Some proteins display bimodal intensity distributions. This method seeks to identify distributions with high- and low-expressing cells, @@ -81,38 +81,39 @@ def identify_bimodal_intensity_distributions(u_well_plates, wp_ensg, print(f"{sum(~wp_isbimodal_generally[~wp_removeReplicate])}: number of proteins displaying unimodal distributions ({sum(~wp_isbimodal_generally)/len(wp_isbimodal_generally)}%)") print(f"{sum(wp_isbimodal_generally[~wp_removeReplicate])}: number of proteins displaying bimodal distributions ({sum(wp_isbimodal_generally)/len(wp_isbimodal_generally)}%)") - # Show that the intensity measurements are reasonable for these bimodal samples - plt.hist(np.concatenate(np.array(wp_intensities, dtype=object)[wp_isbimodal_generally])) - plt.xlabel("Mean intensity") - plt.ylabel("Count") - plt.title("Intensities of Cells within Bimodal Distributions\nAre Similar to those Overall") - # plt.show() - plt.close() - - print("Illustrate the significantly distinct high- and low-expressing cell populations") - plt.scatter(np.log(wp_bimodal_fcmeans) / np.log(2), -np.log10(wp_isbimodal_padj), c=wp_isbimodal_generally, alpha=0.5, cmap="bwr_r") - plt.xlabel("Log2 Fold Change Between Gaussian Clusters") - plt.ylabel("-Log10 Adj. p-Value for Difference Between Clusters") - plt.savefig("figures/BimodalSignificance_GeneralBimodality.png") - # plt.show() - plt.close() - - print("Illustrate the significantly distinct high- and low-expressing cell populations") - print("with no difference in pseudotime. These are evaluated separately for CCD.") - plt.scatter(np.log(wp_bimodal_fcmeans) / np.log(2), -np.log10(wp_isbimodal_padj), c=wp_isbimodal_fcpadj_pass, alpha=0.5, cmap="bwr_r") - plt.xlabel("Log2 Fold Change Between Gaussian Clusters") - plt.ylabel("-Log10 Adj. p-Value for Difference Between Clusters") - plt.savefig("figures/BimodalSignificance.png") - plt.savefig("figures/BimodalSignificance.pdf") - # plt.show() - plt.close() + if do_plotting: + # Show that the intensity measurements are reasonable for these bimodal samples + plt.hist(np.concatenate(np.array(wp_intensities, dtype=object)[wp_isbimodal_generally])) + plt.xlabel("Mean intensity") + plt.ylabel("Count") + plt.title("Intensities of Cells within Bimodal Distributions\nAre Similar to those Overall") + # plt.show() + plt.close() - print("Illustrate the samples with sufficient cell count for CCD evaluation of high- and low-expressing cell populations.") - plt.scatter([sum(c1[0]) for c1 in wp_bimodal_cluster_idxs], [sum(c1[1]) for c1 in wp_bimodal_cluster_idxs], c=wp_enoughcellsinbothclusters, alpha=0.5, cmap="bwr_r") - plt.xlabel("Cell Count, Cluster 1") - plt.ylabel("Cell Count, Cluster 2") - plt.savefig("figures/BimodalCellCount.png") - # plt.show() - plt.close() + print("Illustrate the significantly distinct high- and low-expressing cell populations") + plt.scatter(np.log(wp_bimodal_fcmeans) / np.log(2), -np.log10(wp_isbimodal_padj), c=wp_isbimodal_generally, alpha=0.5, cmap="bwr_r") + plt.xlabel("Log2 Fold Change Between Gaussian Clusters") + plt.ylabel("-Log10 Adj. p-Value for Difference Between Clusters") + plt.savefig("figures/BimodalSignificance_GeneralBimodality.png") + # plt.show() + plt.close() + + print("Illustrate the significantly distinct high- and low-expressing cell populations") + print("with no difference in pseudotime. These are evaluated separately for CCD.") + plt.scatter(np.log(wp_bimodal_fcmeans) / np.log(2), -np.log10(wp_isbimodal_padj), c=wp_isbimodal_fcpadj_pass, alpha=0.5, cmap="bwr_r") + plt.xlabel("Log2 Fold Change Between Gaussian Clusters") + plt.ylabel("-Log10 Adj. p-Value for Difference Between Clusters") + plt.savefig("figures/BimodalSignificance.png") + plt.savefig("figures/BimodalSignificance.pdf") + # plt.show() + plt.close() + + print("Illustrate the samples with sufficient cell count for CCD evaluation of high- and low-expressing cell populations.") + plt.scatter([sum(c1[0]) for c1 in wp_bimodal_cluster_idxs], [sum(c1[1]) for c1 in wp_bimodal_cluster_idxs], c=wp_enoughcellsinbothclusters, alpha=0.5, cmap="bwr_r") + plt.xlabel("Cell Count, Cluster 1") + plt.ylabel("Cell Count, Cluster 2") + plt.savefig("figures/BimodalCellCount.png") + # plt.show() + plt.close() return wp_isbimodal_fcpadj_pass, wp_bimodal_cluster_idxs, wp_isbimodal_generally, wp_bimodal_fcmaxmin \ No newline at end of file diff --git a/SingleCellProteogenomics/ProteinCellCycleDependence.py b/SingleCellProteogenomics/ProteinCellCycleDependence.py index bf6d266..78b1940 100644 --- a/SingleCellProteogenomics/ProteinCellCycleDependence.py +++ b/SingleCellProteogenomics/ProteinCellCycleDependence.py @@ -89,7 +89,8 @@ def cell_cycle_dependence_protein(u_well_plates, wp_ensg, wp_ab, use_log_ccd, do pol_sort_fred, pol_sort_fgreen, pol_sort_mockbulk_phases, pol_sort_area_cell, pol_sort_area_nuc, pol_sort_well_plate_imgnb, wp_iscell, wp_isnuc, wp_iscyto, - wp_isbimodal_fcpadj_pass, wp_bimodal_cluster_idxs, wp_comp_kruskal_gaussccd_adj): + wp_isbimodal_fcpadj_pass, wp_bimodal_cluster_idxs, wp_comp_kruskal_gaussccd_adj, + do_plotting): ''' Use a moving average model of protein expression over the cell cycle to determine cell cycle dependence. Generates plots for each gene @@ -192,13 +193,14 @@ def cell_cycle_dependence_protein(u_well_plates, wp_ensg, wp_ab, use_log_ccd, do windows2 = np.asarray([np.arange(start, start + WINDOW) for start in np.arange(sum(clust2_idx) - WINDOW + 1)]) mvperc1 = MovingAverages.mvpercentiles(curr_comp_norm[clust1_idx][windows1]) mvperc2 = MovingAverages.mvpercentiles(curr_comp_norm[clust2_idx][windows2]) - MovingAverages.temporal_mov_avg_protein(curr_pol[clust1_idx], curr_comp_norm[clust1_idx], - mvavgs_x_clust1[-1], mvavgs_comp_clust1[-1], mvperc1, None, folder, fileprefixes[i] + "_clust1") - MovingAverages.temporal_mov_avg_protein(curr_pol[clust2_idx], curr_comp_norm[clust2_idx], - mvavgs_x_clust2[-1], mvavgs_comp_clust2[-1], mvperc2, None, folder, fileprefixes[i] + "_clust2") + if do_plotting: + MovingAverages.temporal_mov_avg_protein(curr_pol[clust1_idx], curr_comp_norm[clust1_idx], + mvavgs_x_clust1[-1], mvavgs_comp_clust1[-1], mvperc1, None, folder, fileprefixes[i] + "_clust1") + MovingAverages.temporal_mov_avg_protein(curr_pol[clust2_idx], curr_comp_norm[clust2_idx], + mvavgs_x_clust2[-1], mvavgs_comp_clust2[-1], mvperc2, None, folder, fileprefixes[i] + "_clust2") # Make example plots for the randomization trials for the NFAT5 example in manuscript - if wp_ensg[i] == "ENSG00000102908": + if wp_ensg[i] == "ENSG00000102908" and do_plotting: median_rng_idx = np.argsort(curr_percvar_rng_comp) for iii, idx in enumerate([0,len(curr_percvar_rng_comp)//4,len(curr_percvar_rng_comp)//2,3*len(curr_percvar_rng_comp)//4,len(curr_percvar_rng_comp)-1]): MovingAverages.temporal_mov_avg_randomization_example_protein(curr_pol, curr_comp_norm, curr_comp_perm[median_rng_idx[idx]], @@ -229,36 +231,37 @@ def cell_cycle_dependence_protein(u_well_plates, wp_ensg, wp_ab, use_log_ccd, do windows = np.asarray([np.arange(start, start + WINDOW) for start in np.arange(len(curr_pol) - WINDOW + 1)]) mvperc_comp = MovingAverages.mvpercentiles(curr_ab_norm[windows]) mvperc_comps.append(mvperc_comp) - MovingAverages.temporal_mov_avg_protein(curr_pol, curr_ab_norm, - mvavg_xvals, mvavg_comp, mvperc_comp, None, folder, fileprefixes[i]) - - # Make the plots for microtubules (takes 10 mins for all, so just do the arbitrary one in the Fig 1) - if well == "C07_55405991": - mvperc_mt = MovingAverages.mvpercentiles(curr_mt_cell_norm[windows]) - MovingAverages.temporal_mov_avg_protein(curr_pol, curr_mt_cell_norm, - mvavg_xvals, mvavg_mt, mvperc_mt, None, folder_mt, f"{fileprefixes[i]}_mt") - if well == "C07_55405991": # keep here in case making all pseudotime plots - pd.DataFrame({ - "ENSG" : wp_ensg[i], - "Antibody" : wp_ab[i], - "Compartment" : "Cell", - "CCD" : "No", - "cell_pseudotime" : [",".join([str(ppp) for ppp in pp]) for pp in [curr_pol]], - "cell_intensity" : [",".join([str(yyy) for yyy in yy]) for yy in [curr_mt_cell_norm]], - "mvavg_x" : [",".join([str(xxx) for xxx in xx]) for xx in [mvavg_xvals]], - "mvavg_y" : [",".join([str(yyy) for yyy in yy]) for yy in [mvavg_mt]], - "mvavgs_10p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[0]]], - "mvavgs_90p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[-1]]], - "mvavgs_25p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[1]]], - "mvavgs_75p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[-2]]], - "phase" : [",".join(pp) for pp in [curr_mockbulk_phase]], - "WellPlate" : u_well_plates[i]}).to_csv( - "output/mtplottingline.tsv", index=False, sep="\t") - - # Make the plots for each bimodal protein - if clusters is not None: + if do_plotting: MovingAverages.temporal_mov_avg_protein(curr_pol, curr_ab_norm, - mvavg_xvals, mvavg_comp, mvperc_comp, clusters, folder, fileprefixes[i] + "_clust1&2") + mvavg_xvals, mvavg_comp, mvperc_comp, None, folder, fileprefixes[i]) + + # Make the plots for microtubules (takes 10 mins for all, so just do the arbitrary one in the Fig 1) + if well == "C07_55405991": + mvperc_mt = MovingAverages.mvpercentiles(curr_mt_cell_norm[windows]) + MovingAverages.temporal_mov_avg_protein(curr_pol, curr_mt_cell_norm, + mvavg_xvals, mvavg_mt, mvperc_mt, None, folder_mt, f"{fileprefixes[i]}_mt") + if well == "C07_55405991": # keep here in case making all pseudotime plots + pd.DataFrame({ + "ENSG" : wp_ensg[i], + "Antibody" : wp_ab[i], + "Compartment" : "Cell", + "CCD" : "No", + "cell_pseudotime" : [",".join([str(ppp) for ppp in pp]) for pp in [curr_pol]], + "cell_intensity" : [",".join([str(yyy) for yyy in yy]) for yy in [curr_mt_cell_norm]], + "mvavg_x" : [",".join([str(xxx) for xxx in xx]) for xx in [mvavg_xvals]], + "mvavg_y" : [",".join([str(yyy) for yyy in yy]) for yy in [mvavg_mt]], + "mvavgs_10p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[0]]], + "mvavgs_90p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[-1]]], + "mvavgs_25p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[1]]], + "mvavgs_75p" : [",".join([str(yyy) for yyy in yy]) for yy in [mvperc_mt[-2]]], + "phase" : [",".join(pp) for pp in [curr_mockbulk_phase]], + "WellPlate" : u_well_plates[i]}).to_csv( + "output/mtplottingline.tsv", index=False, sep="\t") + + # Make the plots for each bimodal protein + if clusters is not None: + MovingAverages.temporal_mov_avg_protein(curr_pol, curr_ab_norm, + mvavg_xvals, mvavg_comp, mvperc_comp, clusters, folder, fileprefixes[i] + "_clust1&2") alpha_ccd = 0.01 perc_var_cell, perc_var_nuc, perc_var_cyto, perc_var_mt = np.array(perc_var_cell),np.array(perc_var_nuc),np.array(perc_var_cyto),np.array(perc_var_mt) # percent variance attributed to cell cycle (mean POI intensities) @@ -323,28 +326,29 @@ def cell_cycle_dependence_protein(u_well_plates, wp_ensg, wp_ab, use_log_ccd, do wp_ccd_unibimodal = wp_comp_ccd_difffromrng | wp_comp_ccd_clust1 | wp_comp_ccd_clust2 - plt.figure(figsize=(10,10)) - plt.scatter(perc_var_comp_withbimodal, mean_diff_from_rng_withbimodal, c=wp_comp_ccd_difffromrng_withbimodal, cmap="bwr_r") - plt.hlines(MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM, np.min(perc_var_comp), np.max(perc_var_comp), color="gray") - plt.xlabel("Percent Variance Explained by Cell Cycle") - plt.ylabel("Mean Difference from Random") - plt.savefig("figures/MedianDiffFromRandom.png") - plt.savefig("figures/MedianDiffFromRandom.pdf") - # plt.show() - plt.close() - - pervar_adj_withbimodal_nextafter = np.nextafter(wp_comp_eq_percvar_adj_withbimodal, wp_comp_eq_percvar_adj_withbimodal + 1) - plt.figure(figsize=(10,10)) - plt.scatter(mean_diff_from_rng_withbimodal, -np.log10(pervar_adj_withbimodal_nextafter), c=wp_comp_ccd_difffromrng_withbimodal, cmap="bwr_r") - plt.vlines(MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM, - np.min(-np.log10(pervar_adj_withbimodal_nextafter)), - np.max(-np.log10(pervar_adj_withbimodal_nextafter)), color="gray") - plt.xlabel("Mean Difference from Random") - plt.ylabel("-log10 adj p-value from randomization") - plt.savefig("figures/MedianDiffFromRandomVolcano.png") - plt.savefig("figures/MedianDiffFromRandomVolcano.pdf") - # plt.show() - plt.close() + if do_plotting: + plt.figure(figsize=(10,10)) + plt.scatter(perc_var_comp_withbimodal, mean_diff_from_rng_withbimodal, c=wp_comp_ccd_difffromrng_withbimodal, cmap="bwr_r") + plt.hlines(MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM, np.min(perc_var_comp), np.max(perc_var_comp), color="gray") + plt.xlabel("Percent Variance Explained by Cell Cycle") + plt.ylabel("Mean Difference from Random") + plt.savefig("figures/MedianDiffFromRandom.png") + plt.savefig("figures/MedianDiffFromRandom.pdf") + # plt.show() + plt.close() + + pervar_adj_withbimodal_nextafter = np.nextafter(wp_comp_eq_percvar_adj_withbimodal, wp_comp_eq_percvar_adj_withbimodal + 1) + plt.figure(figsize=(10,10)) + plt.scatter(mean_diff_from_rng_withbimodal, -np.log10(pervar_adj_withbimodal_nextafter), c=wp_comp_ccd_difffromrng_withbimodal, cmap="bwr_r") + plt.vlines(MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM, + np.min(-np.log10(pervar_adj_withbimodal_nextafter)), + np.max(-np.log10(pervar_adj_withbimodal_nextafter)), color="gray") + plt.xlabel("Mean Difference from Random") + plt.ylabel("-log10 adj p-value from randomization") + plt.savefig("figures/MedianDiffFromRandomVolcano.png") + plt.savefig("figures/MedianDiffFromRandomVolcano.pdf") + # plt.show() + plt.close() return (wp_comp_ccd_difffromrng, mean_diff_from_rng_mt, wp_comp_ccd_clust1, wp_comp_ccd_clust2, wp_ccd_unibimodal, wp_comp_ccd_gauss, perc_var_comp, mean_diff_from_rng, wp_comp_eq_percvar_adj, diff --git a/SingleCellProteogenomics/RNACellCycleDependence.py b/SingleCellProteogenomics/RNACellCycleDependence.py index 26ab88b..df9e02b 100644 --- a/SingleCellProteogenomics/RNACellCycleDependence.py +++ b/SingleCellProteogenomics/RNACellCycleDependence.py @@ -23,7 +23,7 @@ np.random.seed(0) # Get the same results each time WINDOW = 100 # Number of points for moving average window for protein analysis -PERMUTATIONS = 100#10000 # Number of permutations used for randomization analysis +PERMUTATIONS = 10000 # Number of permutations used for randomization analysis PERMUTATIONS_ISOFORMS = 100 MIN_MEAN_PERCVAR_DIFF_FROM_RANDOM = 0.08 # Cutoff used for percent additional variance explained by the cell cycle than random @@ -292,7 +292,7 @@ def compare_genes_to_isoforms(adata, ccdprotein, ccdtranscript, adata_nonccdprot print(f"{sum(ccdIsoformsPerCcdProtein > 0) / sum(ccdprotein)}% of CCD proteins had at least one CCD transcript isoform") print(f"{sum(ccdIsoformsPerNonCcdProtein > 0) / sum(adata_nonccdprotein)}% of non-CCD proteins had at least one CCD transcript isoform") -def analyze_isoforms(adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates): +def analyze_isoforms(adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates, make_mvavg_plots_isoforms=False): '''Analyze the isoform-level results over the cell cycle''' # Read in the data & QC analysis valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding" @@ -307,7 +307,7 @@ def analyze_isoforms(adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_pla bioccd = np.genfromtxt("input/ProteinData/BiologicallyDefinedCCD.txt", dtype='str') # from mitotic structures in the protein work ccd_regev_filtered_isoform, ccd_filtered_isoform, nonccd_filtered_isoform = utils.ccd_gene_lists(adata_isoform) adata_ccdprotein_isoform, adata_nonccdprotein_isoform, adata_regevccdgenes_isoform = RNADataPreparation.is_ccd(adata_isoform, wp_ensg, ccd_comp, nonccd_comp, bioccd, ccd_regev_filtered_isoform) - rna_ccd_analysis_results = analyze_ccd_variation_by_mvavg_rna(adata_isoform, wp_ensg, ccd_comp, bioccd, adata_nonccdprotein_isoform, adata_regevccdgenes_isoform, biotype_to_use, True) + rna_ccd_analysis_results = analyze_ccd_variation_by_mvavg_rna(adata_isoform, wp_ensg, ccd_comp, bioccd, adata_nonccdprotein_isoform, adata_regevccdgenes_isoform, biotype_to_use, use_isoforms=True, make_mvavg_plots_isoforms=make_mvavg_plots_isoforms) percent_ccd_variance_isoform, total_gini_isoform, mean_diff_from_rng_isoform, pass_meandiff_isoform, eq_percvar_adj_isoform, fucci_time_inds_isoform, norm_exp_sort_isoform, moving_averages_isoform, mvavg_xvals_isoform, perms_isoform, ccdtranscript_isoform, ccdprotein_isoform, mvpercs_isoform = rna_ccd_analysis_results return adata_isoform, ccdtranscript_isoform diff --git a/SingleCellProteogenomics/RNADataPreparation.py b/SingleCellProteogenomics/RNADataPreparation.py index 31f3aea..48741aa 100644 --- a/SingleCellProteogenomics/RNADataPreparation.py +++ b/SingleCellProteogenomics/RNADataPreparation.py @@ -117,8 +117,10 @@ def qc_filtering(adata, do_log_normalize, do_remove_blob): if do_log_normalize: sc.pp.log1p(adata) louvain_file="input/RNAData/louvain_original.npy" louvain = np.load(louvain_file, allow_pickle=True) - adata.obs["original_louvain_wp"] = louvain[:,0] - adata.obs["original_louvain"] = louvain[:,1] + original_louvain_wp = louvain[:,0] + keepThese = np.isin(original_louvain_wp, adata.obs["Well_Plate"]) # can have slightly different number of cells + adata.obs["original_louvain_wp"] = louvain[:,0][keepThese] + adata.obs["original_louvain"] = louvain[:,1][keepThese] if do_remove_blob: removeThese = np.isin(adata.obs["Well_Plate"], adata.obs["original_louvain_wp"][adata.obs["original_louvain"] == "5"]) adata = adata[~removeThese,:] @@ -201,8 +203,8 @@ def analyze_noncycling_cells(u_plates): for md in mindists: sc.pp.neighbors(adata, n_neighbors=nn, n_pcs=40) sc.tl.umap(adata, min_dist=md) - sc.pl.umap(adata, color="louvain", show=False, save=f"louvain_clusters_before_nn{nn}_md{md}.pdf") - sc.tl.rank_genes_groups(adata, groupby="louvain") + sc.pl.umap(adata, color="original_louvain", show=False, save=f"louvain_clusters_before_nn{nn}_md{md}.pdf") + sc.tl.rank_genes_groups(adata, groupby="original_louvain") p_blob=[a[5] for a in adata.uns["rank_genes_groups"]["pvals_adj"]] p_blob_sig = np.array(p_blob) < 0.01 ensg_blob_sig=np.array([a[5] for a in adata.uns["rank_genes_groups"]["names"]])[p_blob_sig] @@ -217,7 +219,7 @@ def analyze_noncycling_cells(u_plates): for md in mindists: sc.pp.neighbors(adata, n_neighbors=nn, n_pcs=40) sc.tl.umap(adata, min_dist=md) - sc.pl.umap(adata, color="louvain", show=False, save=f"louvain_clusters_after_nn{nn}_md{md}.pdf") + sc.pl.umap(adata, color="original_louvain", show=False, save=f"louvain_clusters_after_nn{nn}_md{md}.pdf") def demonstrate_umap_cycle_without_ccd(adata): ''' diff --git a/SingleCellProteogenomics/RNAVelocity.py b/SingleCellProteogenomics/RNAVelocity.py index dbbbd94..dc07ee7 100644 --- a/SingleCellProteogenomics/RNAVelocity.py +++ b/SingleCellProteogenomics/RNAVelocity.py @@ -15,8 +15,8 @@ plt.rcParams['pdf.fonttype'], plt.rcParams['ps.fonttype'], plt.rcParams['savefig.dpi'] = 42, 42, 300 #Make PDF text readable plt.rcParams['figure.figsize'] = (10, 10) -def analyze_rna_velocity(adata_withblob, mean_diff_from_rng): - adata_blobless = adata_withblob[adata_withblob.obs["louvain"] != "5",:] +def analyze_rna_velocity(adata_withblob, mean_diff_from_rng, do_all_plotting): + adata_blobless = adata_withblob[adata_withblob.obs["original_louvain"] != "5",:] # Make the chosen cutoff UMAP cutoff=0.08 @@ -35,6 +35,9 @@ def analyze_rna_velocity(adata_withblob, mean_diff_from_rng): "y_umap" : [x[1] for x in adata_withCutoff.obsm['velocity_umap']]}).to_csv( f"figures/velocityStreamUmap{cutoff}CCD.tsv", index=False, sep="\t") + if not do_all_plotting: + return + # Profile different cutoffs for CCD and non-CCD, UMAP and tSNE to evaluate params and consistency for cutoff in (np.arange(20) + 1) / 100: adata_withCutoff = adata_blobless[:,mean_diff_from_rng <= cutoff]