Merge pull request #12 from CellProfiling/norm

Normalization method update
CellProfiling · May 21, 2021 · 7f73b9a · 7f73b9a
2 parents 0654f0f + 8489c5a
commit 7f73b9a
Show file tree

Hide file tree

Showing 12 changed files with 197 additions and 73 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,13 +1,13 @@
 # IDE and Checkpoint Files
+.spyproject/
 .vscode/
 .ipynb_checkpoints
 __pycache__/
 .pybiomart.sqlite
 ~$*
 */~$*
 cache/
-.DS_Store
-*/.DS_Store
+**/.DS_Store
 
 # Output or Downloaded Folders
 input.zip
@@ -18,3 +18,6 @@ data/
 
 # External Software
 iupred2a/
+
+# Snakemake files
+**/.snakemake
diff --git a/3_RNAFucciPseudotime.py b/3_RNAFucciPseudotime.py
@@ -27,34 +27,38 @@
 wp_ensg = np.load("output/pickles/wp_ensg.npy", allow_pickle=True)
 ccd_comp = np.load("output/pickles/ccd_comp.npy", allow_pickle=True)
 nonccd_comp = np.load("output/pickles/nonccd_comp.npy", allow_pickle=True)
+u_plates = ["355","356","357"]
 
 #%% Convert FACS intensities for FUCCI markers to pseudotime using the same polar coordinate methods as for protein
 # Idea: Use the polar coordinate pseudotime calculations to calculate the pseudotime for each cell
 # Execution: Adapt Devin's code for the cells sorted for RNA-Seq
 # Output: Make log-log fucci intensity plots for the cells analyzed by RNA-Seq; Plot of all fucci pseudotimes; table of pseudotimes for each cell
 adata, phases_filt = RNADataPreparation.read_counts_and_phases(
-    "Counts", False, "protein_coding"
-)  # no qc, yet
-FucciPseudotime.pseudotime_rna(adata, phases_filt)
+    "Counts", False, "protein_coding", u_plates
+)
+adata = RNADataPreparation.zero_center_fucci(adata)
+FucciPseudotime.pseudotime_rna(adata)
 
 #%% Single cell RNA-Seq data preparation and general analysis
-RNADataPreparation.general_plots()
-RNADataPreparation.analyze_noncycling_cells()
+RNADataPreparation.general_plots(u_plates)
+RNADataPreparation.analyze_noncycling_cells(u_plates)
 adata, phasesfilt = RNADataPreparation.qc_filtering(
     adata, do_log_normalize=True, do_remove_blob=False
 )
+adata = RNADataPreparation.zero_center_fucci(adata)
 RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "BeforeRemovingNoncycling")
 
 #%% Idea: Similar to mock-bulk analysis for proteins, we can evaluate each gene bundled by phase across cells
 # Execution: Make boxplots of RNA expression by phase
 # Output: boxplots for each gene
 valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
 adata, phases = RNADataPreparation.read_counts_and_phases(
-    valuetype, use_spikeins, biotype_to_use
+    valuetype, use_spikeins, biotype_to_use, u_plates
 )
 adata, phasesfilt = RNADataPreparation.qc_filtering(
     adata, do_log_normalize=True, do_remove_blob=True
 )
+adata = RNADataPreparation.zero_center_fucci(adata)
 RNADataPreparation.plot_markers_vs_reads(adata)
 RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "AfterRemovingNoncycling")
 g1 = adata.obs["phase"] == "G1"
@@ -93,10 +97,9 @@
 
 # Log-log FUCCI plot with RNA expression overlayed
 RNACellCycleDependence.plot_expression_facs(
+    adata,
     wp_ensg[np.isin(wp_ensg, adata.var_names)],
     normalized_exp_data,
-    phasesfilt,
-    adata.var_names,
     "figures/GeneExpressionFucci",
 )
 
@@ -157,7 +160,7 @@
 
 #%% Moving average calculations and randomization analysis for the spike-in internal controls
 adata_spikeins, phases_spikeins = RNADataPreparation.read_counts_and_phases(
-    valuetype, use_spike_ins=True, biotype_to_use=""
+    valuetype, use_spike_ins=True, biotype_to_use="", u_plates=u_plates
 )
 sc.pp.filter_genes(adata_spikeins, min_cells=100)
 print(f"data shape after filtering: {adata_spikeins.X.shape}")
@@ -166,7 +169,7 @@
 
 #%% Analyze isoforms
 adata_isoform, ccdtranscript_isoform = RNACellCycleDependence.analyze_isoforms(
-    adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp
+    adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates
 )
 RNACellCycleDependence.compare_genes_to_isoforms(
     adata,

diff --git a/4_TemporalDelay.py b/4_TemporalDelay.py
@@ -85,26 +85,29 @@
 
 #%% Create a heatmap of peak RNA expression
 highlight_names, highlight_ensg = [], []
+u_rna_plates = ["355","356","357"]
 
 # Read in RNA-Seq data; use TPMs so that the gene-specific results scales match for cross-gene comparisons
 valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
 adata, phases = RNADataPreparation.read_counts_and_phases(
-    valuetype, use_spikeins, biotype_to_use
+    valuetype, use_spikeins, biotype_to_use, u_rna_plates
 )
 adata, phasesfilt = RNADataPreparation.qc_filtering(
     adata, do_log_normalize=True, do_remove_blob=True
 )
+adata = RNADataPreparation.zero_center_fucci(adata)
 sorted_max_moving_avg_pol_ccd, norm_exp_sort, max_moving_avg_pol, sorted_rna_binned_norm = TemporalDelay.rna_heatmap(
     adata, highlight_names, highlight_ensg, ccdtranscript, xvals
 )
 
 # Analyze isoforms
 adata_isoform, phases_isoform = RNADataPreparation.read_counts_and_phases(
-    valuetype, use_spikeins, biotype_to_use, use_isoforms=True
+    valuetype, use_spikeins, biotype_to_use, u_rna_plates, use_isoforms=True,
 )
 adata_isoform, phasesfilt_isoform = RNADataPreparation.qc_filtering(
     adata_isoform, do_log_normalize=True, do_remove_blob=True
 )
+adata_isoform = RNADataPreparation.zero_center_fucci(adata_isoform)
 sorted_max_moving_avg_pol_ccd_isoform, norm_exp_sort_isoform, max_moving_avg_pol_isoform, sorted_rna_binned_norm_isoform = TemporalDelay.rna_heatmap(
     adata_isoform,
     highlight_names,
@@ -114,7 +117,7 @@
     isIsoformData=True,
 )
 pearsonCorrelations = TemporalDelay.analyze_ccd_isoform_correlations(
-    adata, adata_isoform, ccdtranscript, ccdtranscript_isoform, xvals
+    adata, adata_isoform, ccdtranscript, ccdtranscript_isoform, xvals, u_rna_plates
 )
 
 #%% Compare the variances and time of peak expression between protein and RNA

diff --git a/5_ProteinProperties.py b/5_ProteinProperties.py
@@ -19,17 +19,18 @@
 plt.rcParams["ps.fonttype"] = 42
 plt.rcParams["savefig.dpi"] = 300
 fucci = FucciCellCycle.FucciCellCycle()
+u_rna_plates = ["355","356","357"]
 
 #%% Import the genes names we're analyzing
 # Read in RNA-Seq data again and the CCD gene lists
 valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
 adata, phases = RNADataPreparation.read_counts_and_phases(
-    valuetype, use_spikeins, biotype_to_use
+    valuetype, use_spikeins, biotype_to_use, u_rna_plates
 )
 adata, phasesfilt = RNADataPreparation.qc_filtering(
     adata, do_log_normalize=True, do_remove_blob=True
 )
-
+adata = RNADataPreparation.zero_center_fucci(adata)
 import_dict = Loaders.load_ptm_and_stability(adata)
 wp_ensg, ccd_comp, nonccd_comp, ccdtranscript, wp_max_pol = (
     import_dict["wp_ensg"],

diff --git a/README.md b/README.md
@@ -3,32 +3,31 @@
 This repository contains the code used to perform the [single-cell proteogenomic analysis of the human cell cycle](https://www.nature.com/articles/s41586-021-03232-9). This study was based on immunofluorescence staining of ~200k cells for single-cell analysis of proteomic heterogeneity and ~1k cells for analysis of single-cell RNA variability. These analyses were integrated with other proteomic studies and databases to investigate the functional importance of transcript-regulated and non-transcript regulated variability.
 
 ## Structure of repository
-The code is listed in order of execution, e.g. "1_", "2_" etc. The output of each script is used in the subsequent script.
+The code is listed in order of execution, e.g. "1_", "2_" etc. The output of each script is used in the subsequent script. This workflow can also be run using snakemake (see below).
 
 The logic for these analyses is contained in the `SingleCellProteogenomics` folder.
 
-The input files are contained in the "input" folder. This folder is linked [here](https://drive.google.com/file/d/1mdQbYcDPqiTOHeiYbv_4RtrxrmlhYMNl/view?usp=sharing) as a zip file, `input.zip`. Expand this folder within the base directory of this repository. If you are looking for the raw imaging proteomic dataset produced after filtering artifacts and such, that is located [here](https://drive.google.com/file/d/11vjsZV-nmzPpFmA7ShbfHzmbrk057b1V/view?usp=sharing).
+The input files are contained in the "input" folder. This folder is linked [here](https://drive.google.com/file/d/1G4i115FCH8XNyiEHCkBXMSO_9pwGflTq/view?usp=sharing) as a zip file, `input.zip`. Expand this folder within the base directory of this repository. If you are looking for the raw imaging proteomic dataset produced after filtering artifacts and such, that is located [here](https://drive.google.com/file/d/11vjsZV-nmzPpFmA7ShbfHzmbrk057b1V/view?usp=sharing).
 
 The output files are added to a folder "output" during the analysis, and figures are added to a folder "figures."
 
 An R-script used to analyze skewness and kurtosis (noted in the Methods of the manuscript) is contained in the other_scripts folder. The `other_scripts/ProteinDisorder.py` script utilizes [IUPRED2A](https://iupred2a.elte.hu/) and a [human UniProt](https://www.uniprot.org/proteomes/UP000005640) database.
 
-## Prerequisites
+## Running the workflow using snakemake
 
-Prerequisites are listed in `enviro.yaml` file. They can be installed using `conda`:
+This workflow can be run using `snakemake`:
 
-1. Install Miniconda from https://docs.conda.io/en/latest/miniconda.html
+1. Install Miniconda from https://docs.conda.io/en/latest/miniconda.html.
 
-2. From this directory run `conda env create -n fucci -f enviro.yaml; conda activate fucci` to install and activate these prerequisites.
+2. Install snakemake using `conda install -c conda-forge snakemake-minimal`.
+
+3. Within this directory, run `snakemake -j 1 --use-conda --snakefile workflow/Snakefile`.
 
 ## Single-cell RNA-Seq analysis
 
-The single-cell RNA-Seq data is available at GEO SRA under project number [GSE146773](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE146773). 
+The single-cell RNA-Seq data is available at GEO SRA under project number [GSE146773](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE146773).
 
-The cell cycle phase and FACS intensity data for these ~1,000 cells are contained in the [input folder](https://drive.google.com/file/d/1mdQbYcDPqiTOHeiYbv_4RtrxrmlhYMNl/view?usp=sharing) within the file `ProteinData/WellPlatePhasesLogNormIntensities.csv`:
-* The column "Well_Plate" (e.g., A10_355) corresponds to the sample title within GEO SRA (e.g., "Single U2OS cell A10_355"). 
-* The column "Stage" corresponds to the phase assigned by FACS gating.
-* The "Green530" and "Red585" columns correspond to the log-intensities for the red (CDT1) and green (GMNN) FUCCI markers for the individual cells.
+The cell cycle phase and FACS intensity information for these ~1,000 cells are contained in the [input folder](https://drive.google.com/file/d/1G4i115FCH8XNyiEHCkBXMSO_9pwGflTq/view?usp=sharing) within three files, one per plate, starting with `RNAData/180911_Fucci_single cell seq_ss2-18-*.csv`.
 
 The `snakemake` workflow used to analyze the scRNA-Seq dataset, including RNA velocity calculations and louvain unsupervised clustering, can be found in this repository: https://github.com/CellProfiling/FucciSingleCellSeqPipeline.
 

diff --git a/SingleCellProteogenomics/FucciPseudotime.py b/SingleCellProteogenomics/FucciPseudotime.py
@@ -52,6 +52,7 @@ def plot_annotate_time(R_2, start_phi, fraction):
     pt = pol2cart(R_2,start_phi + (1 - fraction) * 2 * np.pi)
     plt.scatter(pt[0],pt[1],c='c',linewidths=4)
     plt.annotate(f"  {round(fraction * fucci.TOT_LEN, 2)} hrs", (pt[0], pt[1]))
+
 def drange(x, y, jump):
     '''Increment `x` by a decimal `jump` until greater than `y`'''
     while x < y:
@@ -181,6 +182,7 @@ def fucci_polar_coords(x, y, analysis_title):
     plt.tight_layout()
     plt.savefig(f"figures/FucciAllPseudotimeHist_{analysis_title}.png")
     # plt.show()
+    plt.close()
 
     # visualize that result
     start_pt = pol2cart(R_2,start_phi)
@@ -189,15 +191,15 @@ def fucci_polar_coords(x, y, analysis_title):
     cart_data_ur = pol2cart(np.repeat(R_2,len(centered_data)), pol_data[1])
     fucci_hist2d(centered_data, cart_data_ur, start_pt, g1_end_pt, g1s_end_pt, analysis_title, R_2, start_phi)
 
-    return (pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_inds, pol_sort_inds_reorder, 
+    return (pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_phi, pol_sort_inds, pol_sort_inds_reorder, pol_sort_phi_reorder,
         more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi)
 
 def pseudotime_protein(fucci_data, ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc, well_plate, well_plate_imgnb, well_plate_imgnb_objnb,
                         log_red_fucci_zeroc_rescale, log_green_fucci_zeroc_rescale, mockbulk_phases):
     '''Generate a polar coordinate model of cell cycle progression based on the FUCCI intensities'''
     # Generate model
     polar_coord_results = fucci_polar_coords(fucci_data[:,0], fucci_data[:,1], "Protein")
-    pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_inds, pol_sort_inds_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results
+    pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_phi, pol_sort_inds, pol_sort_inds_reorder, pol_sort_phi_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results
 
     # Sort results by pseudotime
     sort_results = pol_sort(pol_sort_inds, more_than_start, less_than_start, well_plate, well_plate_imgnb, well_plate_imgnb_objnb, ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc, log_red_fucci_zeroc_rescale, log_green_fucci_zeroc_rescale, mockbulk_phases)
@@ -239,29 +241,27 @@ def defined_phases_scatter(phases_filtered, outfile):
     plt.savefig(outfile)
     plt.close()
 
-def pseudotime_rna(adata, phases_filt):
+def pseudotime_rna(adata):
     '''
     Model the pseudotime of FUCCI markers measured by FACS for cell analyzed with single-cell RNA sequencing
     Input: RNA-Seq data; cell cycle phase for each cell
     Output: Cell cycle pseudotime for each cell; plots illustrating FUCCI intensities over pseudotime
     '''
-    phases_validIntPhase = phases_filt[pd.notnull(phases_filt.Green530) & pd.notnull(phases_filt.Red585) & pd.notnull(phases_filt.Stage)]
-    utils.general_scatter_color(phases_validIntPhase["Green530"], phases_validIntPhase["Red585"], "log(Green530)", "log(Red585)", 
-        phases_validIntPhase["Stage"].apply(lambda x: get_phase_colormap()[x]), "Cell Cycle Phase", False, "", "figures/FucciPlotByPhase_RNA.png", 
+    adataValidPhase = adata.obs["phase"] != "N/A"
+    utils.general_scatter_color(adata.obs["Green530"][adataValidPhase], adata.obs["Red585"][adataValidPhase], "log(Green530)", "log(Red585)", 
+        adata.obs["phase"][adataValidPhase].apply(lambda x: get_phase_colormap()[x]), "Cell Cycle Phase", False, "", "figures/FucciPlotByPhase_RNA.png", 
         get_phase_colormap())
 
-    phases_validInt = phases_filt[pd.notnull(phases_filt.Green530) & pd.notnull(phases_filt.Red585)] # stage may be null
-    polar_coord_results = fucci_polar_coords(phases_validInt["Green530"], phases_validInt["Red585"], "RNA")
-    pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_inds, pol_sort_inds_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results
+    polar_coord_results = fucci_polar_coords(adata.obs["Green530"], adata.obs["Red585"], "RNA")
+    pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_phi, pol_sort_inds, pol_sort_inds_reorder, pol_sort_phi_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results
 
     # Assign cells a pseudotime and visualize in fucci plot
     pol_unsort = np.argsort(pol_sort_inds_reorder)
     fucci_time = pol_sort_norm_rev[pol_unsort]
     adata.obs["fucci_time"] = fucci_time
-    phases_validInt["fucci_time"] = fucci_time
 
     plt.figure(figsize=(6,5))
-    plt.scatter(phases_validInt["Green530"], phases_validInt["Red585"], c = phases_validInt["fucci_time"], cmap="RdYlGn")
+    plt.scatter(adata.obs["Green530"], adata.obs["Red585"], c = adata.obs["fucci_time"], cmap="RdYlGn")
     cbar = plt.colorbar()
     cbar.set_label('Pseudotime',size=20)
     cbar.ax.tick_params(labelsize=18)
@@ -275,9 +275,21 @@ def pseudotime_rna(adata, phases_filt):
     # Save fucci times, so they can be used in other workbooks
     fucci_time_inds = np.argsort(adata.obs["fucci_time"])
     fucci_time_sort = np.take(np.array(adata.obs["fucci_time"]), fucci_time_inds)
-    cell_time_sort = pd.DataFrame({"fucci_time" : fucci_time_sort, "cell" : np.take(adata.obs_names, fucci_time_inds)})
-    cell_time_sort.to_csv("output/CellPseudotimes.csv")
-    pd.DataFrame({"fucci_time": fucci_time}).to_csv("output/fucci_time.csv")
+    cell_time_sort = pd.DataFrame({
+        "fucci_time" : fucci_time_sort, 
+        "cell" : np.take(adata.obs["Well_Plate"], fucci_time_inds)})
+    cell_time_sort.to_csv("output/CellPseudotimes.csv", index=False)
+    cell_polar_coords = pd.DataFrame({
+        "cell" : np.take(adata.obs["Well_Plate"], fucci_time_inds),
+        "phase_by_facs_gating" : adata.obs["phase"][fucci_time_inds],
+        "raw_green530" : adata.obs["MeanGreen530"][fucci_time_inds],
+        "raw_red585" : adata.obs["MeanRed585"][fucci_time_inds],
+        "green530_lognorm_rescale" : adata.obs["Green530"][fucci_time_inds],
+        "red585_lognorm_rescale" : adata.obs["Red585"][fucci_time_inds],
+        "polar_coord_phi" : pol_sort_phi[pol_unsort][fucci_time_inds],
+        "fucci_time_hrs" : fucci_time_sort * fucci.TOT_LEN})
+    cell_polar_coords.to_csv("output/fucci_coords.csv", index=False)
+    pd.DataFrame({"fucci_time": fucci_time}).to_csv("output/fucci_time.csv", index=False)
 
 def pseudotime_umap(adata, isIsoform=False):
     '''

diff --git a/SingleCellProteogenomics/RNACellCycleDependence.py b/SingleCellProteogenomics/RNACellCycleDependence.py
@@ -53,16 +53,16 @@ def plot_expression_umap(adata, genelist, outfolder):
         plt.close()
         adata.obs.drop(gene, 1)
 
-def plot_expression_facs(genelist, normalized_exp_data, phases, var_names, outfolder):
+def plot_expression_facs(adata, genelist, normalized_exp_data, outfolder):
     '''
     Displaying relative expression on the FUCCI plot (log-log intensity of FUCCI markers)
     Output: a folder of FUCCI plots for each gene in a list
     '''
-    phases_validInt = phases[pd.notnull(phases.Green530) & pd.notnull(phases.Red585)]
     if not os.path.exists(outfolder): os.mkdir(outfolder)
+    var_name_list= list(adata.var_names)
     for gene in genelist:
-        nexp = normalized_exp_data[:,list(var_names).index(gene)]
-        plt.scatter(phases_validInt["Green530"], phases_validInt["Red585"], c=nexp)
+        nexp = normalized_exp_data[:,var_name_list.index(gene)]
+        plt.scatter(adata.obs["Green530"], adata.obs["Red585"], c=nexp)
         plt.tight_layout()
         cbar = plt.colorbar()
         cbar.ax.get_yaxis().labelpad = 15
@@ -286,13 +286,14 @@ def compare_genes_to_isoforms(adata, ccdprotein, ccdtranscript, adata_nonccdprot
     print(f"{sum(ccdIsoformsPerCcdProtein > 0) / sum(ccdprotein)}% of CCD proteins had at least one CCD transcript isoform")
     print(f"{sum(ccdIsoformsPerNonCcdProtein > 0) / sum(adata_nonccdprotein)}% of non-CCD proteins had at least one CCD transcript isoform")
 
-def analyze_isoforms(adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp):
+def analyze_isoforms(adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates):
     '''Analyze the isoform-level results over the cell cycle'''
     # Read in the data & QC analysis
     valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
-    adata_isoform, phases_isoform = RNADataPreparation.read_counts_and_phases(valuetype, use_spikeins, biotype_to_use, use_isoforms=True)
+    adata_isoform, phases_isoform = RNADataPreparation.read_counts_and_phases(valuetype, use_spikeins, biotype_to_use, u_plates, use_isoforms=True)
     # RNADataPreparation.plot_pca_for_batch_effect_analysis(adata_isoform, "BeforeRemovingNoncycling_Isoform")
     adata_isoform, phasesfilt_isoform = RNADataPreparation.qc_filtering(adata_isoform, do_log_normalize=True, do_remove_blob=True)
+    adata_isoform = RNADataPreparation.zero_center_fucci(adata_isoform)
     # RNADataPreparation.plot_pca_for_batch_effect_analysis(adata_isoform, "AfterRemovingNoncycling_Isoform")
     # FucciPseudotime.pseudotime_umap(adata_isoform, isIsoform=True)