Skip to content

Commit

Permalink
Merge pull request #12 from CellProfiling/norm
Browse files Browse the repository at this point in the history
Normalization method update
  • Loading branch information
Anthony authored May 21, 2021
2 parents 0654f0f + 8489c5a commit 7f73b9a
Show file tree
Hide file tree
Showing 12 changed files with 197 additions and 73 deletions.
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# IDE and Checkpoint Files
.spyproject/
.vscode/
.ipynb_checkpoints
__pycache__/
.pybiomart.sqlite
~$*
*/~$*
cache/
.DS_Store
*/.DS_Store
**/.DS_Store

# Output or Downloaded Folders
input.zip
Expand All @@ -18,3 +18,6 @@ data/

# External Software
iupred2a/

# Snakemake files
**/.snakemake
23 changes: 13 additions & 10 deletions 3_RNAFucciPseudotime.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,34 +27,38 @@
wp_ensg = np.load("output/pickles/wp_ensg.npy", allow_pickle=True)
ccd_comp = np.load("output/pickles/ccd_comp.npy", allow_pickle=True)
nonccd_comp = np.load("output/pickles/nonccd_comp.npy", allow_pickle=True)
u_plates = ["355","356","357"]

#%% Convert FACS intensities for FUCCI markers to pseudotime using the same polar coordinate methods as for protein
# Idea: Use the polar coordinate pseudotime calculations to calculate the pseudotime for each cell
# Execution: Adapt Devin's code for the cells sorted for RNA-Seq
# Output: Make log-log fucci intensity plots for the cells analyzed by RNA-Seq; Plot of all fucci pseudotimes; table of pseudotimes for each cell
adata, phases_filt = RNADataPreparation.read_counts_and_phases(
"Counts", False, "protein_coding"
) # no qc, yet
FucciPseudotime.pseudotime_rna(adata, phases_filt)
"Counts", False, "protein_coding", u_plates
)
adata = RNADataPreparation.zero_center_fucci(adata)
FucciPseudotime.pseudotime_rna(adata)

#%% Single cell RNA-Seq data preparation and general analysis
RNADataPreparation.general_plots()
RNADataPreparation.analyze_noncycling_cells()
RNADataPreparation.general_plots(u_plates)
RNADataPreparation.analyze_noncycling_cells(u_plates)
adata, phasesfilt = RNADataPreparation.qc_filtering(
adata, do_log_normalize=True, do_remove_blob=False
)
adata = RNADataPreparation.zero_center_fucci(adata)
RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "BeforeRemovingNoncycling")

#%% Idea: Similar to mock-bulk analysis for proteins, we can evaluate each gene bundled by phase across cells
# Execution: Make boxplots of RNA expression by phase
# Output: boxplots for each gene
valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
adata, phases = RNADataPreparation.read_counts_and_phases(
valuetype, use_spikeins, biotype_to_use
valuetype, use_spikeins, biotype_to_use, u_plates
)
adata, phasesfilt = RNADataPreparation.qc_filtering(
adata, do_log_normalize=True, do_remove_blob=True
)
adata = RNADataPreparation.zero_center_fucci(adata)
RNADataPreparation.plot_markers_vs_reads(adata)
RNADataPreparation.plot_pca_for_batch_effect_analysis(adata, "AfterRemovingNoncycling")
g1 = adata.obs["phase"] == "G1"
Expand Down Expand Up @@ -93,10 +97,9 @@

# Log-log FUCCI plot with RNA expression overlayed
RNACellCycleDependence.plot_expression_facs(
adata,
wp_ensg[np.isin(wp_ensg, adata.var_names)],
normalized_exp_data,
phasesfilt,
adata.var_names,
"figures/GeneExpressionFucci",
)

Expand Down Expand Up @@ -157,7 +160,7 @@

#%% Moving average calculations and randomization analysis for the spike-in internal controls
adata_spikeins, phases_spikeins = RNADataPreparation.read_counts_and_phases(
valuetype, use_spike_ins=True, biotype_to_use=""
valuetype, use_spike_ins=True, biotype_to_use="", u_plates=u_plates
)
sc.pp.filter_genes(adata_spikeins, min_cells=100)
print(f"data shape after filtering: {adata_spikeins.X.shape}")
Expand All @@ -166,7 +169,7 @@

#%% Analyze isoforms
adata_isoform, ccdtranscript_isoform = RNACellCycleDependence.analyze_isoforms(
adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp
adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates
)
RNACellCycleDependence.compare_genes_to_isoforms(
adata,
Expand Down
9 changes: 6 additions & 3 deletions 4_TemporalDelay.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,26 +85,29 @@

#%% Create a heatmap of peak RNA expression
highlight_names, highlight_ensg = [], []
u_rna_plates = ["355","356","357"]

# Read in RNA-Seq data; use TPMs so that the gene-specific results scales match for cross-gene comparisons
valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
adata, phases = RNADataPreparation.read_counts_and_phases(
valuetype, use_spikeins, biotype_to_use
valuetype, use_spikeins, biotype_to_use, u_rna_plates
)
adata, phasesfilt = RNADataPreparation.qc_filtering(
adata, do_log_normalize=True, do_remove_blob=True
)
adata = RNADataPreparation.zero_center_fucci(adata)
sorted_max_moving_avg_pol_ccd, norm_exp_sort, max_moving_avg_pol, sorted_rna_binned_norm = TemporalDelay.rna_heatmap(
adata, highlight_names, highlight_ensg, ccdtranscript, xvals
)

# Analyze isoforms
adata_isoform, phases_isoform = RNADataPreparation.read_counts_and_phases(
valuetype, use_spikeins, biotype_to_use, use_isoforms=True
valuetype, use_spikeins, biotype_to_use, u_rna_plates, use_isoforms=True,
)
adata_isoform, phasesfilt_isoform = RNADataPreparation.qc_filtering(
adata_isoform, do_log_normalize=True, do_remove_blob=True
)
adata_isoform = RNADataPreparation.zero_center_fucci(adata_isoform)
sorted_max_moving_avg_pol_ccd_isoform, norm_exp_sort_isoform, max_moving_avg_pol_isoform, sorted_rna_binned_norm_isoform = TemporalDelay.rna_heatmap(
adata_isoform,
highlight_names,
Expand All @@ -114,7 +117,7 @@
isIsoformData=True,
)
pearsonCorrelations = TemporalDelay.analyze_ccd_isoform_correlations(
adata, adata_isoform, ccdtranscript, ccdtranscript_isoform, xvals
adata, adata_isoform, ccdtranscript, ccdtranscript_isoform, xvals, u_rna_plates
)

#%% Compare the variances and time of peak expression between protein and RNA
Expand Down
5 changes: 3 additions & 2 deletions 5_ProteinProperties.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,18 @@
plt.rcParams["ps.fonttype"] = 42
plt.rcParams["savefig.dpi"] = 300
fucci = FucciCellCycle.FucciCellCycle()
u_rna_plates = ["355","356","357"]

#%% Import the genes names we're analyzing
# Read in RNA-Seq data again and the CCD gene lists
valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
adata, phases = RNADataPreparation.read_counts_and_phases(
valuetype, use_spikeins, biotype_to_use
valuetype, use_spikeins, biotype_to_use, u_rna_plates
)
adata, phasesfilt = RNADataPreparation.qc_filtering(
adata, do_log_normalize=True, do_remove_blob=True
)

adata = RNADataPreparation.zero_center_fucci(adata)
import_dict = Loaders.load_ptm_and_stability(adata)
wp_ensg, ccd_comp, nonccd_comp, ccdtranscript, wp_max_pol = (
import_dict["wp_ensg"],
Expand Down
21 changes: 10 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,32 +3,31 @@
This repository contains the code used to perform the [single-cell proteogenomic analysis of the human cell cycle](https://www.nature.com/articles/s41586-021-03232-9). This study was based on immunofluorescence staining of ~200k cells for single-cell analysis of proteomic heterogeneity and ~1k cells for analysis of single-cell RNA variability. These analyses were integrated with other proteomic studies and databases to investigate the functional importance of transcript-regulated and non-transcript regulated variability.

## Structure of repository
The code is listed in order of execution, e.g. "1_", "2_" etc. The output of each script is used in the subsequent script.
The code is listed in order of execution, e.g. "1_", "2_" etc. The output of each script is used in the subsequent script. This workflow can also be run using snakemake (see below).

The logic for these analyses is contained in the `SingleCellProteogenomics` folder.

The input files are contained in the "input" folder. This folder is linked [here](https://drive.google.com/file/d/1mdQbYcDPqiTOHeiYbv_4RtrxrmlhYMNl/view?usp=sharing) as a zip file, `input.zip`. Expand this folder within the base directory of this repository. If you are looking for the raw imaging proteomic dataset produced after filtering artifacts and such, that is located [here](https://drive.google.com/file/d/11vjsZV-nmzPpFmA7ShbfHzmbrk057b1V/view?usp=sharing).
The input files are contained in the "input" folder. This folder is linked [here](https://drive.google.com/file/d/1G4i115FCH8XNyiEHCkBXMSO_9pwGflTq/view?usp=sharing) as a zip file, `input.zip`. Expand this folder within the base directory of this repository. If you are looking for the raw imaging proteomic dataset produced after filtering artifacts and such, that is located [here](https://drive.google.com/file/d/11vjsZV-nmzPpFmA7ShbfHzmbrk057b1V/view?usp=sharing).

The output files are added to a folder "output" during the analysis, and figures are added to a folder "figures."

An R-script used to analyze skewness and kurtosis (noted in the Methods of the manuscript) is contained in the other_scripts folder. The `other_scripts/ProteinDisorder.py` script utilizes [IUPRED2A](https://iupred2a.elte.hu/) and a [human UniProt](https://www.uniprot.org/proteomes/UP000005640) database.

## Prerequisites
## Running the workflow using snakemake

Prerequisites are listed in `enviro.yaml` file. They can be installed using `conda`:
This workflow can be run using `snakemake`:

1. Install Miniconda from https://docs.conda.io/en/latest/miniconda.html
1. Install Miniconda from https://docs.conda.io/en/latest/miniconda.html.

2. From this directory run `conda env create -n fucci -f enviro.yaml; conda activate fucci` to install and activate these prerequisites.
2. Install snakemake using `conda install -c conda-forge snakemake-minimal`.

3. Within this directory, run `snakemake -j 1 --use-conda --snakefile workflow/Snakefile`.

## Single-cell RNA-Seq analysis

The single-cell RNA-Seq data is available at GEO SRA under project number [GSE146773](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE146773).
The single-cell RNA-Seq data is available at GEO SRA under project number [GSE146773](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE146773).

The cell cycle phase and FACS intensity data for these ~1,000 cells are contained in the [input folder](https://drive.google.com/file/d/1mdQbYcDPqiTOHeiYbv_4RtrxrmlhYMNl/view?usp=sharing) within the file `ProteinData/WellPlatePhasesLogNormIntensities.csv`:
* The column "Well_Plate" (e.g., A10_355) corresponds to the sample title within GEO SRA (e.g., "Single U2OS cell A10_355").
* The column "Stage" corresponds to the phase assigned by FACS gating.
* The "Green530" and "Red585" columns correspond to the log-intensities for the red (CDT1) and green (GMNN) FUCCI markers for the individual cells.
The cell cycle phase and FACS intensity information for these ~1,000 cells are contained in the [input folder](https://drive.google.com/file/d/1G4i115FCH8XNyiEHCkBXMSO_9pwGflTq/view?usp=sharing) within three files, one per plate, starting with `RNAData/180911_Fucci_single cell seq_ss2-18-*.csv`.

The `snakemake` workflow used to analyze the scRNA-Seq dataset, including RNA velocity calculations and louvain unsupervised clustering, can be found in this repository: https://github.com/CellProfiling/FucciSingleCellSeqPipeline.

Expand Down
40 changes: 26 additions & 14 deletions SingleCellProteogenomics/FucciPseudotime.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ def plot_annotate_time(R_2, start_phi, fraction):
pt = pol2cart(R_2,start_phi + (1 - fraction) * 2 * np.pi)
plt.scatter(pt[0],pt[1],c='c',linewidths=4)
plt.annotate(f" {round(fraction * fucci.TOT_LEN, 2)} hrs", (pt[0], pt[1]))

def drange(x, y, jump):
'''Increment `x` by a decimal `jump` until greater than `y`'''
while x < y:
Expand Down Expand Up @@ -181,6 +182,7 @@ def fucci_polar_coords(x, y, analysis_title):
plt.tight_layout()
plt.savefig(f"figures/FucciAllPseudotimeHist_{analysis_title}.png")
# plt.show()
plt.close()

# visualize that result
start_pt = pol2cart(R_2,start_phi)
Expand All @@ -189,15 +191,15 @@ def fucci_polar_coords(x, y, analysis_title):
cart_data_ur = pol2cart(np.repeat(R_2,len(centered_data)), pol_data[1])
fucci_hist2d(centered_data, cart_data_ur, start_pt, g1_end_pt, g1s_end_pt, analysis_title, R_2, start_phi)

return (pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_inds, pol_sort_inds_reorder,
return (pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_phi, pol_sort_inds, pol_sort_inds_reorder, pol_sort_phi_reorder,
more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi)

def pseudotime_protein(fucci_data, ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc, well_plate, well_plate_imgnb, well_plate_imgnb_objnb,
log_red_fucci_zeroc_rescale, log_green_fucci_zeroc_rescale, mockbulk_phases):
'''Generate a polar coordinate model of cell cycle progression based on the FUCCI intensities'''
# Generate model
polar_coord_results = fucci_polar_coords(fucci_data[:,0], fucci_data[:,1], "Protein")
pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_inds, pol_sort_inds_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results
pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_phi, pol_sort_inds, pol_sort_inds_reorder, pol_sort_phi_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results

# Sort results by pseudotime
sort_results = pol_sort(pol_sort_inds, more_than_start, less_than_start, well_plate, well_plate_imgnb, well_plate_imgnb_objnb, ab_nuc, ab_cyto, ab_cell, mt_cell, area_cell, area_nuc, log_red_fucci_zeroc_rescale, log_green_fucci_zeroc_rescale, mockbulk_phases)
Expand Down Expand Up @@ -239,29 +241,27 @@ def defined_phases_scatter(phases_filtered, outfile):
plt.savefig(outfile)
plt.close()

def pseudotime_rna(adata, phases_filt):
def pseudotime_rna(adata):
'''
Model the pseudotime of FUCCI markers measured by FACS for cell analyzed with single-cell RNA sequencing
Input: RNA-Seq data; cell cycle phase for each cell
Output: Cell cycle pseudotime for each cell; plots illustrating FUCCI intensities over pseudotime
'''
phases_validIntPhase = phases_filt[pd.notnull(phases_filt.Green530) & pd.notnull(phases_filt.Red585) & pd.notnull(phases_filt.Stage)]
utils.general_scatter_color(phases_validIntPhase["Green530"], phases_validIntPhase["Red585"], "log(Green530)", "log(Red585)",
phases_validIntPhase["Stage"].apply(lambda x: get_phase_colormap()[x]), "Cell Cycle Phase", False, "", "figures/FucciPlotByPhase_RNA.png",
adataValidPhase = adata.obs["phase"] != "N/A"
utils.general_scatter_color(adata.obs["Green530"][adataValidPhase], adata.obs["Red585"][adataValidPhase], "log(Green530)", "log(Red585)",
adata.obs["phase"][adataValidPhase].apply(lambda x: get_phase_colormap()[x]), "Cell Cycle Phase", False, "", "figures/FucciPlotByPhase_RNA.png",
get_phase_colormap())

phases_validInt = phases_filt[pd.notnull(phases_filt.Green530) & pd.notnull(phases_filt.Red585)] # stage may be null
polar_coord_results = fucci_polar_coords(phases_validInt["Green530"], phases_validInt["Red585"], "RNA")
pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_inds, pol_sort_inds_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results
polar_coord_results = fucci_polar_coords(adata.obs["Green530"], adata.obs["Red585"], "RNA")
pol_sort_norm_rev, centered_data, pol_sort_centered_data0, pol_sort_centered_data1, pol_sort_phi, pol_sort_inds, pol_sort_inds_reorder, pol_sort_phi_reorder, more_than_start, less_than_start, start_pt, g1_end_pt, g1s_end_pt, cart_data_ur, R_2, start_phi = polar_coord_results

# Assign cells a pseudotime and visualize in fucci plot
pol_unsort = np.argsort(pol_sort_inds_reorder)
fucci_time = pol_sort_norm_rev[pol_unsort]
adata.obs["fucci_time"] = fucci_time
phases_validInt["fucci_time"] = fucci_time

plt.figure(figsize=(6,5))
plt.scatter(phases_validInt["Green530"], phases_validInt["Red585"], c = phases_validInt["fucci_time"], cmap="RdYlGn")
plt.scatter(adata.obs["Green530"], adata.obs["Red585"], c = adata.obs["fucci_time"], cmap="RdYlGn")
cbar = plt.colorbar()
cbar.set_label('Pseudotime',size=20)
cbar.ax.tick_params(labelsize=18)
Expand All @@ -275,9 +275,21 @@ def pseudotime_rna(adata, phases_filt):
# Save fucci times, so they can be used in other workbooks
fucci_time_inds = np.argsort(adata.obs["fucci_time"])
fucci_time_sort = np.take(np.array(adata.obs["fucci_time"]), fucci_time_inds)
cell_time_sort = pd.DataFrame({"fucci_time" : fucci_time_sort, "cell" : np.take(adata.obs_names, fucci_time_inds)})
cell_time_sort.to_csv("output/CellPseudotimes.csv")
pd.DataFrame({"fucci_time": fucci_time}).to_csv("output/fucci_time.csv")
cell_time_sort = pd.DataFrame({
"fucci_time" : fucci_time_sort,
"cell" : np.take(adata.obs["Well_Plate"], fucci_time_inds)})
cell_time_sort.to_csv("output/CellPseudotimes.csv", index=False)
cell_polar_coords = pd.DataFrame({
"cell" : np.take(adata.obs["Well_Plate"], fucci_time_inds),
"phase_by_facs_gating" : adata.obs["phase"][fucci_time_inds],
"raw_green530" : adata.obs["MeanGreen530"][fucci_time_inds],
"raw_red585" : adata.obs["MeanRed585"][fucci_time_inds],
"green530_lognorm_rescale" : adata.obs["Green530"][fucci_time_inds],
"red585_lognorm_rescale" : adata.obs["Red585"][fucci_time_inds],
"polar_coord_phi" : pol_sort_phi[pol_unsort][fucci_time_inds],
"fucci_time_hrs" : fucci_time_sort * fucci.TOT_LEN})
cell_polar_coords.to_csv("output/fucci_coords.csv", index=False)
pd.DataFrame({"fucci_time": fucci_time}).to_csv("output/fucci_time.csv", index=False)

def pseudotime_umap(adata, isIsoform=False):
'''
Expand Down
13 changes: 7 additions & 6 deletions SingleCellProteogenomics/RNACellCycleDependence.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,16 +53,16 @@ def plot_expression_umap(adata, genelist, outfolder):
plt.close()
adata.obs.drop(gene, 1)

def plot_expression_facs(genelist, normalized_exp_data, phases, var_names, outfolder):
def plot_expression_facs(adata, genelist, normalized_exp_data, outfolder):
'''
Displaying relative expression on the FUCCI plot (log-log intensity of FUCCI markers)
Output: a folder of FUCCI plots for each gene in a list
'''
phases_validInt = phases[pd.notnull(phases.Green530) & pd.notnull(phases.Red585)]
if not os.path.exists(outfolder): os.mkdir(outfolder)
var_name_list= list(adata.var_names)
for gene in genelist:
nexp = normalized_exp_data[:,list(var_names).index(gene)]
plt.scatter(phases_validInt["Green530"], phases_validInt["Red585"], c=nexp)
nexp = normalized_exp_data[:,var_name_list.index(gene)]
plt.scatter(adata.obs["Green530"], adata.obs["Red585"], c=nexp)
plt.tight_layout()
cbar = plt.colorbar()
cbar.ax.get_yaxis().labelpad = 15
Expand Down Expand Up @@ -286,13 +286,14 @@ def compare_genes_to_isoforms(adata, ccdprotein, ccdtranscript, adata_nonccdprot
print(f"{sum(ccdIsoformsPerCcdProtein > 0) / sum(ccdprotein)}% of CCD proteins had at least one CCD transcript isoform")
print(f"{sum(ccdIsoformsPerNonCcdProtein > 0) / sum(adata_nonccdprotein)}% of non-CCD proteins had at least one CCD transcript isoform")

def analyze_isoforms(adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp):
def analyze_isoforms(adata, ccdtranscript, wp_ensg, ccd_comp, nonccd_comp, u_plates):
'''Analyze the isoform-level results over the cell cycle'''
# Read in the data & QC analysis
valuetype, use_spikeins, biotype_to_use = "Tpms", False, "protein_coding"
adata_isoform, phases_isoform = RNADataPreparation.read_counts_and_phases(valuetype, use_spikeins, biotype_to_use, use_isoforms=True)
adata_isoform, phases_isoform = RNADataPreparation.read_counts_and_phases(valuetype, use_spikeins, biotype_to_use, u_plates, use_isoforms=True)
# RNADataPreparation.plot_pca_for_batch_effect_analysis(adata_isoform, "BeforeRemovingNoncycling_Isoform")
adata_isoform, phasesfilt_isoform = RNADataPreparation.qc_filtering(adata_isoform, do_log_normalize=True, do_remove_blob=True)
adata_isoform = RNADataPreparation.zero_center_fucci(adata_isoform)
# RNADataPreparation.plot_pca_for_batch_effect_analysis(adata_isoform, "AfterRemovingNoncycling_Isoform")
# FucciPseudotime.pseudotime_umap(adata_isoform, isIsoform=True)

Expand Down
Loading

0 comments on commit 7f73b9a

Please sign in to comment.