Commit

Edits for running on cluster (#6)
* edits for running on cluster

* add new options for protein and rna scripts

asdf

merge cluster

* add test to config

* change config name

* cluster option option

* fix merge conflict

* make_rsem_dataframe

* edits on cluster

* memory usage improvements

---------

Co-authored-by: Anthony Joseph Cesnik <cesnik@sh02-ln01.stanford.edu>
acesnik and Anthony Joseph Cesnik authored Mar 24, 2023
1 parent 9c0a35b commit 7118125
Showing 12 changed files with 115 additions and 110 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@ resources/ERCC.filtered.*
 results
 input
 **/.DS_Store
+workflow/slurm*
+__pycache__
41 changes: 18 additions & 23 deletions README.md
@@ -12,42 +12,37 @@ This data is downloaded automatically in this pipeline.
 
 ## Updating the Ensembl version
 
-The genome and Ensembl versions are located at the top of the file `Snakefile`.
+The genome and Ensembl versions are located at the top of the file `workflow/config/FucciSingleCell.yaml`.
 These can be updated, and the references will be downloaded automatically.
 
 ## Usage
 
 1) Clone repository and initialize submodules: `git clone --recurse-submodules https://github.com/CellProfiling/FucciSingleCellSeqPipeline.git && cd FucciSingleCellSeqPipeline/workflow`
 2) Install conda: https://docs.conda.io/en/latest/miniconda.html
-3) Install snakemake using conda: `conda install -c conda-forge -c bioconda snakemake-minimal`
-4) Run the workflow: `snakemake --use-conda --cores 24 --resources mem_mb=100000`, where you can substitute the max number of cores and max memory allocation. At least 54 GB of free memory should be available.
+3) Create and activate the setup environment: `conda env create -n fuccisetup -f envs/setup.yaml && conda activate fuccisetup`
+4) Run the workflow: `snakemake --use-conda --conda-frontend mamba --cores 24 --resources mem_mb=100000`, where you can substitute the max number of cores and max memory allocation. At least 54 GB of free memory should be available.
 
 ## Usage on cluster
 
+In place of installing conda, you may need to activate it as a module, such as with `module load conda`, and then follow the instructions to initialize it.
+
+Adapt `config/cluster_config.yaml` for your needs.
+
 In place of the last step above, you can use the scheduler like this:
-`snakemake -j 500 --cores 16 --cluster-config cluster_config.yaml --latency-wait 60 --keep-going --use-conda --cluster "sbatch -A {cluster.account} -t {cluster.time} -N {cluster.nodes} --cpus-per-task {threads} -p {cluster.partition}"`
-
-Replace 99 with the number of cores specified above in `workflow/rules/align.smk` and `workflow/rules/quant.smk`.
-
-Where `cluster_config.yaml` may look like this:
-```
-__default__:
-  account: sens2020535
-  partition: core
-  time: 2-0 # time limit for each job
-  nodes: 1
-  ntasks-per-node: 16 # Request n cores be allocated per node.
-  output: ../results/slurmout/spritz-%j.out
-  error: ../results/slurmerr/spritz-%j.err
-```
+`snakemake -j 500 --cores 16 --cluster-config config/cluster_config.yaml --latency-wait 60 --keep-going --use-conda --conda-frontend mamba --cluster "sbatch -t {cluster.time} -N {cluster.nodes} --cpus-per-task {threads} -p {cluster.partition}"`
 
 ## Usage on protected access cluster
 
-1) Clone repository and initialize submodules: `git clone --recurse-submodules https://github.com/CellProfiling/FucciSingleCellSeqPipeline.git && cd FucciSingleCellSeqPipeline/workflow`
-2) Install conda: https://docs.conda.io/en/latest/miniconda.html
-3) Install snakemake using conda: `conda install -c conda-forge -c bioconda snakemake-minimal`
-4) If running the pipeline on a protected access computer, predownload files by running `snakemake -j 16 ../results/setup.txt` on a machine with internet access.
-5) Make a tarball of the project with `cd ../.. && tar -czvf FucciSingleCellSeqPipeline.tar.gz FucciSingleCellSeqPipeline` and transfer it to the protected access cluster.
+1) Clone repository and initialize submodules on your local machine: `git clone --recurse-submodules https://github.com/CellProfiling/FucciSingleCellSeqPipeline.git && cd FucciSingleCellSeqPipeline/workflow`
+2) Install conda: https://docs.conda.io/en/latest/miniconda.html
+3) Create and activate the setup environment: `conda env create -n fuccisetup -f envs/setup.yaml && conda activate fuccisetup`
+4) If running the pipeline on a protected access computer, predownload files by running `snakemake -j 16 ../results/setup.txt` on a machine with internet access.
+5) Make a tarball of the project with `cd ../.. && tar -czvf FucciSingleCellSeqPipeline.tar.gz FucciSingleCellSeqPipeline` and transfer it to the protected access cluster.
+6) Load conda as a module on the protected access cluster, such as with `module load conda`, and follow the instructions to activate it.
+7) Create and activate the setup environment: `conda env create -n fuccisetup -f envs/setup.yaml && conda activate fuccisetup`
+8) Adapt `config/cluster_config.yaml` for your needs.
+9) Use the scheduler from snakemake like this:
+`snakemake -j 500 --cores 16 --cluster-config config/cluster_config.yaml --latency-wait 60 --keep-going --use-conda --conda-frontend mamba --cluster "sbatch -A {cluster.account} -t {cluster.time} -N {cluster.nodes} --cpus-per-task {threads} -p {cluster.partition}"`
 
 ## Citation
 
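On many SLURM clusters the scheduler command in the updated README is itself launched from a small batch script so it survives logout. A minimal sketch, assuming the `fuccisetup` environment from the README; the script name, wall time, and partition below are assumptions, not part of this commit:

```bash
#!/bin/bash
# Hypothetical driver script (run_pipeline.sh); submit with: sbatch run_pipeline.sh
#SBATCH -t 7-0        # wall time for the driver job itself (assumption)
#SBATCH -N 1
#SBATCH -p core       # partition name is site-specific (assumption)

module load conda
conda activate fuccisetup
cd FucciSingleCellSeqPipeline/workflow
snakemake -j 500 --cores 16 --cluster-config config/cluster_config.yaml \
    --latency-wait 60 --keep-going --use-conda --conda-frontend mamba \
    --cluster "sbatch -t {cluster.time} -N {cluster.nodes} --cpus-per-task {threads} -p {cluster.partition}"
```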
8 changes: 4 additions & 4 deletions workflow/Snakefile
@@ -1,7 +1,7 @@
-configfile: "config/SraAccList.yaml"
+configfile: "config/FucciSingleCell.yaml"
 
 GENOME_VERSION = "GRCh38"
-ENSEMBL_VERSION = "103"
+ENSEMBL_VERSION = config["ensembl_version"]
 SPECIES = config["species"]
 SPECIES_LOWER = SPECIES.lower()
 REF = f"{SPECIES}.{GENOME_VERSION}"
@@ -18,8 +18,8 @@ TEST_GENOME_FA = f"../resources/ensembl/202122.fa"
 TEST_ENSEMBL_GFF = f"../resources/ensembl/202122.gff3"
 TEST_ENSEMBL_GTF = f"../resources/ensembl/202122.gtf"
 
-# Uncomment to trigger testing
-#FA,GFF,GTF=TEST_GENOME_FA,TEST_ENSEMBL_GFF,TEST_ENSEMBL_GTF
+if config["test"]:
+    FA,GFF,GTF=TEST_GENOME_FA,TEST_ENSEMBL_GFF,TEST_ENSEMBL_GTF
 
 rule all:
     '''Final targets of run'''
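With the test switch now read from the config, a test run no longer requires editing the Snakefile; Snakemake's standard `--config` flag can override the key per invocation. A sketch:

```bash
# Override config["test"] for one run; the Snakefile then substitutes the small
# chromosome 20/21/22 test references (TEST_GENOME_FA/GFF/GTF) for the full genome.
snakemake --config test=True --use-conda --conda-frontend mamba --cores 4
```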
workflow/config/FucciSingleCell.yaml (renamed from workflow/config/SraAccList.yaml)
@@ -1,4 +1,6 @@
+ensembl_version: "109" # ensembl version
+species: "Homo_sapiens" # ensembl species name
+test: False
 sra:
 - SRR11286626
 - SRR11286627
10 changes: 10 additions & 0 deletions workflow/config/cluster_config.yaml
@@ -0,0 +1,10 @@
+__default__:
+  # account: sens2020535 # Can be specified as sbatch option with `-A {cluster.account}`
+  # partition: core
+  time: 2-0 # time limit for each job
+  nodes: 1
+  ntasks-per-node: 16 # Request n cores be allocated per node.
+  # memory: 112000
+  # chdir: /directory/to/change/to
+  output: ../results/slurmout/spritz-%j.out
+  error: ../results/slurmerr/spritz-%j.err
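Given these defaults, each job submitted by the README's `--cluster` template would expand to roughly the following (a hypothetical rendering for a rule requesting 16 threads). Note that `partition` and `account` are commented out above, so a template referencing `{cluster.partition}` or `{cluster.account}` needs those keys re-enabled, or the corresponding flags dropped:

```bash
# {cluster.time} -> 2-0, {cluster.nodes} -> 1, {threads} -> 16,
# with `partition: core` re-enabled (assumption):
sbatch -t 2-0 -N 1 --cpus-per-task 16 -p core <snakemake-generated jobscript>
```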
2 changes: 2 additions & 0 deletions workflow/envs/setup.yaml
@@ -1,4 +1,6 @@
 channels:
 - bioconda
+- conda-forge
 dependencies:
 - snakemake-minimal
+- mamba
3 changes: 2 additions & 1 deletion workflow/envs/velo.yaml
@@ -4,4 +4,5 @@ channels:
 dependencies:
 - velocyto.py==0.17.17
 - entrez-direct # for srr_lookup
-- scvelo==0.2.3
+- scvelo
+- scanpy==1.9.2
61 changes: 6 additions & 55 deletions workflow/rules/align.smk
@@ -11,7 +11,7 @@ rule star_genome_generate:
         genomedir=lambda w, output: os.path.dirname(output[0]),
         size="--genomeSAindexNbases 12" if FA == TEST_GENOME_FA else "",
         sjoptions="--sjdbOverhang 42" # 43bp reads for this experiment
-    threads: 99
+    threads: workflow.cores
     log: f"{STAR_REF_FOLDER}.genome_generate.log"
     benchmark: f"{STAR_REF_FOLDER}.genome_generate.benchmark"
     conda: "../envs/quant.yaml"
@@ -20,23 +20,12 @@ rule star_genome_generate:
         " --genomeFastaFiles {input.efa} {input.gfa} --sjdbGTFfile {input.gtf}"
         " {params.sjoptions} {params.size}) &> {log}"
 
-rule load_star_genome_firstpass:
-    input: suffix=f"{STAR_REF_FOLDER}/SA"
-    output: temp("../results/align/loaded_firstpass")
-    params: genomedir=lambda w, input: os.path.dirname(input.suffix)
-    resources: mem_mb = RAM_MB_REQ
-    log: "../results/align/loaded_firstpass.log"
-    benchmark: "../results/align/loaded_firstpass.benchmark"
-    conda: "../envs/quant.yaml"
-    shell: "(STAR --genomeLoad LoadAndExit --genomeDir {params} && touch {output}) &> {log}"
-
 rule star_firstpass:
     input:
-        "../results/align/loaded_firstpass",
         suffix=f"{STAR_REF_FOLDER}/SA",
         fastq="../results/fastq/{sra}.trim.fastq.gz",
     output: sj="../results/align/SJ1st/{sra}SJ.out.tab"
-    threads: 8
+    threads: workflow.cores / 2
     params:
         bam="--outSAMtype None",
         genomedir=lambda w, input: os.path.dirname(input.suffix),
@@ -45,27 +34,13 @@ rule star_firstpass:
     benchmark: "../results/align/SJ1st/{sra}SJ.benchmark"
    conda: "../envs/quant.yaml"
     shell:
-        "(STAR --genomeLoad LoadAndKeep --genomeDir {params.genomedir}"
+        "(STAR --genomeLoad NoSharedMemory --genomeDir {params.genomedir}"
         " --runThreadN {threads} {params.bam} "
         " --outFileNamePrefix {params.outfolder}/{wildcards.sra}"
         " --readFilesIn <(zcat {input.fastq}) ) &> {log}"
 
-rule unload_firstpass_genome:
-    input:
-        suffix=f"{STAR_REF_FOLDER}/SA",
-        jj=expand("../results/align/SJ1st/{sra}SJ.out.tab", sra=config['sra'])
-    output: temp("../results/align/unloaded_firstpass")
-    params: genomedir=lambda w, input: os.path.dirname(input.suffix)
-    log: "../results/align/unloaded_firstpass.log"
-    benchmark: "../results/align/unloaded_firstpass.benchmark"
-    conda: "../envs/quant.yaml"
-    shell:
-        "(STAR --genomeLoad Remove --genomeDir {params.genomedir} && "
-        "touch {output}) &> {log}"
-
 rule star_genome_generate_secondpass:
     input:
-        unload="../results/align/unloaded_firstpass",
         efa="../resources/ERCC.fa",
         gfa=FA,
         gtf=f"{GTF}.fix.gtf",
@@ -75,7 +50,7 @@ rule star_genome_generate_secondpass:
         genomedir=lambda w, output: os.path.dirname(output.suffix),
         size="--genomeSAindexNbases 12" if FA == TEST_GENOME_FA else "",
         sjoptions="--sjdbOverhang 42 --limitSjdbInsertNsj 1200000" # 43bp reads for this experiment
-    threads: 99
+    threads: workflow.cores
     log: f"{STAR_REF_FOLDER}SecondPass.generate.log"
     benchmark: f"{STAR_REF_FOLDER}SecondPass.generate.benchmark"
     conda: "../envs/quant.yaml"
@@ -84,19 +59,8 @@ rule star_genome_generate_secondpass:
         " --genomeFastaFiles {input.efa} {input.gfa} --sjdbGTFfile {input.gtf} {params.sjoptions}"
         " --sjdbFileChrStartEnd {input.jj} {params.size}) &> {log}"
 
-rule load_star_genome_2pass:
-    input: suffix=f"{STAR_REF_FOLDER}SecondPass/SA"
-    output: temp("../results/align/loaded_2pass")
-    params: genomedir=lambda w, input: os.path.dirname(input.suffix),
-    resources: mem_mb = RAM_MB_REQ
-    log: "../results/align/loaded_2pass.log"
-    benchmark: "../results/align/loaded_2pass.benchmark"
-    conda: "../envs/quant.yaml"
-    shell: "(STAR --genomeLoad LoadAndExit --genomeDir {params} && touch {output}) &> {log}"
-
 rule star_2pass:
     input:
-        "../results/align/loaded_2pass",
         suffix=f"{STAR_REF_FOLDER}SecondPass/SA",
         fastq="../results/fastq/{sra}.trim.fastq.gz"
     output:
@@ -107,9 +71,9 @@ rule star_2pass:
         final="../results/align/{sra}Log.final.out",
         progress="../results/align/{sra}Log.progress.out",
     wildcard_constraints: sra="[A-Z0-9]+"
-    threads: 8
+    threads: workflow.cores / 2
    params:
-        mode="--runMode alignReads --genomeLoad LoadAndKeep",
+        mode="--runMode alignReads --genomeLoad NoSharedMemory",
         bam=f"--outSAMtype BAM SortedByCoordinate --outBAMcompression 10 --limitBAMsortRAM {str(RAM_B_REQ)}",
         transcripts=f"--quantMode TranscriptomeSAM",
         outfolder=lambda w, output: os.path.dirname(output.sj),
@@ -122,16 +86,3 @@ rule star_2pass:
         " --runThreadN {threads} {params.bam} {params.transcripts}"
         " --outFileNamePrefix {params.outfolder}/{wildcards.sra}"
         " --readFilesIn <(zcat {input.fastq}) ) &> {log}"
-
-rule unload_2pass_genome:
-    input:
-        suffix=f"{STAR_REF_FOLDER}SecondPass/SA",
-        bam=expand("../results/align/{sra}Aligned.sortedByCoord.out.bam", sra=config['sra']),
-    output: temp("../results/align/unloaded_2pass")
-    params: genomedir=lambda w, input: os.path.dirname(input.suffix)
-    log: "../results/align/unloaded_2pass.log"
-    benchmark: "../results/align/unloaded_2pass.benchmark"
-    conda: "../envs/quant.yaml"
-    shell:
-        "(STAR --genomeLoad Remove --genomeDir {params.genomedir} && "
-        "touch {output}) &> {log}"
6 changes: 5 additions & 1 deletion workflow/rules/quant.smk
@@ -18,7 +18,7 @@ rule rsem_reference:
         fa=f"{RSEM_REF_FOLDER}RsemReference.transcripts.fa",
         suffix=f"{RSEM_REF_FOLDER}SA"
     params: prefix=lambda w, output: output.grp[:-4]
-    threads: 99 # can do fewer without STAR references
+    threads: workflow.cores # can do fewer without STAR references
     log: "../resources/ensembl/prepare-reference.log"
     benchmark: "../resources/ensembl/prepare-reference.benchmark"
     conda: "../envs/quant.yaml"
@@ -72,6 +72,8 @@ rule make_gene_rsem_dataframe:
     conda: "../envs/quant.yaml"
     log: "../results/quant/Counts.log"
     benchmark: "../results/quant/Counts.benchmark"
+    threads: workflow.cores # make sure this gets a full node, since it requires ~60 GB RAM
+    resources: mem_mb=112000
     shell:
         "python scripts/make_rsem_dataframe.py genes {input.gtf} {input.srr_lookup} {input.series_matrix}"
         " {output.counts} {output.tpms} {output.names} {output.ids} &> {log}"
@@ -93,6 +95,8 @@ rule make_isoform_rsem_dataframe:
     conda: "../envs/quant.yaml"
     log: "../results/quant/Counts_Isoforms.log"
     benchmark: "../results/quant/Counts_Isoforms.benchmark"
+    threads: workflow.cores # make sure this gets a full node, since it requires ~80 GB RAM
+    resources: mem_mb=112000
     shell:
         "python scripts/make_rsem_dataframe.py isoforms {input.gtf} {input.srr_lookup} {input.series_matrix}"
         " {output.counts} {output.tpms} {output.names} {output.ids} &> {log}"
24 changes: 20 additions & 4 deletions workflow/rules/singlecellproteogenomics.smk
@@ -34,17 +34,17 @@ rule SingleCellProteogenomics_copyResults:
         "../results/velocity/a.loom",
         "../results/velocity/a.obs_names.csv",
         ],
-    output: directory("../results/newinput/RNAData_bkup/")
+    output: directory("../results/RNAData_bkup/")
     conda: "../envs/downloads.yaml"
-    params: rnadir=lambda w, output: output[0].split("_")[0],
+    params: rnadir=lambda w, input: os.path.join(input[0], "RNAData")
     log: "../results/SingleCellProteogenomics_copyResults.log"
     benchmark: "../results/SingleCellProteogenomics_copyResults.benchmark"
     shell:
-        "(cp -r {params.rnadir}/* {output} && "
+        "(mkdir -p {output} && cp -r {params.rnadir}/* {output} && "
         "cp {input.quant} {input.ids} {input.velocity} {params.rnadir}) &> {log}"
 
 rule SingleCellProteogenomics:
-    input: "../results/newinput/RNAData_bkup/"
+    input: "../results/RNAData_bkup/"
     output: "../results/output/pickles/mockbulk_phases.npy"
     conda: "../../SingleCellProteogenomics/workflow/envs/enviro.yaml"
     log: "../results/output/1_ProteinCellCycleClusters.log"
@@ -67,6 +67,22 @@ rule RNAFucciPseudotime:
     threads: workflow.cores # use a whole node
     shell: "cd ../results && python ../SingleCellProteogenomics/3_RNAFucciPseudotime.py --quicker &> {log}"
 
+rule TemporalDelay:
+    input: "../results/output/RNAPseudotimePlotting.csv.gz"
+    output: "../results/output/diff_max_pol.csv"
+    conda: "../../SingleCellProteogenomics/workflow/envs/enviro.yaml"
+    log: "../results/output/4_TemporalDelay.log"
+    threads: workflow.cores # use a whole node
+    shell: "cd ../results && python ../SingleCellProteogenomics/4_TemporalDelay.py &> {log}"
+
+rule ProteinProperties:
+    input: "../results/output/diff_max_pol.csv"
+    output: "../results/output/upstreamKinaseResults.csv"
+    conda: "../../SingleCellProteogenomics/workflow/envs/enviro.yaml"
+    log: "../results/output/5_ProteinProperties.log"
+    threads: workflow.cores # use a whole node
+    shell: "cd ../results && python ../SingleCellProteogenomics/5_ProteinProperties.py &> {log}"
+
 rule SingleCellProteogenomics_final:
     input:
         protein="../results/output/ProteinPseudotimePlotting.csv.gz",
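The two new rules extend the numbered analysis chain (3_RNAFucciPseudotime → 4_TemporalDelay → 5_ProteinProperties) purely through file dependencies, so asking Snakemake for the final output pulls in the intermediate steps. A usage sketch:

```bash
# Request the ProteinProperties output; TemporalDelay runs first because
# its diff_max_pol.csv is declared as this rule's input.
snakemake ../results/output/upstreamKinaseResults.csv --use-conda --cores 24
```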
7 changes: 4 additions & 3 deletions workflow/rules/velo.smk
@@ -1,16 +1,17 @@
 rule velocity_analysis:
     '''Run the velocity analysis'''
     input:
-        unload_genome="../results/align/unloaded_2pass",
-        gtf=f"{GTF}.fix.gtf",
+        gtf=ENSEMBL_GTF,
         bams=expand("../results/align/{sra}Aligned.sortedByCoord.out.bam", sra=config['sra'])
     output: "../results/velocity/a.loom"
     conda: "../envs/velo.yaml"
     log: "../results/velocity.log"
     benchmark: "../results/velocity.benchmark"
     params:
         outfolder=lambda w, output: os.path.dirname(output[0]),
-        sampleid="a" # used this historically, so just keeping it consistent
+        sampleid="a", # used this historically, so just keeping it consistent
+    threads: 1
+    resources: mem_mb=7000
     shell:
         "velocyto run-smartseq2 --outputfolder {params.outfolder}"
         " --sampleid {params.sampleid} {input.bams} {input.gtf} &> {log}"
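The unpinned `scvelo` and `scanpy==1.9.2` from `envs/velo.yaml` are enough for a quick sanity check of the loom file this rule writes. A sketch; the layer names are velocyto's defaults rather than anything this workflow configures:

```python
# Load the velocyto output and confirm the spliced/unspliced layers are present.
import scvelo as scv

adata = scv.read("../results/velocity/a.loom", cache=False)
print(adata)                      # AnnData summary: cells x genes
print(list(adata.layers.keys()))  # expect 'spliced', 'unspliced', 'ambiguous'
```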