Merge pull request #19 from Phil9S/dev

development branch - issue resolutions
Phil9S · Oct 25, 2023 · 541f7d4 · 541f7d4
2 parents 82d7810 + 3a4d219
commit 541f7d4
Show file tree

Hide file tree

Showing 11 changed files with 119 additions and 82 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+logs/*
+.snakemake/
+profile/slurm/__pycache__/
diff --git a/README.md b/README.md
@@ -39,8 +39,6 @@ Samples passing all filtering criteria then undergo read downsampling to the spe
     - [Fit selection](#fit-selection)
   + [Step 7 Stage 2](#step-7-stage-2)
   + [Step 8 QC2](#step-8-qc2)
-  + [Step 9 Stage 3 - Cohort-level filtering](#step-9-stage-3---cohort-level-filtering)
-* [Addendum](#addendum)
 
 ## Compatibility
 
@@ -62,50 +60,31 @@ cd swgs-absolutecn/
 
 ### Step 2 Install conda
 
-Run the following to install conda whilst following the on-screen instructions.
-- When asked to run `conda init` and initialise conda please respond with 'yes'
+This pipeline utilises micromamba or conda installation environments to manage the software packages and versions. Please make sure that either micromamba or conda is installed and available on your system.
+We recommend using micromamba; if you use conda instead, the installed version should ideally use the libmamba solver, as the required environment contains a large number of packages.
 
-```
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-bash Miniconda3-latest-Linux-x86_64.sh -p $HOME/miniconda/
-source ~/.bashrc
-rm Miniconda3-latest-Linux-x86_64.sh
-```
-
-See installing [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) for more information.
+See installing [micromamba](https://mamba.readthedocs.io/en/latest/user_guide/micromamba.html) or [conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) for more information.
 
-#### For those with Conda already installed
+### Step 3 Installing environment & additional dependencies
 
-For systems where conda is already available the following requirements need to be met:
-- conda must be available on the PATH
-- conda version `4.8.3' or greater
-- the location of the installation folder is required
+From within the repository directory, run the `install_env.sh` script to generate a conda environment and install custom packages:
 
-Check the installed version of conda using the following:
 ```
-conda -V
+./install_env.sh mamba
 ```
-*If this command does not work then conda is also not available on the PATH*
-
-Find your installation directory using the following:
+or
 ```
-whereis conda | sed 's%condabin/conda%%'
+./install_env.sh conda $HOME/miniconda/
 ```
 
-### Step 3 Installing additional dependencies
+If you used a previously installed conda build please use the conda or miniconda installation directory when running this section instead of '$HOME/miniconda/' to correctly initialise the conda environment.
 
-From within the repository directory, run the `install_env.sh` script to generate a conda environment and install custom packages:
+The newly installed environment can be activated using the following:
 
 ```
-install_env.sh $HOME/miniconda/
+micromamba activate swgs-abscn
 ```
-
-If you used a previously installed conda build please use the conda or miniconda installation directory when running this section instead of '$HOME/miniconda/' to correctly initialise the conda environment.
-
-*To be replaced with a built-in snakemake solution once possible*
-
-The newly installed conda environment can be activated using the following:
-
+or
 ```
 conda activate swgs-abscn
 ```
@@ -218,16 +197,6 @@ Where * is replaced with the profile matching your cluster/server configuration.
 
 To confirm the quality of the downsampled absolute copy number profiles generated by stage 2, an evaluation of the output fits should be performed as described previously in step 6 [here](resources/quality_control_guide.md). The output `{}` should be updated accordingly, with poor fits being excluded.
 
-### Step 9 Stage 3 - Cohort-level filtering
-
-COMING SOON
-
-This stage has not yet been implemented and performs profile filtering based on step 8 and cohort-level outlier detection to remove specious samples.
-
-## Addendum
-
-None
-
 ## Authors
 
 * Philip Smith (@phil9s)

diff --git a/config/conda.yaml b/config/conda.yaml
@@ -17,3 +17,4 @@ dependencies:
   - r-foreach=1.5.0
   - bioconductor-qdnaseq
   - bioconductor-qdnaseq.hg19
+  - r-remotes
diff --git a/config/config.yaml b/config/config.yaml
@@ -3,12 +3,12 @@
 samplesheet: "sample_sheet.tsv"
 
 # Output location
-out_dir: "/mnt/scratcha/fmlab/smith10/"
+out_dir: "/mnt/scratcha/fmlab/smith10/britroc/"
 
 # Bin sizes
 # By default any in [1,5,15,30,50,100,500,1000]
 bins:
-- 100
+- 30
 project_name: "britroc"
 
 # Pipeline parameters

diff --git a/install_env.sh b/install_env.sh
@@ -1,52 +1,104 @@
 #!/bin/bash
 
+set -e 
+
 script="install_env"
 # VARS
+INSTALL_BIN="mamba"
 CONDA_VERSION=4.8.2
+MICROMAMBA_VERSION=1.3.1
 CONDA_VERSION_N=$(sed 's/\.//g' <<< "${CONDA_VERSION}")
+MICROMAMBA_VERSION_N=$(sed 's/\.//g' <<< "${MICROMAMBA_VERSION}")
 INSTALLED_CONDA_VERSION=$(conda -V | sed 's/conda //' | sed 's/\.//g')
+INSTALLED_MICROMAMBA_VERSION=$(micromamba --version | sed 's/conda //' | sed 's/\.//g')
 
-# Check conda available
-if ! [ -x "$(command -v conda)" ]; then
-	echo -e "[${script}] Error: conda has not been installed or is not available on PATH"
+## Default behaviour
+if [[ $# -eq 0 ]]; then
+	echo -e "[${script}] No arguments given. Specify a environment bin; mamba or conda."
 	exit 1
 fi
 
-# Check conda version (rudamentary)
-if [ "${INSTALLED_CONDA_VERSION}" -lt "${CONDA_VERSION_N}" ]; then
-	echo -e "[${script}] Error - conda/miniconda is older than the required version"
-	echo -e "[${script}] Required: conda ${CONDA_VERSION} / Installed: $(conda -V)"
-	exit	
-fi
+#echo -e "${MICROMAMBA_VERSION}"
+#echo -e "${MICROMAMBA_VERSION_N}"
+#echo -e "${INSTALLED_MICROMAMBA_VERSION}"
+
+if [ $1 == "mamba" ]; then
+    # Check MICROMAMBA available
+    if ! [ -x "$(command -v micromamba)" ]; then
+        echo -e "[${script}] Error: micromamba has not been installed or is not available on \$PATH"
+        exit 1
+    fi
+
+    # Check micromamba version (rudimentary)
+    if [ "${INSTALLED_MICROMAMBA_VERSION}" -lt "${MICROMAMBA_VERSION_N}" ]; then
+        echo -e "[${script}] Error - micromamba is older than the required version"
+        echo -e "[${script}] Required: ${MICROMAMBA_VERSION} / Installed: $(micromamba --version)"
+        exit 1
+    fi
+
+    echo -e "[${script}] Creating env"
+    # micromamba install
+    micromamba env create -y -f config/conda.yaml
+    eval "$(micromamba shell hook --shell=bash)"
+    micromamba activate swgs-abscn
+
+elif [ $1 == "conda" ]; then
+    # Check conda available
+    if ! [ -x "$(command -v conda)" ]; then
+        echo -e "[${script}] Error: conda has not been installed or is not available on PATH"
+        exit 1
+    fi
 
-# Check provided conda directory
-if [ "$#" -lt 1 ]; then
-	echo -e "[${script}] Error - conda/miniconda directory missing"
-	echo -e "[${script}] Usage example './install_env.sh /home/user/miniconda3/'"
+    # Check conda version (rudimentary)
+    if [ "${INSTALLED_CONDA_VERSION}" -lt "${CONDA_VERSION_N}" ]; then
+	echo -e "[${script}] Error - conda/miniconda is older than the required version"
+	echo -e "[${script}] Required: ${CONDA_VERSION} / Installed: $(conda -V)"
 	exit 1
-fi
+    fi
+
+    # Check provided conda directory
+    if [ "$#" -lt 2 ]; then
+        echo -e "[${script}] Error - conda/miniconda directory missing"
+        echo -e "[${script}] Usage example './install_env.sh /home/user/miniconda3/'"
+        exit 1
+    fi
 
-# Set conda directory
-CONDA_DIR=$1
+    # conda install
+    # Set conda directory
+    if ! [ -d "$2" ]; then
+        echo -e "[${script}] Error - conda/miniconda directory not correct"
+        echo -e "[${script}] Usage example './install_env.sh conda /home/user/miniconda3/'"
+        exit 1
+    else 
+        CONDA_DIR=$2
+    fi
+    conda env create -f config/conda.yaml
+    DIR=${CONDA_DIR}etc/profile.d/conda.sh
+    if [ -f "${DIR}" ]; then
+        echo -e "[${script}] Initialising conda env"
+        source ${CONDA_DIR}etc/profile.d/conda.sh
+    else
+        echo -e "[${script}] Error: Unable to find conda intialisation script"
+        echo -e "[${script}] Error: Make sure to provide the miniconda directory - e.g. '/home/user/miniconda3/'"
+        echo -e "[${script}] Usage example './install_env.sh /home/user/miniconda3/'"
+        exit 1
+    fi
+    conda activate swgs-abscn
 
-echo -e "[${script}] Creating conda env"
-conda env create -f config/conda.yaml
-DIR=${CONDA_DIR}etc/profile.d/conda.sh
-if [ -f "${DIR}" ]; then
-	echo -e "[${script}] Initialising conda env"
-	source ${CONDA_DIR}etc/profile.d/conda.sh
 else
-	echo -e "[${script}] Error: Unable to find conda intialisation script"
-	echo -e "[${script}] Error: Make sure to provide the miniconda directory - e.g. '/home/user/miniconda3/'"
-	echo -e "[${script}] Usage example './install_env.sh /home/user/miniconda3/'"
-	exit
+    echo -e "[${script}] Error - neither mamba or conda specified"
+    exit 1
 fi
 
-echo -e "[${script}] Activating conda env"
-conda activate swgs-abscn
-echo -e "[${script}] Adding modified QDNAseq package"
-R_LIB_PATH=$(Rscript resources/libpath.R)
-cp -r resources/packages/QDNAseqmod/ ${R_LIB_PATH} 
+echo -e "[${script}] Installing modified QDNAseq package"
+Rscript -e 'remotes::install_github(repo = "markowetzlab/QDNAseqmod",quiet=TRUE,upgrade=FALSE)'
 echo -e "[${script}] Testing package installation"
 Rscript resources/package_load.R
-echo -e "[${script}] conda env ready and all packages installed!"
+echo -e "[${script}] env ready and all packages installed!"
+if [ $1 == "mamba" ]; then
+    echo -e "[${script}] activate with 'micromamba activate swgs-abscn'"
+else
+    echo -e "[${script}] activate with 'conda activate swgs-abscn'"
+fi
+
+# END
diff --git a/profile/slurm/config.yaml b/profile/slurm/config.yaml
@@ -1,5 +1,5 @@
 ---
-jobs: 200
+jobs: 100
 jobscript: slurm-jobscript.sh
 cluster: slurm-submit.py
 local-cores: 1

diff --git a/rules/common.smk b/rules/common.smk
@@ -55,6 +55,7 @@ validate(config, schema="../schemas/config.schema.yaml")
 
 #Predefine output folders
 OUT_DIR=config["out_dir"]
+OUT_DIR=os.path.join(OUT_DIR,"")
 
 #Load sample sheet and set index
 samplesheet = pd.read_table(config["samplesheet"],dtype={'PATIENT_ID': str,'SAMPLE_ID':str,'TP53freq':float}).set_index(["SAMPLE_ID"], drop=False)

diff --git a/scripts/downsampleBams.R b/scripts/downsampleBams.R
@@ -20,7 +20,7 @@ fit.qc.filt <- fit.qc %>%
   filter(use == TRUE)
 
 fit.qc.filt$total.reads <- read.data$total.reads[match(x = fit.qc.filt$SAMPLE_ID,read.data$name)]
-fit.qc.filt$ratio <- round(fit.qc.filt$downsample_depth / fit.qc.filt$total.reads,digits = 2)
+fit.qc.filt$ratio <- round(fit.qc.filt$downsample_depth / fit.qc.filt$total.reads,digits = 3)
 
 perc <- fit.qc.filt %>%
    filter(SAMPLE_ID == sample_name) %>%
@@ -35,5 +35,7 @@ if( perc <= 0.96){
 
  }else{
   cmd.copy <- paste0("cp ",bam_in," ",outname)
+  cmd.index <- paste0("samtools index ",outname)
   system(cmd.copy)
+  system(cmd.index)
  }
diff --git a/scripts/ploidy_purity_search_standard_error.R b/scripts/ploidy_purity_search_standard_error.R
@@ -154,7 +154,7 @@ print(sample)
 res<-foreach(i=1:length(ploidies),.combine=rbind) %do%
 {
         ploidy<-ploidies[i]
-        print(1)
+        #print(ploidy)
         rowres<-foreach(j=1:length(purities),.combine=rbind)%do%
         {
             purity<-purities[j]

diff --git a/scripts/qdnaseq_mod.R b/scripts/qdnaseq_mod.R
@@ -23,7 +23,7 @@ bins <- getBinAnnotations(binSize=bin.size)
 # Samples to smooth
 smoothed_samples <- as.character(metadata$SAMPLE_ID[metadata$smooth == "TRUE"])
 
-readCounts <- mclapply(X=bam_list, FUN=binReadCounts, bins=bins, mc.cores=ncores)
+readCounts <- mclapply(X=bam_list, FUN=binReadCounts, bins=bins, mc.cores=ncores,chunkSize=1e7)
 
 ## if copyNumbersSegment file exists read it else generate it
 # apply filter based on loess fit residuals and encode/1000-genome blacklist
@@ -78,4 +78,8 @@ smooth_samples <- function(obj){
 
 copyNumbersSegmentedSmooth <- mclapply(X=copyNumbersSegmented, FUN=smooth_samples, mc.cores=ncores)
 
+if(is.na(pData(object=copyNumbersSegmentedSmooth[[1]])$loess.span)){
+	stop(paste0(sampleNames(copyNumbersSegmented)," BAM failed loess fitting. Remove this file from sample sheet"))
+}
+
 saveRDS(copyNumbersSegmentedSmooth,outname)
diff --git a/scripts/qdnaseq_mod_ds.R b/scripts/qdnaseq_mod_ds.R
@@ -27,7 +27,7 @@ bins <- getBinAnnotations(binSize=bin.size)
 # Samples to smooth
 smoothed_samples <- as.character(metadata$SAMPLE_ID[metadata$smooth == "TRUE"])
 
-readCounts <- mclapply(X=bam_list, FUN=binReadCounts, bins=bins, mc.cores=ncores)
+readCounts <- mclapply(X=bam_list, FUN=binReadCounts, bins=bins, mc.cores=ncores,chunkSize=1e7)
 ## if copyNumbersSegment file exists read it else generate it
 # apply filter based on loess fit residuals and encode/1000-genome blacklist
 readCountsFiltered <- mclapply(X=readCounts, FUN=applyFilters, mc.cores=1)
@@ -89,4 +89,9 @@ smooth_samples <- function(obj){
 
 copyNumbersSegmentedSmooth <- mclapply(X=copyNumbersSegmented, FUN=smooth_samples, mc.cores=ncores)
 
+if(is.na(pData(object=copyNumbersSegmentedSmooth[[1]])$loess.span)){
+        stop(paste0(sampleNames(copyNumbersSegmented)," BAM failed loess fitting. Remove this file from sample sheet"))
+}
+
+
 saveRDS(copyNumbersSegmentedSmooth,outname)