diff --git a/.devcontainer.json b/.devcontainer.json new file mode 100644 index 0000000..253be95 --- /dev/null +++ b/.devcontainer.json @@ -0,0 +1,41 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu +{ + "name": "Ubuntu", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile + "image": "mcr.microsoft.com/devcontainers/base:jammy", + "features": { + "ghcr.io/rocker-org/devcontainer-features/miniforge:1": { + "version": "latest", + "variant": "Mambaforge" + } + }, + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. + "postCreateCommand": "mamba env create -n happler -f dev-env.yml && conda run -n happler poetry install", + + // Configure tool-specific properties. + "customizations": { + "vscode": { + "extensions": ["ms-python.python"], + "settings": { + "python.condaPath": "/opt/conda/condabin/conda", + "python.defaultInterpreterPath": "/opt/conda/envs/happler/bin/python", + "python.terminal.activateEnvironment": true, + "python.terminal.activateEnvInCurrentTerminal": true, + "python.venvFolders": ["/home/vscode/.cache/pypoetry/virtualenvs"], + "terminal.integrated.environmentChangesRelaunch": true, + "terminal.integrated.hideOnStartup": "always" + } + } + } + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. + // "remoteUser": "root" +} \ No newline at end of file diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..2fbe6d9 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,13 @@ +## Checklist + +* [ ] I've checked to ensure there aren't already other open [pull requests](../../../pulls) for the same update/change +* [ ] I've prefixed the title of my PR according to [the conventional commits specification](https://www.conventionalcommits.org/). If your PR fixes a bug, please prefix the PR with `fix: `. Otherwise, if it introduces a new feature, please prefix it with `feat: `. If it introduces a breaking change, please add an exclamation before the colon, like `feat!: `. If the scope of the PR changes because of a revision to it, please update the PR title, since the title will be used in our CHANGELOG. +* [ ] At the top of the PR, I've [listed any open issues that this PR will resolve](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword). For example, "resolves #0" if this PR resolves issue #0 +- [ ] I've explained my changes in a manner that will make it possible for both users and maintainers of happler to understand them +* [ ] I have followed the [contributing guidelines](https://happler.readthedocs.io/en/stable/project_info/contributing.html#how-to-fix-a-bug-or-implement-a-new-feature) +* [ ] I have adhered to the [style guidelines](https://happler.readthedocs.io/en/stable/project_info/contributing.html#style) +* [ ] I've added tests for any new functionality. 
Or, if this PR fixes a bug, I've added test(s) that replicate it
+* [ ] I've updated the relevant documentation and checked that the newly built documentation is formatted properly
+* [ ] All functions, modules, classes etc. still conform to [numpy docstring standards](https://numpydoc.readthedocs.io/en/latest/format.html)
+* [ ] (if applicable) I've updated the pyproject.toml file with any changes I've made to happler's dependencies, and I've run `poetry lock --no-update` to ensure the lock file stays up to date and that our dependencies are locked to their minimum versions
+* [ ] In the body of this PR, I've included a short note to the reviewer highlighting one or two items that might deserve their focus
diff --git a/.github/workflows/constraints.txt b/.github/workflows/constraints.txt
deleted file mode 100644
index cff7619..0000000
--- a/.github/workflows/constraints.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-pip==22.2.2
-nox==2022.8.7
-nox-poetry==1.0.1
-poetry==1.1.15
-virtualenv==20.16.6
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 91dcb79..00cb76b 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,7 +1,6 @@
 name: Tests

-on:
-  - pull_request
+on: [pull_request, workflow_call]

 jobs:
   tests:
@@ -11,13 +10,15 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { python: "3.7", os: "ubuntu-latest", session: "lint" }
-          - { python: "3.7", os: "ubuntu-latest", session: "tests" }
+          - { python: "3.8", os: "ubuntu-latest", session: "lint" }
           - { python: "3.8", os: "ubuntu-latest", session: "tests" }
           - { python: "3.9", os: "ubuntu-latest", session: "tests" }
           - { python: "3.10", os: "ubuntu-latest", session: "tests" }
-          # - { python: "3.10", os: "windows-latest", session: "tests" }
-          # - { python: "3.10", os: "macos-latest", session: "tests" }
+          - { python: "3.11", os: "ubuntu-latest", session: "tests" }
+          - { python: "3.12", os: "ubuntu-latest", session: "tests" }
+          # - { python: "3.11", os: "windows-latest", session: "tests" }
+          # - { python: "3.9", os: "macos-latest", session: "tests" }
+          - { python: "3.8", os: "ubuntu-latest", session: "size" }

     env:
       NOXSESSION: ${{ matrix.session }}

@@ -26,90 +27,75 @@
     steps:
       - name: Check out the repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4

-      - name: Set up Python ${{ matrix.python }}
-        uses: actions/setup-python@v3
+      - name: Setup Mambaforge
+        uses: conda-incubator/setup-miniconda@v3
         with:
-          python-version: ${{ matrix.python }}
+          activate-environment: happler
+          miniforge-variant: Mambaforge
+          auto-activate-base: false
+          miniforge-version: latest
+          use-mamba: true

-      - name: Upgrade pip
-        run: |
-          pip install --constraint=.github/workflows/constraints.txt pip
-          pip --version
-      - name: Upgrade pip in virtual environments
-        shell: python
-        run: |
-          import os
-          import pip
-          with open(os.environ["GITHUB_ENV"], mode="a") as io:
-              print(f"VIRTUALENV_PIP={pip.__version__}", file=io)
-      - name: Install Poetry
+      - name: Get Date
+        id: get-date
+        run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
+        shell: bash
+
+      - name: Cache Conda env
+        uses: actions/cache@v3
+        with:
+          path: ${{ env.CONDA }}/envs
+          key:
+            conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-${{ hashFiles('dev-env.yml') }}-${{ env.CACHE_NUMBER }}
+        env:
+          # Increase this value to reset cache if dev-env.yml has not changed
+          CACHE_NUMBER: 0
+        id: cache
+
+      - name: Install dev environment
+        run:
+          mamba env update -n happler -f dev-env.yml
+        if: steps.cache.outputs.cache-hit != 'true'
+
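+      # To make the "Cache Conda env" key above concrete, here is a purely
+      # hypothetical rendering of it for an ubuntu runner on 2024-06-01:
+      #   conda-Linux--X64--20240601-<hash of dev-env.yml>-0
+      # A new date, an edit to dev-env.yml, or a bump to CACHE_NUMBER each
+      # produce a new key and force the environment to be rebuilt.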
+ - name: Try to build happler + shell: bash -el {0} run: | - pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry - poetry --version - - name: Install Nox + poetry build --no-ansi + + - name: Check distribution size + if: matrix.session == 'size' run: | - pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox - pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry - nox --version - - name: Run Nox + du -csh dist/* + # check that the generated dist/ directory does not exceed 0.3 MB + # if this check fails, it's because you forgot to list large files in a "tool.poetry.exclude" section of our pyproject.toml + # https://python-poetry.org/docs/pyproject/#include-and-exclude + [ $(du -b dist | cut -f1) -lt 300000 ] + + - name: Run tests with nox + if: matrix.session != 'size' + shell: bash -el {0} run: | - nox --python=${{ matrix.python }} + nox --verbose --python=${{ matrix.python }} + - name: Upload coverage data if: always() && matrix.session == 'tests' uses: "actions/upload-artifact@v3" with: name: coverage-data path: ".coverage.*" + large-files: name: File sizes runs-on: ubuntu-latest steps: - name: Check out the repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 + - name: Check for large files uses: actionsdesk/lfs-warning@v3.2 with: token: ${{ secrets.GITHUB_TOKEN }} # Optional filesizelimit: 500000b labelName: large-files - - # coverage: - # runs-on: ubuntu-latest - # needs: tests - # steps: - # - name: Check out the repository - # uses: actions/checkout@v3 - - # - name: Set up Python - # uses: actions/setup-python@v3 - # with: - # python-version: "3.10" - - # - name: Upgrade pip - # run: | - # pip install --constraint=.github/workflows/constraints.txt pip - # pip --version - # - name: Install Poetry - # run: | - # pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry - # poetry --version - # - name: Install Nox - # run: | - # pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox - # pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry - # nox --version - # - name: Download coverage data - # uses: actions/download-artifact@v3 - # with: - # name: coverage-data - - # - name: Combine coverage data and display human readable report - # run: | - # nox --session=coverage - # - name: Create coverage report - # run: | - # nox --session=coverage -- xml - # - name: Upload coverage report - # uses: codecov/codecov-action@v3.1.0 diff --git a/.gitignore b/.gitignore index ada245f..9210f10 100644 --- a/.gitignore +++ b/.gitignore @@ -12,7 +12,10 @@ dist/ analysis/data analysis/out analysis/log +analysis/Rplots.pdf +analysis/myenv.RData .snakemake .ipynb_checkpoints # vscode .vscode +venv/ diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 35cdbe0..07c4eb8 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -2,18 +2,26 @@ # Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details -# Note: I used https://github.com/readthedocs/readthedocs.org/issues/4912#issuecomment-664002569 for inspiration +# Note: I used https://docs.readthedocs.io/en/stable/build-customization.html#install-dependencies-with-poetry for inspiration version: 2 +build: + os: "ubuntu-22.04" + tools: + python: "3.8" + jobs: + post_create_environment: + # Install poetry + # https://python-poetry.org/docs/#installing-manually + - pip install poetry + post_install: + # Install dependencies with 'docs' 
dependency group
+      # https://python-poetry.org/docs/managing-dependencies/#dependency-groups
+      # VIRTUAL_ENV needs to be set manually for now.
+      # See https://github.com/readthedocs/readthedocs.org/pull/11152/
+      - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install --only main,docs
+
 sphinx:
   configuration: docs/conf.py
-
-python:
-  version: 3.7
-  install:
-    - method: pip
-      path: .
-      extra_requirements:
-        - docs
-
+  fail_on_warning: true
diff --git a/analysis/README.md b/analysis/README.md
index 7b5012d..5a61c1b 100644
--- a/analysis/README.md
+++ b/analysis/README.md
@@ -1,4 +1,4 @@
-[![Snakemake](https://img.shields.io/badge/snakemake-≥6.7.0-brightgreen.svg?style=flat-square)](https://snakemake.bitbucket.io)
+[![Snakemake](https://img.shields.io/badge/snakemake-≥8.12.0-brightgreen.svg?style=flat-square)](https://snakemake.bitbucket.io)
 # download
 Execute the following command.
@@ -8,9 +8,9 @@ git clone https://github.com/aryarm/happler
 You can also download example data for the pipeline. See [the config file](config/config.yml) for links and instructions.
 # setup
-The pipeline is written as a Snakefile which can be executed via [Snakemake](https://snakemake.readthedocs.io). For reproducibility, we recommend installing the version that we used (6.7.0):
+The pipeline is written as a Snakefile which can be executed via [Snakemake](https://snakemake.readthedocs.io). For reproducibility, we recommend installing the version that we used (8.12.0):
 ```
-conda create -n snakemake -c conda-forge --no-channel-priority 'bioconda::snakemake==6.7.0'
+conda create -n snakemake -c conda-forge --no-channel-priority 'bioconda::snakemake==8.12.0'
 ```
 `snakemake` will [automatically install all dependencies](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) of the pipeline upon its first execution using `conda`.
@@ -25,9 +25,9 @@ conda create -n snakemake -c conda-forge --no-channel-priority 'bioconda::snakemake==6.7.0'
 ```
 ./run.bash &
 ```
- __or__ on a SLURM cluster:
+ __or__ on a SLURM cluster:
 ```
- qsub run.bash
+ sbatch run.bash
 ```
 ### Output
 All output of the pipeline will be placed in a new directory (`out/`, by default).
diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml
new file mode 100644
index 0000000..b5f23d9
--- /dev/null
+++ b/analysis/config/config-geuvadis.yml
@@ -0,0 +1,87 @@
+# This is the Snakemake configuration file that specifies paths
+# and options for the pipeline. Anybody wishing to use
+# the provided snakemake pipeline should first fill out this file with paths to
+# their own data, as the Snakefile requires it.
+# Every config option has reasonable defaults unless it is labeled as "required."
+# All paths are relative to the directory that Snakemake is executed in.
+# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)
+
+
+# Paths to a SNP-STR haplotype reference panel
+# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html
+# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}"
+# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory)
+# required!
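+# To illustrate the "{chr}" templating (hypothetical path, not part of this
+# config): for a locus on chromosome 19, a panel entry like
+#   ref_panel: "data/1kg.snp.str.chr{chr}.vcf.gz"
+# would be read by the pipeline as "data/1kg.snp.str.chr19.vcf.gz"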
+# ref_panel: "/projects/ps-gymreklab/resources/datasets/snpstr/1kg.snp.str.chr{chr}.vcf.gz"
+# snp_panel: "/projects/ps-gymreklab/resources/datasets/ukbiobank/array_imputed/pfile_converted/chr{chr}.pgen"
+snp_panel: "data/geuvadis/geuvadis_ensemble_phasing.pgen"
+# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz"
+
+# Path to a list of samples to exclude from the analysis
+# There should be one sample ID per line
+# exclude_samples: data/ukb_random_samples_exclude.tsv
+
+# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these:
+# https://github.com/odelaneau/shapeit4/tree/master/maps
+# The map file should use the same reference genome as the reference panel VCFs
+# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz
+
+# A "locus" is a string with a contig name, a colon, the start position, a dash, and
+# the end position or a BED file with a ".bed" file ending
+# There are different simulation modes that you can use:
+# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position
+# 2. "snp" - a SNP follows the same format as "str"
+# 3. "hap" - a haplotype
+# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype
+# 5. "run" - execute happler on a locus without simulating anything
+# The STR and SNP positions should be contained within the locus.
+# The positions should be provided in the same coordinate system as the reference
+# genome of the reference panel VCFs
+# The contig should correspond with the contig name from the {chr} wildcard in the VCF
+# required! and unless otherwise noted, all attributes of each mode are required
+# locus: 19:45401409-46401409 # center: 45901409 (APOe4)
+locus: data/geuvadis/geuvadis_eqtl_genes.full.liftover.bed
+modes:
+  str:
+    pos: 19:45903857 # STR_691361
+  snp:
+    pos: 19:45910672 # rs1046282
+  hap:
+    alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
+    beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+  ld_range:
+    reps: 1
+    min_ld: 0
+    max_ld: 1
+    step: 0.1
+    min_af: 0.25
+    max_af: 0.75
+    # beta: [0.35]
+    beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+    alpha: [0.05]
+    random: false # whether to also produce random haplotypes
+  run:
+    pheno: data/geuvadis/phenos/{trait}.pheno
+    SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen
+    # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional
+mode: run
+
+# Covariates to use if they're needed
+# Otherwise, they're assumed to be regressed out of the phenotypes
+# Note: the finemapping methods won't be able to handle these covariates
+# covar: data/geuvadis/5PCs_sex.covar
+
+# Discard rare variants with a MAF below this number
+# Defaults to 0 if not specified
+min_maf: 0.1
+
+# Sample sizes to use
+# sample_size: [500, 1000, 1500, 2000, 2500]
+# sample_size: 777
+
+# Whether to exclude the causal variant from the set of genotypes provided to the
+# finemapping methods. Set this to true if you're interested in seeing how the
+# methods perform when the causal variant is absent from the data.
+# Defaults to false if not specified
+# You can also provide a list of booleans, if you want to test both values
+exclude_causal: [true, false]
diff --git a/analysis/config/config-ukb.yml b/analysis/config/config-ukb.yml
new file mode 100644
index 0000000..7b04fcc
--- /dev/null
+++ b/analysis/config/config-ukb.yml
@@ -0,0 +1,54 @@
+# This is the Snakemake configuration file that specifies paths
+# and options for the pipeline. Anybody wishing to use
+# the provided snakemake pipeline should first fill out this file with paths to
+# their own data, as the Snakefile requires it.
+# Every config option has reasonable defaults unless it is labeled as "required."
+# All paths are relative to the directory that Snakemake is executed in.
+# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)
+
+
+# Paths to a SNP haplotype reference panel (like the 1000 Genomes)
+# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}"
+# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory)
+# required!
+ref_panel: "/tscc/projects/ps-gymreklab/resources/datasets/1000Genomes/phase3VCFs/ALL.chr{chr}.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz"
+snp_panel: "/tscc/projects/ps-gymreklab/resources/datasets/ukbiobank/array_imputed/pfile_converted/chr{chr}.pgen"
+
+# Path to a list of samples to exclude from the analysis
+# There should be one sample ID per line
+exclude_samples: data/ukb/samples/remove.txt
+
+# If SNPs are unphased, provide the path to a beagle map file like these:
+# https://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/
+# The map file should use the same reference genome as the reference panel VCFs
+phase_map: data/genetic_maps/plink.chr{chr}.GRCh37.map
+
+# A "locus" is a string with a contig name, a colon, the start position, a dash, and
+# the end position or a BED file with a ".bed" file ending
+# There are different simulation modes that you can use:
+# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position
+# 2. "snp" - a SNP follows the same format as "str"
+# 3. "hap" - a haplotype
+# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype
+# 5. "run" - execute happler on a locus without simulating anything
+# The STR and SNP positions should be contained within the locus.
+# The positions should be provided in the same coordinate system as the reference
+# genome of the reference panel VCFs
+# The contig should correspond with the contig name from the {chr} wildcard in the VCF
+# required! and unless otherwise noted, all attributes of each mode are required
+locus: data/ukb/phenos/platelet_count.bed
+modes:
+  run:
+    pheno: data/ukb/phenos/{trait}.resid.pheno
+mode: run
+
+# Discard rare variants with a MAF below this number
+# Defaults to 0 if not specified
+min_maf: 0.00005
+
+# Whether to exclude the causal variant from the set of genotypes provided to the
+# finemapping methods. Set this to true if you're interested in seeing how the
+# methods perform when the causal variant is absent from the data.
+# Defaults to false if not specified +# You can also provide a list of booleans, if you want to test both values +exclude_causal: [true, false] diff --git a/analysis/config/config.yml b/analysis/config/config.yml deleted file mode 100644 index ecbfdcc..0000000 --- a/analysis/config/config.yml +++ /dev/null @@ -1,81 +0,0 @@ -# This is the Snakemake configuration file that specifies paths and -# and options for the pipeline. Anybody wishing to use -# the provided snakemake pipeline should first fill out this file with paths to -# their own data, as the Snakefile requires it. -# Every config option has reasonable defaults unless it is labeled as "required." -# All paths are relative to the directory that Snakemake is executed in. -# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/) - - -# Paths to a SNP-STR haplotype reference panel -# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html -# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}" -# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory) -# required! -# ref_panel: "/projects/ps-gymreklab/resources/datasets/snpstr/1kg.snp.str.chr{chr}.vcf.gz" -# snp_panel: "/projects/ps-gymreklab/resources/datasets/ukbiobank/array_imputed/pfile_converted/chr{chr}.pgen" -snp_panel: "../tests/data/19_45401409-46401409_1000G.pgen" -str_panel: "/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz" - -# Path to a list of samples to exclude from the analysis -# There should be one sample ID per line -# exclude_samples: data/ukb_random_samples_exclude.tsv - -# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these: -# https://github.com/odelaneau/shapeit4/tree/master/maps -# The map file should use the same reference genome as the reference panel VCFs -# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz - -# A "locus" is a string with a contig name, a colon, the start position, a dash, and -# the end position -# There are different simulation modes that you can use: -# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position -# 2. "snp" - a SNP follows the same format as "str" -# 3. "hap" - a haplotype -# The STR and SNP positions should be contained within the locus. -# The positions should be provided in the same coordinate system as the reference -# genome of the reference panel VCFs -# The contig should correspond with the contig name from the {chr} wildcard in the VCF -# required! -locus: 19:45401409-46401409 # center: 45901409 (APOe4) -modes: - str: - pos: 19:45903857 # STR_691361 - snp: - pos: 19:45910672 # rs1046282 - hap: - alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672 - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] - ld_range: - reps: 1 - min_ld: 0 - max_ld: 1 - step: 0.1 - min_af: 0.25 - max_af: 0.75 - beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45] -mode: ld_range - -# locus: 1:98001984-99001984 -# modes: -# str: 1:98506615 -# snp: 1:98501984 -# locus: 10:104457618-105457618 -# modes: -# str: 10:104639652 -# snp: 10:104612335 -# locus: 16:56927279-57084844 - -# Discard rare variants with a MAF below this number -# Defaults to 0 if not specified -min_maf: 0.1 - -# Sample sizes to use -sample_size: [100, 1000, 10000, 100000, 500000] - -# Whether to include the causal variant in the set of genotypes provided to the -# finemapping methods. 
Set this to true if you're interested in seeing how the -# methods perform when the causal variant is absent from the data. -# Defaults to false if not specified -# You can also provide a list of booleans, if you want to test both values -exclude_causal: [true, false] diff --git a/analysis/config/config.yml b/analysis/config/config.yml new file mode 120000 index 0000000..34462af --- /dev/null +++ b/analysis/config/config.yml @@ -0,0 +1 @@ +config-ukb.yml \ No newline at end of file diff --git a/analysis/profile/default/config.yaml b/analysis/profile/default/config.yaml new file mode 100644 index 0000000..2c81180 --- /dev/null +++ b/analysis/profile/default/config.yaml @@ -0,0 +1,3 @@ +latency-wait: 60 +use-conda: true +conda-frontend: conda diff --git a/analysis/profile/slurm/config.yaml b/analysis/profile/slurm/config.yaml new file mode 100644 index 0000000..3014012 --- /dev/null +++ b/analysis/profile/slurm/config.yaml @@ -0,0 +1,11 @@ +latency-wait: 60 +use-conda: true +conda-frontend: conda + +executor: slurm +default-resources: + slurm_account: ddp268 + slurm_partition: condo + runtime: 30 + nodes: 1 + slurm_extra: "'--qos=condo'" diff --git a/analysis/run.bash b/analysis/run.bash index e25969e..793c01a 100755 --- a/analysis/run.bash +++ b/analysis/run.bash @@ -1,15 +1,15 @@ #!/usr/bin/env bash -#PBS -V -#PBS -d . -#PBS -q home-gymrek -#PBS -j oe -#PBS -o /dev/null -#PBS -N run.snakemake -#PBS -l nodes=1:ppn=1 -#PBS -l walltime=8:00:00 -#PBS -W group_list=gymreklab-group -#PBS -A gymreklab-group - +#SBATCH --export ALL +#SBATCH --partition condo +#SBATCH --account ddp268 +#SBATCH --qos condo +#SBATCH --job-name happler-smk +#SBATCH --nodes 1 +#SBATCH --ntasks 1 +#SBATCH --cpus-per-task 1 +#SBATCH --time 0:30:00 +#SBATCH --mem 2G +#SBATCH --output /dev/null # An example bash script demonstrating how to run the entire snakemake pipeline # This script creates two separate log files in the output dir: @@ -20,17 +20,17 @@ # Also, make sure that this script is executed from the directory that it lives in! out_path="out" -mkdir -p "$out_path/logs" +mkdir -p "$out_path" # clear leftover log files -echo ""> "$out_path/logs/log" +echo ""> "$out_path/log" # try to find and activate the snakemake conda env if we need it if ! command -v 'snakemake' &>/dev/null && \ command -v 'conda' &>/dev/null && \ [ "$CONDA_DEFAULT_ENV" != "snakemake" ] && \ conda info --envs | grep "$CONDA_ROOT/snakemake" &>/dev/null; then - echo "Snakemake not detected. Attempting to switch to snakemake environment." >> "out/logs/log" + echo "Snakemake not detected. Attempting to switch to snakemake environment." >> "out/log" eval "$(conda shell.bash hook)" conda activate snakemake fi @@ -39,36 +39,31 @@ fi # check: are we being executed from within qsub? if [ "$ENVIRONMENT" = "BATCH" ]; then snakemake \ - --cluster "qsub -d . 
-V -q {resources.queue} -l walltime={resources.runtime} -l nodes=1:ppn={threads} -j oe -o /dev/null -W group_list=gymreklab-group -A gymreklab-group " \ - --cluster-cancel "qdel {cluster.jobid}" \ - --default-resources 'runtime="00:30:00"' 'queue="condo"' \ - --latency-wait 60 \ - --use-conda \ - --conda-frontend mamba \ + --workflow-profile profile/slurm \ --rerun-trigger {mtime,params,input} \ + --notemp \ -k \ - -j 12 \ - -c 12 \ - "$@" &>"$out_path/logs/log" + -j 32 \ + -c 32 \ + "$@" &>"$out_path/log" else snakemake \ - --latency-wait 60 \ - --use-conda \ - --conda-frontend mamba \ - --notemp \ + --workflow-profile profile/default \ --rerun-trigger {mtime,params,input} \ + --notemp \ -k \ -c 4 \ - "$@" &>"$out_path/logs/log" + "$@" &>"$out_path/log" fi +# send a slack message to notify that the job has finished exit_code="$?" if command -v 'slack' &>/dev/null; then if [ "$exit_code" -eq 0 ]; then slack "snakemake finished successfully" &>/dev/null else - slack "snakemake simulate_gwas job failed" &>/dev/null - slack "$(tail -n4 "$out_path/logs/log")" &>/dev/null + slack "snakemake happler-smk job failed" &>/dev/null + slack "$(tail -n4 "$out_path/log")" &>/dev/null fi fi exit "$exit_code" diff --git a/analysis/run_geuvadis.bash b/analysis/run_geuvadis.bash deleted file mode 100755 index d3d078a..0000000 --- a/analysis/run_geuvadis.bash +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -#PBS -V -#PBS -d . -#PBS -q hotel -#PBS -j oe -#PBS -o /dev/null -#PBS -N run.happler -#PBS -l nodes=1:ppn=1 -#PBS -l walltime=1:00:00 - - -# An example bash script demonstrating how to run the entire happler pipeline -# This script creates a log file in the output dir: -# log - the basic happler log of completed rules - -# Before running this happler pipeline, remember to complete the config file -# with the required input info. -# Also, make sure that this script is executed from the directory that it lives in! - -out_path="out" - -# try to find and activate the happler conda env if we need it -if ! command -v 'happler' &>/dev/null && \ - command -v 'conda' &>/dev/null && \ - [ "$CONDA_DEFAULT_ENV" != "happler" ] && \ - conda info --envs | grep "$CONDA_ROOT/happler" &>/dev/null; then - echo "Snakemake not detected. Attempting to switch to happler environment." >> "out/logs/log" - eval "$(conda shell.bash hook)" - conda activate happler -fi - -happler run \ ---region '21:44694360-45694360' \ ---samples-file data/1000G_samples.tsv \ ---verbosity DEBUG \ ---discard-multiallelic \ -data/1000G.chr21.maf.vcf.gz \ -data/cstb.tsv \ ->"$out_path/geuvadis.hap" 2>"$out_path/logs/log" - -exit_code="$?" 
-if command -v 'slack' &>/dev/null; then - if [ "$exit_code" -eq 0 ]; then - slack "snakemake finished successfully" &>/dev/null - else - slack "snakemake run_geuvadis job failed" &>/dev/null - slack "$(tail -n4 "$out_path/logs/log")" &>/dev/null - fi -fi -exit "$exit_code" diff --git a/analysis/workflow/Snakefile b/analysis/workflow/Snakefile index 1d796a0..ac3074b 100644 --- a/analysis/workflow/Snakefile +++ b/analysis/workflow/Snakefile @@ -4,7 +4,7 @@ import snakemake.io as io from snakemake.utils import min_version ##### set minimum snakemake version ##### -min_version("7.14.0") +min_version("8.12.0") # IMPORT CONFIG VARIABLES @@ -21,12 +21,28 @@ def check_config(value, default=False, place=config, as_set=False): config["out"] = check_config("out", "out") config["min_maf"] = check_config("min_maf", 0) config["exclude_causal"] = check_config("exclude_causal", False, as_set=True) -locus = config["locus"].replace(":", "_") -# remove any trailing slashes in directories and set the variables -config["out"] = str(Path(config["out"])) + f"/{locus}" +mode = config["mode"] +# handle locus +if mode == "run": + pheno_set = set(glob_wildcards(Path(config["modes"][mode]["pheno"])).trait) +if Path(config["locus"]).exists() and Path(config["locus"]).suffix == ".bed": + with open(Path(config["locus"])) as f: + # the fourth column of the bed file must contain the name of the trait + locus_file = [ + (cols[3], f"{cols[0]}_{float(cols[1]):.0f}-{float(cols[2]):.0f}") + for cols in (line.strip().split() for line in f) + if not cols[0].startswith("#") and ( + mode != "run" or (cols[3] in pheno_set) + ) + ] + locus_traits, locus = list(zip(*locus_file)) +else: + locus = config["locus"].replace(":", "_") # convert the exclude_causal var into a dict for later exclude_causal = {("in", "ex")[val]: val for val in config["exclude_causal"]} -mode = config["mode"] +# remove any trailing slashes in directories +orig_out = str(Path(config["out"])) +config["out"] = str(Path(config["out"])) + "/{locus}" module genotypes: @@ -34,17 +50,45 @@ module genotypes: config: config use rule * from genotypes as genotypes_* -if check_config("phase_map") or check_config("exclude_samples") or not config["snp_panel"].endswith(".pgen"): - config["gts_snp_panel"] = rules.genotypes_vcf2plink.output.pgen -else: - config["gts_snp_panel"] = config["snp_panel"] config["gts_str_panel"] = rules.genotypes_subset_str.output.vcf - - -module simulate: - snakefile: "rules/simulate.smk" - config: config -use rule * from simulate as simulate_* +if check_config("sample_size"): + sampsize = config["sample_size"] + config["gts_snp_panel"] = rules.genotypes_subset.output.pgen + config["out"] += "/{sampsize}samples" +else: + config["gts_snp_panel"] = rules.genotypes_subset.input.pgen +config["random"] = False + +if mode != "run": + module simulate: + snakefile: "rules/simulate.smk" + config: config + use rule * from simulate as simulate_* + +if mode == "ld_range" and config["modes"][mode]["random"]: + config["random"] = True + module simulate_random: + snakefile: "rules/simulate.smk" + config: config + use rule * from simulate_random as simulate_random_* +del config["random"] + +covar_file = check_config("covar", default=[]) +if check_config("covar"): + covar_config = { + "covar": covar_file, + "out": orig_out, + } + if mode == "run": + covar_config["pheno"] = config["modes"][mode]["pheno"] + if check_config("pheno_matrix", place=config["modes"][mode]): + covar_config["pheno_matrix"] = config["modes"][mode]["pheno_matrix"] + module covariates: + 
snakefile: "rules/covariates.smk" + config: covar_config + use rule * from covariates as covariates_* + if "pheno_matrix" in covar_config: + covar_file = rules.covariates_merge.output.covar if mode in ("snp", "hap", "ld_range"): happler_config = { @@ -52,14 +96,29 @@ if mode in ("snp", "hap", "ld_range"): "hap_file": rules.simulate_transform.input.hap, "snp_panel": config["gts_snp_panel"], "out": config["out"] + "/happler/"+mode+"/{beta}", + "random": None, + "covar": covar_file, + "min_maf": check_config("min_maf", 0), } if mode in ("hap", "ld_range"): happler_config["snps"] = [] if mode == "hap": happler_config["snps"] = config["modes"][mode]["alleles"] - happler_config["causal_gt"] = rules.simulate_transform.output.pgen + happler_config["causal_gt"] = rules.simulate_transform.output if mode == "ld_range": - happler_config["out"] = config["out"] + "/happler/"+mode+"/ld_{ld}/{beta}" + happler_config["out"] = config["out"] + "/happler/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" +elif mode == "run": + happler_config = { + "pheno": config["modes"][mode]["pheno"], + "hap_file": [], + "snp_panel": config["gts_snp_panel"], + "out": config["out"] + f"/happler/{mode}/" + "{trait}", + "random": None, + "covar": covar_file, + "min_maf": check_config("min_maf", 0), + } + if check_config("SVs", place=config["modes"][mode]): + happler_config["SVs"] = config["modes"][mode]["SVs"] elif "str" == mode: pass else: @@ -71,32 +130,42 @@ module happler: use rule * from happler as happler_* merged_happler = rules.happler_merge.output.pgen -if mode in ("hap", "str"): - finemappers_config = { - "pheno": rules.simulate_simphenotype.output.pheno, - "out": config["out"] + "/finemappers/"+mode+"/{beta}", - "snp_panel": config["gts_snp_panel"], - } - if mode == "hap": - finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen - else: - raise ValueError("Not yet implemented operating mode 'str' in config") - module finemappers: - snakefile: "rules/finemappers.smk" - config: finemappers_config - use rule * from finemappers as finemappers_* -elif "ld_range" == mode: - pass -else: - raise ValueError(f"Unsupported operating mode '{mode}' in config") +if mode == "ld_range" and config["modes"][mode]["random"]: + happler_config["random"] = rules.simulate_random_transform.output + happler_config["random_hap"] = rules.simulate_random_transform.input.hap + happler_config["out"] = config["out"] + "/happler/"+mode+"/random/ld_{ld}/beta_{beta}/alpha_{alpha}" + module happler_random: + snakefile: "rules/happler.smk" + config: happler_config + use rule * from happler_random as happler_random_* + +# if mode in ("hap", "str", "ld_range"): +# finemappers_config = { +# "pheno": rules.simulate_simphenotype.output.pheno, +# "out": config["out"] + "/finemappers/"+mode+"/{beta}", +# "snp_panel": config["gts_snp_panel"], +# } +# if mode in ("hap", "ld_range"): +# finemappers_config["causal_gt"] = rules.simulate_transform.output.pgen +# if mode == "ld_range": +# finemappers_config["out"] = config["out"] + "/finemappers/"+mode+"/ld_{ld}/beta_{beta}/alpha_{alpha}" +# else: +# raise ValueError("Not yet implemented operating mode 'str' in config") +# module finemappers: +# snakefile: "rules/finemappers.smk" +# config: finemappers_config +# use rule * from finemappers as finemappers_* +# else: +# raise ValueError(f"Unsupported operating mode '{mode}' in config") plots_config = { "out": config["out"] + "/plots", "mode": mode, "mode_attrs": config["modes"][mode], "happler_hap": rules.happler_run.output.hap, - "causal_hap": 
rules.simulate_transform.input.hap, + "causal_hap": rules.simulate_transform.input.hap if mode != "run" else [], "snp_panel": config["gts_snp_panel"], + "happler_metrics": rules.happler_metrics.output.metrics, } if mode == "ld_range": plots_config["ld_range_checkpoint"] = checkpoints.simulate_create_hap_ld_range @@ -106,16 +175,80 @@ module plots: config: plots_config use rule * from plots as plots_* -rule all: - input: - expand( - [ - rules.happler_tree.output.png, - rules.happler_manhattan.output.png, - rules.finemappers_results.output.susie_pdf, - # rules.happler_results.output.susie_pdf, - ], - causal=(("ex",) if mode == "hap" else ("ex", "in")), - beta=config["modes"]["hap"]["beta"], - ) if mode != "ld_range" else [rules.plots_params.output.png] - default_target: True +if mode == "ld_range": + FINAL_OUTPUT = [rules.plots_params.output.png, rules.plots_metrics.output.png] +elif mode == "run": + checkpoint multiline: + input: + hps = expand( + rules.happler_run.output.hap, + zip, trait=locus_traits, locus=locus, allow_missing=True, + ), + params: + out = orig_out, + hp_path = Path(str(rules.happler_run.output.hap).format(locus="*", trait="*")), + output: + hps = orig_out + "/multiline.tsv", + resources: + runtime=3, + log: + orig_out + "/logs/multiline", + benchmark: + orig_out + "/bench/multiline", + conda: + "envs/default.yml" + shell: + "for i in {params.hp_path}; do if [ $(" + "grep '^V' $i | cut -f2 | sort | uniq -c | sed 's/^ *//' | awk -F ' ' '$1 > 1' | wc -l" + ") -ne 0 ]; then echo $i; fi; done | " + "sed 's+^{params.out}/++;s+/happler/run/+\\t+;s+/happler.hap$++' >{output} 2>{log}" + + def FINAL_OUTPUT(wildcards): + files = [ + rules.happler_tree.output.png, + rules.happler_linreg.output.png, + rules.happler_heatmap.output.png, + rules.happler_manhattan.output.png, + rules.happler_cond_linreg.output.png, + rules.happler_results.output.susie_pdf, + ] + if check_config("SVs", place=config["modes"][mode]): + files.append(rules.happler_sv_ld.output.ld) + with open(checkpoints.multiline.get().output["hps"], 'r') as file: + multiline_locus, multiline_traits = zip(*(f[:-1].split("\t") for f in file.readlines())) + return expand( + expand( + files, zip, trait=multiline_traits, locus=multiline_locus, + allow_missing=True, + ), ex=("in",), allow_missing=True, + ) + + rule all: + input: + FINAL_OUTPUT + default_target: True +else: + FINAL_OUTPUT = expand( + [ + rules.happler_tree.output.png, + rules.happler_manhattan.output.png, + # rules.finemappers_results.output.susie_pdf, + rules.happler_results.output.susie_pdf, + rules.plots_params.output.png, + ], + ex=(("in",) if mode == "hap" else ("ex", "in")), + beta=config["modes"]["hap"]["beta"], + allow_missing=True, + ) + +# If '--config debug=true' is specified, we won't try to build the entire DAG +# This can be helpful when you want to play around with a specific target output +if not check_config("debug", False) and mode != "run": + rule all: + input: + expand( + FINAL_OUTPUT, + locus=locus, + sampsize=config["sample_size"], + ) if check_config("sample_size") else expand(FINAL_OUTPUT, locus=locus) + default_target: True diff --git a/analysis/workflow/envs/shapeit.yml b/analysis/workflow/envs/beagle.yml similarity index 68% rename from analysis/workflow/envs/shapeit.yml rename to analysis/workflow/envs/beagle.yml index 510d294..210c6ae 100644 --- a/analysis/workflow/envs/shapeit.yml +++ b/analysis/workflow/envs/beagle.yml @@ -2,5 +2,5 @@ channels: - conda-forge - nodefaults dependencies: - - bioconda::shapeit4=4.2.2 + - 
bioconda::beagle=5.4_27May24.118 - bioconda::tabix==1.11 diff --git a/analysis/workflow/envs/default.yml b/analysis/workflow/envs/default.yml index bfee78d..f09130d 100644 --- a/analysis/workflow/envs/default.yml +++ b/analysis/workflow/envs/default.yml @@ -1,5 +1,6 @@ channels: - conda-forge + - bioconda - nodefaults dependencies: - bioconda::bcftools==1.13 @@ -10,6 +11,7 @@ dependencies: - conda-forge::sed==4.8 - conda-forge::bash==5.0.018 - conda-forge::gzip==1.10 - - bioconda::plink2==2.00a3.3 + - bioconda::plink2==2.00a5.10 - conda-forge::graphviz==2.50.0 + - bioconda::igv==2.17.4 - conda-forge::ipython # for debugging diff --git a/analysis/workflow/envs/haptools.yml b/analysis/workflow/envs/haptools.yml deleted file mode 100644 index 8eadbc6..0000000 --- a/analysis/workflow/envs/haptools.yml +++ /dev/null @@ -1,10 +0,0 @@ -channels: - - conda-forge - - bioconda - - nodefaults -dependencies: - - pysam>=0.19.1 - - pip - - gxx_linux-64 - - pip: - - git+https://github.com/CAST-genomics/haptools.git#haptools[files]==0.0.3 diff --git a/analysis/workflow/envs/peer.yml b/analysis/workflow/envs/peer.yml new file mode 100644 index 0000000..b72c6bb --- /dev/null +++ b/analysis/workflow/envs/peer.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - nodefaults +dependencies: + - bioconda::r-peer==1.3 diff --git a/analysis/workflow/rules/covariates.smk b/analysis/workflow/rules/covariates.smk new file mode 100644 index 0000000..09b7298 --- /dev/null +++ b/analysis/workflow/rules/covariates.smk @@ -0,0 +1,51 @@ +from pathlib import Path + + +out = config["out"] + "/covars" +logs = out + "/logs" +bench = out + "/bench" + + +def check_config(value, default=False, place=config, as_set=False): + """return true if config value exists and is true""" + value = place[value] if (value in place and place[value]) else default + return (set(value) if isinstance(value, list) else {value}) if as_set else value + +rule peer: + """compute PEER factors""" + input: + pheno_matrix = config["pheno_matrix"], + output: + covar=out+"/peer_factors.covar", + resources: + runtime=30, + queue="hotel", + log: + logs + "/peer", + benchmark: + bench + "/peer", + threads: 36 + conda: + "../envs/peer.yml" + shell: + "workflow/scripts/peer_residuals.R {input} {output} &>{log}" + + +rule merge: + """combine covariates into a merged .covar file""" + input: + covar = config["covar"], + peers = rules.peer.output.covar, + output: + covar=out+"/covars.covar" + resources: + runtime=3, + log: + logs + "/covariates", + benchmark: + bench + "/covariates", + conda: + "../envs/default.yml" + shell: + "join -t $'\\t' -j1 " + "<(sort -k1,1 {input.covar}) <(sort -k1,1 {input.peers}) > {output.covar}" diff --git a/analysis/workflow/rules/finemappers.smk b/analysis/workflow/rules/finemappers.smk index 2e9804e..a6f8030 100644 --- a/analysis/workflow/rules/finemappers.smk +++ b/analysis/workflow/rules/finemappers.smk @@ -16,13 +16,13 @@ rule susie: phen=config["pheno"], params: outdir=lambda wildcards, output: Path(output.susie).parent, - exclude_causal=lambda wildcards: config["causal_gt"] if \ + exclude_causal=lambda wildcards: expand(config["causal_gt"], **wildcards)[0] if \ int(exclude_causal[wildcards.causal]) else "NULL", output: susie=out + "/{causal}clude/susie.rds", resources: - runtime="0:15:00", - queue="hotel", + runtime=15, + slurm_partition='hotel', log: logs + "/{causal}clude/susie", benchmark: @@ -40,14 +40,14 @@ rule finemap: phen=config["pheno"], params: outdir=lambda wildcards, output: Path(output.finemap).parent, - 
exclude_causal=lambda wildcards: config["causal_gt"] if \ + exclude_causal=lambda wildcards: expand(config["causal_gt"], **wildcards)[0] if \ not int(exclude_causal[wildcards.causal]) else "NULL", output: sumstats=temp(out + "/{causal}clude/sumstats.rds"), finemap=out + "/{causal}clude/finemap.rds", resources: - runtime="0:45:00", - queue="hotel", + runtime=45, + slurm_partition='hotel', log: logs + "/{causal}clude/finemap", benchmark: @@ -64,17 +64,17 @@ rule results: gt=config["snp_panel"], finemap=rules.finemap.output.finemap, susie=rules.susie.output.susie, + causal_gt=config["causal_gt"], params: outdir=lambda wildcards, output: Path(output.susie_pdf).parent, - exclude_causal=lambda wildcards: not int(exclude_causal[wildcards.causal]), causal_hap="", - causal_gt=config["causal_gt"], output: finemap_pdf= out + "/{causal}clude/finemap.pdf", susie_pdf= out + "/{causal}clude/susie.pdf", resources: - runtime="1:00:00", - queue="hotel", + runtime=60, + slurm_partition='hotel', + slurm_extra='--qos=hotel', # susie_eff_pdf=temp( out + "/{causal}clude/susie_eff.pdf"), log: logs + "/{causal}clude/results", diff --git a/analysis/workflow/rules/genotypes.smk b/analysis/workflow/rules/genotypes.smk index 6bd0bb8..f7dfa77 100644 --- a/analysis/workflow/rules/genotypes.smk +++ b/analysis/workflow/rules/genotypes.smk @@ -6,34 +6,36 @@ logs = out + "/logs" bench = out + "/bench" -locus_chr = config["locus"].split(":")[0] -locus_start = config["locus"].split(":")[1].split('-')[0] -locus_end = config["locus"].split(":")[1].split('-')[1] - - def check_config(value, default=False, place=config, as_set=False): """return true if config value exists and is true""" value = place[value] if (value in place and place[value]) else default return (set(value) if isinstance(value, list) else {value}) if as_set else value +def parse_locus(locus): + """parse locus into chrom, start, end""" + chrom = locus.split("_")[0] + end = locus.split("-")[1] + start = locus.split("_")[1].split("-")[0] + return chrom, start, end + + rule plink2vcf: """ convert a PLINK file to VCF """ input: - pgen=lambda wildcards: expand(config["snp_panel"], chr=locus_chr)[0], + pgen=lambda wildcards: expand(config["snp_panel"], chr=parse_locus(wildcards.locus)[0])[0], params: pfile=lambda wildcards, input: str(Path(input.pgen).with_suffix('')), - out=lambda wildcards, output: str(Path(output.bcf).with_suffix('')), - start=locus_start, - end=locus_end, - chrom=locus_chr, + out=lambda wildcards, output: str(Path(output.log).with_suffix('')), + start=lambda wildcards: parse_locus(wildcards.locus)[1], + end=lambda wildcards: parse_locus(wildcards.locus)[2], + chrom=lambda wildcards: parse_locus(wildcards.locus)[0], output: vcf=temp(out + "/unphased.vcf.gz"), - bcf=out + "/unphased.bcf", - bcf_idx=out + "/unphased.bcf.csi", + vcf_idx=temp(out + "/unphased.vcf.gz.tbi"), log=temp(out + "/unphased.log"), resources: - runtime="0:20:00" + runtime=20, log: logs + "/plink2vcf" benchmark: @@ -44,41 +46,62 @@ rule plink2vcf: "plink2 --pfile {params.pfile} --out {params.out} --from-bp {params.start} " "--to-bp {params.end} --chr {params.chrom} --snps-only 'just-acgt' " "--export vcf bgz id-paste=iid &>{log} && " - "bcftools view -Ob -o {output.bcf} {output.vcf} &>>{log} && " - "tabix -p bcf {output.bcf} &>>{log}" + "tabix -p vcf {output.vcf} &>>{log}" def phase_gt_input(wildcards): - input_files = { 'map': config['phase_map'].format(chr=locus_chr) } + chrom = parse_locus(wildcards.locus)[0] + input_files = { 'map': config['phase_map'].format(chr=chrom) } if 
config["snp_panel"].endswith('.pgen'): - input_files['unphased'] = rules.plink2vcf.output.bcf + input_files['unphased'] = rules.plink2vcf.output.vcf else: - input_files['unphased'] = config["snp_panel"] - input_files['unphased_idx'] = input_files['unphased'] + ".csi" + input_files['unphased'] = config["snp_panel"].format(chr=chrom) + input_files['unphased_idx'] = input_files['unphased'] + ".tbi" + input_files["ref"] = config["ref_panel"].format(chr=chrom) + input_files["log"] = rules.plink2vcf.output.log return input_files +def get_num_variants(wildcards): + with open(expand(rules.plink2vcf.output.log, **wildcards)[0], "r") as file: + return int(next(line for line in file if 'variants rem' in line).split(" ")[0]) + + rule phase_gt: """ phase an unphased set of genotypes """ input: unpack(phase_gt_input) params: - locus=config["locus"], + locus=lambda wildcards: wildcards.locus.replace("_", ":"), + prefix=lambda wildcards, output: str(Path(output.log).with_suffix("")), output: - phased=out + "/phased.bcf", - phased_idx=out + "/phased.bcf.csi", + phased=out + "/phased.vcf.gz", + phased_idx=out + "/phased.vcf.gz.tbi", + log=temp(out + "/phased.log"), resources: - runtime="4:00:00" - threads: 8 + # We use a custom formula to determine the time requirements: + # This computes the number of seconds from the number of variants and then it + # divides by 59 seconds per hour + runtime=lambda wildcards: int( + (0.11417075326083094 * get_num_variants(wildcards) + 922.265855531002) / 59 + ), + # We use a custom formula to determine the memory requirements: + # This computes the number of GB from the number of variants and then it + # multiplies by 1050 MB per GB + mem_mb=lambda wildcards: int( + (0.0004624708472873269 * get_num_variants(wildcards) + 13.369088722181218) * 1050 + ), + threads: 32 log: logs + "/phase_gt" benchmark: bench + "/phase_gt" conda: - "../envs/shapeit.yml" + "../envs/beagle.yml" shell: - "shapeit4 --input {input.unphased} --map {input.map} --region {params.locus} " - "--output {output.phased} --thread {threads} --log {log} &>>{log} && " - "tabix -p bcf {output.phased} &>>{log}" + "beagle -Xmx{resources.mem_mb}m gt={input.unphased} ref={input.ref} " + "out={params.prefix} map={input.map} chrom={params.locus} nthreads={threads} " + "impute=false &>{log} && " + "tabix -p vcf {output.phased} &>>{log}" rule keep_samps: @@ -88,11 +111,13 @@ rule keep_samps: else config["snp_panel"], snp_vcf_idx=lambda wildcards: rules.phase_gt.output.phased_idx if check_config('phase_map') else config["snp_panel"] + ".tbi", - str_vcf=expand(config["str_panel"], chr=locus_chr), - str_vcf_idx=expand(config["str_panel"], chr=locus_chr), + str_vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0]), + str_vcf_idx=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0]), samp=lambda wildcards: config["exclude_samples"], output: samples=out+"/samples.tsv" + resources: + runtime=3, log: logs+"/keep_samps" conda: @@ -109,17 +134,24 @@ rule vcf2plink: else config["snp_panel"], vcf_idx=lambda wildcards: rules.phase_gt.output.phased_idx if check_config('phase_map') else config["snp_panel"] + ".tbi", - samples=rules.keep_samps.output.samples if check_config('exclude_samples') else [], + samples=( + (rules.keep_samps.output.samples if check_config("str_panel") else config["exclude_samples"]) + if check_config('exclude_samples') else [] + ), params: maf=config["min_maf"], prefix=lambda wildcards, output: Path(output.pgen).with_suffix(""), + samps=lambda 
wildcards, input: (" --" + ( + "keep " if check_config("str_panel") else "remove " + ) + input.samples) if check_config("exclude_samples") else "", output: pgen=out+"/snps.pgen", pvar=out+"/snps.pvar", psam=out+"/snps.psam", log=temp(out+"/snps.log"), resources: - runtime="0:03:00" + runtime=15, + threads: 2 log: logs + "/vcf2plink", benchmark: @@ -127,22 +159,21 @@ rule vcf2plink: conda: "../envs/default.yml" shell: - "plink2 --bcf {input.vcf} --maf {params.maf} --make-pgen " - "--keep {input.samples} " if check_config('exclude_samples') else "" - " --out {params.prefix} &>{log}" + "plink2 --vcf {input.vcf} --maf {params.maf} --geno 0 --make-pgen " + "--threads {threads}{params.samps} --out {params.prefix} &>{log}" rule subset_str: """ subset samples from a STR VCF """ input: - vcf=lambda wildcards: expand(config["str_panel"], chr=locus_chr)[0], - vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=locus_chr), + vcf=lambda wildcards: expand(config["str_panel"], chr=parse_locus(wildcards.locus)[0])[0], + vcf_idx=lambda wildcards: expand(config["str_panel"] + ".tbi", chr=parse_locus(wildcards.locus)[0]), samples=rules.keep_samps.output.samples, output: vcf=out+"/strs.bcf", vcf_idx=out+"/strs.bcf.csi", resources: - runtime="0:30:00" + runtime=30, log: logs + "/subset_str", benchmark: @@ -152,3 +183,45 @@ rule subset_str: shell: "bcftools view -S {input.samples} --write-index " "-Ob -o {output.vcf} {input.vcf}" + + +def subset_input(): + if check_config("phase_map") or check_config("exclude_samples") or not config["snp_panel"].endswith(".pgen"): + return { + "pgen": rules.vcf2plink.output.pgen, + "pvar": rules.vcf2plink.output.pvar, + "psam": rules.vcf2plink.output.psam, + } + else: + return { + "pgen": config["snp_panel"], + "pvar": Path(config["snp_panel"]).with_suffix(".pvar"), + "psam": Path(config["snp_panel"]).with_suffix(".psam"), + } + + +rule subset: + """subset the simulation dataset if needed""" + input: + **subset_input() + params: + prefix=lambda wildcards, input: Path(input.pgen).with_suffix(""), + out=lambda wildcards, output: Path(output.pgen).with_suffix(""), + sampsize=lambda wildcards: wildcards.sampsize, + output: + pgen=out+"/subset/{sampsize}.pgen", + pvar=out+"/subset/{sampsize}.pvar", + psam=out+"/subset/{sampsize}.psam", + log=temp(out+"/subset/{sampsize}.log"), + resources: + runtime=12, + threads: 2 + log: + logs + "/subset/{sampsize}", + benchmark: + bench + "/subset/{sampsize}", + conda: + "../envs/default.yml" + shell: + "plink2 --keep <(head -n {params.sampsize} {input.psam} | grep -Ev '^#' | cut -f1) " + "--make-pgen --pfile {params.prefix} --out {params.out} &>{log}" diff --git a/analysis/workflow/rules/happler.smk b/analysis/workflow/rules/happler.smk index 9f522e0..1eeb9d2 100644 --- a/analysis/workflow/rules/happler.smk +++ b/analysis/workflow/rules/happler.smk @@ -6,20 +6,47 @@ logs = out + "/logs" bench = out + "/bench" +# whether to include the happler haplotype ("in") or no haplotypes ("ex") +exclude_obs = {"in": 0, "ex": 1} +# or, if config["random"] is not None, this denotes +# whether to include a random haplotype ("in") or the causal haplotype ("ex") + +def check_config(value, default=False, place=config, as_set=False): + """return true if config value exists and is true""" + value = place[value] if (value in place and place[value]) else default + return (set(value) if isinstance(value, list) else {value}) if as_set else value + +def parse_locus(locus): + """parse locus into chrom, start, end""" + chrom = locus.split("_")[0] + end = 
locus.split("-")[1] + start = locus.split("_")[1].split("-")[0] + return chrom, start, end + + rule run: """ execute happler! """ input: gts=config["snp_panel"], pts=config["pheno"], + covar=config["covar"], params: - thresh=0.05, + thresh=lambda wildcards: 0.05 if "alpha" not in wildcards else wildcards.alpha, + region=lambda wildcards: wildcards.locus.replace("_", ":"), + covar=lambda wildcards, input: ("--covar " + input["covar"] + " ") if check_config("covar") else "", + maf = check_config("min_maf", 0), output: hap=out + "/happler.hap", + gz=out + "/happler.hap.gz", + idx=out + "/happler.hap.gz.tbi", dot=out + "/happler.dot", resources: - runtime="0:30:00", - queue="hotel", - threads: 6 + runtime=120, + # slurm_partition="hotel", + # slurm_extra="--qos=hotel", + # mem_mb=lambda wildcards, threads: threads*4.57, + mem_mb=lambda wildcards: 5000, + threads: 1 log: logs + "/run", benchmark: @@ -27,9 +54,10 @@ rule run: conda: "happler" shell: - "happler run -o {output.hap} --verbosity DEBUG " - "--discard-multiallelic " - "-t {params.thresh} --show-tree {input.gts} {input.pts} &>{log}" + "happler run -o {output.hap} --verbosity DEBUG --maf {params.maf} " + "--discard-multiallelic --region {params.region} {params.covar}" + "-t {params.thresh} --show-tree {input.gts} {input.pts} &>{log} && " + "haptools index -o {output.gz} {output.hap} &>>{log}" rule tree: @@ -41,7 +69,7 @@ rule tree: output: png=out + "/happler.png", resources: - runtime="0:04:00", + runtime=4, log: logs + "/tree", benchmark: @@ -52,20 +80,114 @@ rule tree: "dot -T{params.file_ext} {input.dot} -o {output.png} &>{log}" -rule transform: +rule cond_linreg: + """plot conditional regressions for a haplotype""" + input: + pgen=config["snp_panel"], + pvar=Path(config["snp_panel"]).with_suffix(".pvar"), + psam=Path(config["snp_panel"]).with_suffix(".psam"), + hap=rules.run.output.hap, + pts=config["pheno"], + params: + hap_id = "H0", + maf = check_config("min_maf", 0), + region=lambda wildcards: wildcards.locus.replace("_", ":"), + output: + png=out + "/cond_linreg.pdf", + resources: + runtime=15, + log: + logs + "/cond_linreg", + benchmark: + bench + "/cond_linreg", + conda: + "happler" + shell: + "workflow/scripts/conditional_regression_plots.py --verbosity DEBUG " + "--show-original --region {params.region} -o {output.png} -i {params.hap_id} " + "--maf {params.maf} {input.pgen} {input.pts} {input.hap} &>{log}" + + +rule heatmap: + """look at the LD pattern of the haplotype""" + # TODO: also include causal hap if one exists + input: + pgen=config["snp_panel"], + pvar=Path(config["snp_panel"]).with_suffix(".pvar"), + psam=Path(config["snp_panel"]).with_suffix(".psam"), + hap=rules.run.output.hap, + pts=config["pheno"], + params: + region=lambda wildcards: wildcards.locus.replace("_", ":"), + output: + png=out + "/heatmap.pdf", + resources: + runtime=4, + log: + logs + "/heatmap", + benchmark: + bench + "/heatmap", + conda: + "happler" + shell: + "workflow/scripts/heatmap_alleles.py --verbosity DEBUG " + "--region {params.region} -o {output.png} " + "{input.pgen} {input.hap} {input.pts} &>{log}" + + +rule igv: + """look at the haplotype in IGV""" input: hap=rules.run.output.hap, + prefs = "data/igv.prefs.properties", # todo move this into config + hprc_urls = "data/hprc.bam.list", + params: + chrom=lambda wildcards: parse_locus(wildcards.locus)[0], + outdir=lambda wildcards, output: Path(output.png).parent, + outfile=lambda wildcards, output: Path(output.png).name, + hap_id="H0", #you can change this to "" to use all haps + 
width=10000, + output: + bed=temp(out + "/happler_snps.bed"), + png=out + "/igv.png", + resources: + runtime=10, + log: + logs + "/igv", + benchmark: + bench + "/igv", + conda: + "../envs/default.yml" + shell: + "region={params.chrom}:$(" + "grep -Ev '^#' {input.hap} | awk -F $'\\t' " + "'BEGIN {{ min = \"inf\"; max = \"-inf\" }} " + "'\"$([ \"{params.hap_id}\" != \"\" ] && echo '$2 == \"{params.hap_id}\" ')\"'" + "{{ if ($3 < min) min = $3; if ($4 > max) max = $4 }} END {{ print (min-{params.width})\"-\"(max+{params.width}) }}'" + ") && grep -E '^V' {input.hap} | " + "awk -F $'\\t' -v 'OFS=\\t' " + "\"$([ \"{params.hap_id}\" != \"\" ] && echo '$2 == \"{params.hap_id}\" ')\"'" + "{{print {params.chrom}, $3, $4, $2\"_\"$5;}}' > {output.bed} && " + "igv -o {input.prefs} -b <(" + "sed 's+mySnapshotDirectory+{params.outdir}+;s/REGION/'$region'/;s+BED+{output.bed}+' workflow/scripts/igv.bat && " + "cut -f1,2 --output-delimiter=: {output.bed} | sort -u | sed 's/^/SORT BASE /' && echo 'snapshot {params.outfile}' && echo exit" + ") &>{log}" + + +rule transform: + input: + hap=rules.run.output.gz, pgen=config["snp_panel"], pvar=Path(config["snp_panel"]).with_suffix(".pvar"), psam=Path(config["snp_panel"]).with_suffix(".psam"), params: - hap_id="H1", + region=lambda wildcards: wildcards.locus.replace("_", ":"), output: pgen=temp(out + "/happler.pgen"), pvar=temp(out + "/happler.pvar"), psam=temp(out + "/happler.psam"), resources: - runtime="0:04:00" + runtime=4, log: logs + "/transform", benchmark: @@ -73,8 +195,78 @@ rule transform: conda: "happler" shell: - "haptools transform -o {output.pgen} --id {params.hap_id} {input.pgen} " - "{input.hap} &>{log}" + "haptools transform -o {output.pgen} --region {params.region} " + "{input.pgen} {input.hap} &>{log}" + + +rule linreg: + """plot a haplotype's genotypes against its phenotypes""" + input: + hap=rules.transform.output.pgen, + hap_pvar=rules.transform.output.pvar, + hap_psam=rules.transform.output.psam, + pts=config["pheno"], + params: + region=lambda wildcards: wildcards.locus.replace("_", ":"), + output: + png=out + "/linreg.pdf", + resources: + runtime=4, + log: + logs + "/linreg", + benchmark: + bench + "/linreg", + conda: + "happler" + shell: + "workflow/scripts/plot_gts.py --verbosity DEBUG " + "-o {output.png} --region {params.region} " + "{input.hap} {input.pts} 2>{log}" + + +rule sv_ld: + """compute LD between this haplotype and a bunch of SVs""" + input: + hap=rules.transform.output.pgen, + hap_pvar=rules.transform.output.pvar, + hap_psam=rules.transform.output.psam, + sv=lambda wildcards: config["SVs"], + sv_pvar=lambda wildcards: Path(config["SVs"]).with_suffix(".pvar"), + sv_psam=lambda wildcards: Path(config["SVs"]).with_suffix(".psam"), + params: + start=lambda wildcards: max(0, int(parse_locus(wildcards.locus)[1])-1000000), + end=lambda wildcards: int(parse_locus(wildcards.locus)[2])+1000000, + chrom=lambda wildcards: parse_locus(wildcards.locus)[0], + maf = check_config("min_maf", 0), + hapid="H0", + output: + ld=out + "/happler_svs.ld", + resources: + runtime=4, + log: + logs + "/sv_ld", + benchmark: + bench + "/sv_ld", + conda: + "happler" + shell: + "workflow/scripts/compute_pgen_ld.py --verbosity DEBUG " + "--region '{params.chrom}:{params.start}-{params.end}' --maf {params.maf} " + "--hap-id {params.hapid} -o /dev/stdout {input.sv} {input.hap} 2>{log} | " + "grep -Ev 'nan$' > {output} 2>>{log}" + + +def merge_hps_input(wildcards): + if config["random"] is None: + # include the hap that happler found + return 
rules.transform.output + else: + if exclude_obs[wildcards.ex]: + # exclude the random hap (and use the causal hap, instead) + return config["causal_gt"] + else: + # include the random hap + return config["random"] rule merge: @@ -82,46 +274,113 @@ rule merge: gts=config["snp_panel"], gts_pvar=Path(config["snp_panel"]).with_suffix(".pvar"), gts_psam=Path(config["snp_panel"]).with_suffix(".psam"), - hps=rules.transform.output.pgen, - hps_pvar=rules.transform.output.pvar, - hps_psam=rules.transform.output.psam, + hps=lambda wildcards: merge_hps_input(wildcards).pgen, + hps_pvar=lambda wildcards: merge_hps_input(wildcards).pvar, + hps_psam=lambda wildcards: merge_hps_input(wildcards).psam, + params: + region=lambda wildcards: wildcards.locus.replace("_", ":"), + maf = check_config("min_maf", 0), output: - pgen=out + "/merged.pgen", - pvar=out + "/merged.pvar", - psam=out + "/merged.psam", + pgen=temp(out + "/{ex}clude/merged.pgen"), + pvar=temp(out + "/{ex}clude/merged.pvar"), + psam=temp(out + "/{ex}clude/merged.psam"), resources: - runtime="0:04:00" + runtime=10, log: - logs + "/merge", + logs + "/{ex}clude/merge", benchmark: - bench + "/merge", + bench + "/{ex}clude/merge", conda: "happler" shell: - "workflow/scripts/merge_plink.py {input.gts} {input.hps} {output.pgen} &> {log}" + "workflow/scripts/merge_plink.py --region {params.region} --maf {params.maf} " + "{input.gts} {input.hps} {output.pgen} &> {log}" + + +finemapper_input = lambda wildcards: rules.transform.input if (exclude_obs[wildcards.ex] and config["random"] is None) else rules.merge.output rule finemapper: """ execute SuSiE using the haplotypes from happler """ input: - gt=rules.merge.output.pgen, + gt=lambda wildcards: finemapper_input(wildcards).pgen, + gt_pvar=lambda wildcards: finemapper_input(wildcards).pvar, + gt_psam=lambda wildcards: finemapper_input(wildcards).psam, phen=config["pheno"], params: outdir=lambda wildcards, output: Path(output.susie).parent, exclude_causal="NULL", + region=lambda wildcards: wildcards.locus.replace("_", ":"), output: - susie=out + "/susie.rds", + susie=out + "/{ex}clude/susie.rds", resources: - runtime="1:15:00", - queue="hotel", + runtime=75, + threads: 6, log: - logs + "/finemapper", + logs + "/{ex}clude/finemapper", benchmark: - bench + "/finemapper", + bench + "/{ex}clude/finemapper", conda: "../envs/susie.yml" shell: - "workflow/scripts/run_SuSiE.R {input} {params} &>{log}" + "workflow/scripts/run_SuSiE.R {input.gt} {input.phen} {params} &>{log}" + + +rule pips: + """ extract PIPs to a TSV file """ + input: + rds=rules.finemapper.output.susie, + output: + tsv=out + "/{ex}clude/susie_pips.tsv", + resources: + runtime=5, + threads: 3, + log: + logs + "/{ex}clude/pips", + benchmark: + bench + "/{ex}clude/pips", + conda: + "../envs/susie.yml" + shell: + "workflow/scripts/extract_pips.R {input} {output} &>{log}" + + +rule metrics: + """ compute summary metrics from the output of the finemapper """ + input: + finemap=expand(rules.finemapper.output.susie, ex="in", allow_missing=True), + obs_hap=rules.run.output.hap, + caus_hap=config["hap_file"], + output: + metrics=out + "/susie_metrics.tsv", + resources: + runtime=5, + threads: 6, + log: + logs + "/metrics", + benchmark: + bench + "/metrics", + conda: + "../envs/susie.yml" + shell: + "workflow/scripts/susie_metrics.R {input} >{output} 2>{log}" + + +def results_happler_hap_input(wildcards): + if config["random"] is None: + if exclude_obs[wildcards.ex]: + return [] + return rules.run.output.hap + elif exclude_obs[wildcards.ex]: + return 
expand(config["hap_file"], **wildcards) + return expand(config["random_hap"], **wildcards) + + +def results_causal_hap_input(wildcards): + if config["random"] is None: + if exclude_obs[wildcards.ex]: + return [] + return expand(config["hap_file"], **wildcards) rule results: @@ -132,48 +391,87 @@ rule results: input: gt=rules.finemapper.input.gt, susie=rules.finemapper.output.susie, - happler_hap=rules.run.output.hap, + happler_hap=results_happler_hap_input, + causal_gt=config["causal_gt"].pgen if "causal_gt" in config else [], + causal_hap=results_causal_hap_input, params: outdir=lambda wildcards, output: Path(output.susie_pdf).parent, - exclude_causal=lambda wildcards: 0, - causal_hap=config["hap_file"], - causal_gt=config["causal_gt"], + region=lambda wildcards: wildcards.locus.replace("_", ":"), output: - susie_pdf = out + "/susie.pdf", + susie_pdf = out + "/{ex}clude/susie.pdf", # susie_eff_pdf=temp(out + "/susie_eff.pdf"), + resources: + runtime=5, log: - logs + "/results", + logs + "/{ex}clude/results", + benchmark: + bench + "/{ex}clude/results", conda: "../envs/susie.yml" script: "../scripts/summarize_results.R" +rule merge_SVs: + input: + gts=rules.merge.output.pgen, + gts_pvar=rules.merge.output.pvar, + gts_psam=rules.merge.output.psam, + svs=lambda wildcards: config["SVs"], + svs_pvar=lambda wildcards: Path(config["SVs"]).with_suffix(".pvar"), + svs_psam=lambda wildcards: Path(config["SVs"]).with_suffix(".psam"), + params: + maf = check_config("min_maf", 0), + region=lambda wildcards: wildcards.locus.replace("_", ":"), + output: + pgen=temp(out + "/{ex}clude/merged_SVs.pgen"), + pvar=temp(out + "/{ex}clude/merged_SVs.pvar"), + psam=temp(out + "/{ex}clude/merged_SVs.psam"), + resources: + runtime=4, + log: + logs + "/{ex}clude/merge_SVs", + benchmark: + bench + "/{ex}clude/merge_SVs", + conda: + "happler" + shell: + "workflow/scripts/merge_plink.py --region {params.region} --maf {params.maf} " + "{input.gts} {input.svs} {output.pgen} &> {log}" + + rule gwas: """run a GWAS""" input: - pgen=rules.merge.output.pgen, - pvar=rules.merge.output.pvar, - psam=rules.merge.output.psam, + pgen=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pgen, + pvar=(rules.merge_SVs if check_config("SVs") else rules.merge).output.pvar, + psam=(rules.merge_SVs if check_config("SVs") else rules.merge).output.psam, pts=config["pheno"], + covar=config["covar"], params: + maf = check_config("min_maf", 0), in_prefix = lambda w, input: Path(input.pgen).with_suffix(""), out_prefix = lambda w, output: Path(output.log).with_suffix(""), + covar = lambda wildcards, input: ("'no-x-sex' --covar 'iid-only' " + input["covar"]) if check_config("covar") else "allow-no-covars", + start=lambda wildcards: parse_locus(wildcards.locus)[1], + end=lambda wildcards: parse_locus(wildcards.locus)[2], + chrom=lambda wildcards: parse_locus(wildcards.locus)[0], output: - log = temp(out + "/hap.log"), - linear = out + "/hap.hap.glm.linear", + log = temp(out + "/{ex}clude/hap.log"), + linear = out + "/{ex}clude/hap.hap.glm.linear", resources: - runtime="0:10:00", + runtime=10, log: - logs + "/gwas", + logs + "/{ex}clude/gwas", benchmark: - bench + "/gwas", + bench + "/{ex}clude/gwas", threads: 1 conda: "../envs/default.yml" shell: - "plink2 --linear allow-no-covars --variance-standardize " + "plink2 --glm {params.covar} --variance-standardize --maf {params.maf} " "--pheno iid-only {input.pts} --pfile {params.in_prefix} " + # "--from-bp {params.start} --to-bp {params.end} --chr {params.chrom} " # unnecessary b/c merge 
@@ -183,19 +481,20 @@ rule manhattan:
     params:
         linear = lambda wildcards, input: f"-l "+input.linear,
         red_ids = lambda wildcards: [
-            f"-i {i.split(':')[0]}" for i in config["snps"]
-        ],
-        orange_ids = lambda wildcards: "-b hap -b H1",
+            f"-i {i.split(':')[0]}" for i in check_config("snps", default=[])
+        ] if not check_config("SVs") else "--red-chr-ids",
+        orange_ids = lambda wildcards: "-b hap --orange-Hids",
+        # TODO: allow specifying IDs from hap files, instead
     output:
-        png = out + "/manhattan.pdf",
+        png = out + "/{ex}clude/manhattan.pdf",
     resources:
-        runtime="0:05:00"
+        runtime=10,
     log:
-        logs + "/manhattan",
+        logs + "/{ex}clude/manhattan",
     benchmark:
-        bench + "/manhattan",
+        bench + "/{ex}clude/manhattan",
     conda:
         "happler"
     shell:
-        "workflow/scripts/manhattan.py -o {output.png} {params.linear} "
+        "workflow/scripts/manhattan.py -o {output.png} --no-label {params.linear} "
         "{params.red_ids} {params.orange_ids} &>{log}"
diff --git a/analysis/workflow/rules/plots.smk b/analysis/workflow/rules/plots.smk
index 73336d2..2ef6632 100644
--- a/analysis/workflow/rules/plots.smk
+++ b/analysis/workflow/rules/plots.smk
@@ -9,31 +9,71 @@ out += "/" + mode
 logs += "/" + mode
 bench += "/" + mode
 
+def agg_ld(wildcards):
+    """ a helper shared by the agg_ld_range_* functions below """
+    checkpoint_output = config["ld_range_checkpoint"].get(**wildcards).output.hap
+    ld_vals = glob_wildcards(Path(checkpoint_output) / "ld_{ld}/haplotype.hap").ld
+    return checkpoint_output, ld_vals
+
 def agg_ld_range_obs(wildcards):
     """ return a list of hap files from happler """
     if "ld_range_checkpoint" in config:
-        checkpoint_output = config["ld_range_checkpoint"].get(**wildcards).output.hap
+        checkpoint_output, ld_vals = agg_ld(wildcards)
         return expand(
             config["happler_hap"],
-            ld=glob_wildcards(Path(checkpoint_output) / "ld_{ld}/haplotype.hap").ld,
+            ld=ld_vals,
+            alpha=config["mode_attrs"]["alpha"],
             beta=config["mode_attrs"]["beta"],
+            **wildcards,
         )
     else:
-        return expand(config["happler_hap"], beta=config["mode_attrs"]["beta"])
+        return expand(
+            config["happler_hap"],
+            beta=config["mode_attrs"]["beta"],
+            **wildcards,
+        )
 
 def agg_ld_range_causal(wildcards):
     """ return a list of hap files from the LD range checkpoint """
     if "ld_range_checkpoint" in config:
-        checkpoint_output = config["ld_range_checkpoint"].get(**wildcards).output.hap
+        checkpoint_output, ld_vals = agg_ld(wildcards)
         return expand(
             str(checkpoint_output),
-            ld=glob_wildcards(Path(checkpoint_output) / "ld_{ld}/haplotype.hap").ld,
+            ld=ld_vals,
             beta=config["mode_attrs"]["beta"],
+            **wildcards,
         )
     else:
-        causal_hap = config["causal_hap"]
-        return expand(causal_hap, beta=config["mode_attrs"]["beta"])
+        return expand(
+            config["causal_hap"],
+            beta=config["mode_attrs"]["beta"],
+            **wildcards,
+        )
+
+def agg_ld_range_metrics(wildcards):
+    """ return a list of metrics files from the LD range checkpoint """
+    if "ld_range_checkpoint" in config:
+        checkpoint_output, ld_vals = agg_ld(wildcards)
+        return expand(
+            config["happler_metrics"],
+            ld=ld_vals,
+            beta=config["mode_attrs"]["beta"],
+            alpha=config["mode_attrs"]["alpha"],
+            **wildcards,
+        )
+    else:
+        return expand(
+            config["happler_metrics"],
+            beta=config["mode_attrs"]["beta"],
+            **wildcards,
+        )
+
+fill_out_globals = lambda wildcards, val: expand(
+    val,
+    locus=wildcards.locus,
+    sampsize=wildcards.sampsize,
+    allow_missing=True,
+)
 
 rule params:
     """ check how wildcards affect the haplotypes output by happler """
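`fill_out_globals` pins only the "global" wildcards (`locus` and `sampsize`) and, thanks to `allow_missing=True`, leaves every other wildcard in the pattern intact for a later `expand`. A short illustration with a hypothetical pattern (the real patterns come from the config):

    # with wildcards.locus == "19_100-200" and wildcards.sampsize == "500":
    fill_out_globals(wildcards, "out/{locus}/{sampsize}/ld_{ld}/happler.hap")
    # -> ["out/19_100-200/500/ld_{ld}/happler.hap"]  ({ld} is left for later)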
@@ -44,12 +84,12 @@ rule params:
         observed_haps=agg_ld_range_obs,
         causal_hap=agg_ld_range_causal,
     params:
-        observed_haps = lambda wildcards: config["happler_hap"],
-        causal_hap = lambda wildcards: config["causal_hap"],
+        observed_haps = lambda wildcards: fill_out_globals(wildcards, config["happler_hap"]),
+        causal_hap = lambda wildcards: fill_out_globals(wildcards, config["causal_hap"]),
     output:
         png=out + "/happler_params.png",
     resources:
-        runtime="0:10:00",
+        runtime=10,
     log:
         logs + "/plot_params",
     benchmark:
@@ -59,3 +99,31 @@
     shell:
         "workflow/scripts/parameter_plot.py -o {output.png} "
         "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}"
+
+
+rule metrics:
+    """ check how wildcards affect the finemapped haplotypes output by happler """
+    input:
+        gts=config["snp_panel"],
+        gts_pvar=Path(config["snp_panel"]).with_suffix(".pvar"),
+        gts_psam=Path(config["snp_panel"]).with_suffix(".psam"),
+        observed_haps=agg_ld_range_obs,
+        causal_hap=agg_ld_range_causal,
+        metrics=agg_ld_range_metrics,
+    params:
+        observed_haps = lambda wildcards: fill_out_globals(wildcards, config["happler_hap"]),
+        causal_hap = lambda wildcards: fill_out_globals(wildcards, config["causal_hap"]),
+        metrics = lambda wildcards: fill_out_globals(wildcards, config["happler_metrics"]),
+    output:
+        png=out + "/finemapping_metrics.png",
+    resources:
+        runtime=10,
+    log:
+        logs + "/metrics",
+    benchmark:
+        bench + "/metrics",
+    conda:
+        "happler"
+    shell:
+        "workflow/scripts/parameter_plot.py -o {output.png} -m {params.metrics} "
+        "{input.gts} {params.observed_haps} {params.causal_hap} &> {log}"
diff --git a/analysis/workflow/rules/simulate.smk b/analysis/workflow/rules/simulate.smk
index 264fef8..7a9dbc3 100644
--- a/analysis/workflow/rules/simulate.smk
+++ b/analysis/workflow/rules/simulate.smk
@@ -10,11 +10,19 @@ out += "/" + mode
 logs += "/" + mode
 bench += "/" + mode
 
+if mode == "ld_range" and config["random"]:
+    out += "/random"
+    logs += "/random"
+    bench += "/random"
 
-locus_chr = config["locus"].split(":")[0]
-locus_start = config["locus"].split(":")[1].split('-')[0]
-locus_end = config["locus"].split(":")[1].split('-')[1]
+def parse_locus(locus):
+    """parse locus into chrom, start, end"""
+    chrom = locus.split("_")[0]
+    start = locus.split("_")[1].split("-")[0]
+    end = locus.split("-")[1]
+    return chrom, start, end
 
+hap_ld_range_output = out + "/create_ld_range/ld_{ld}/haplotype.hap"
 
 checkpoint create_hap_ld_range:
     """ create a hap file suitable for haptools transform and simphenotype """
@@ -29,10 +37,11 @@
         step = config["modes"]["ld_range"]["step"],
         min_af = config["modes"]["ld_range"]["min_af"],
         max_af = config["modes"]["ld_range"]["max_af"],
+        out = lambda wildcards: hap_ld_range_output,
     output:
-        hap=directory(out)
+        hap=directory(out + "/create_ld_range")
     resources:
-        runtime="0:05:00"
+        runtime=5,
     log:
         logs + "/create_hap_ld_range",
     benchmark:
@@ -43,7 +52,7 @@
         "workflow/scripts/choose_different_ld.py -r {params.reps} "
         "--min-ld {params.min_ld} --max-ld {params.max_ld} "
        "--step {params.step} --min-af {params.min_af} "
-        "--max-af {params.max_af} {input.gts} {output.hap} &> {log}"
+        "--max-af {params.max_af} {input.gts} {params.out} &> {log}"
 
 
 rule create_hap:
@@ -52,14 +61,14 @@
     input:
         gts=Path(config["gts_snp_panel"]).with_suffix(".pvar"),
     params:
         ignore="this", # the first parameter is always ignored for some reason
-        chrom=locus_chr,
-        locus=config["locus"].split(":")[1].replace('-', '\t'),
+        chrom=lambda wildcards: 
parse_locus(wildcards.locus)[0], + locus=lambda wildcards: wildcards.locus.split("_")[1].replace('-', '\t'), beta=0.99, alleles=lambda wildcards: config["modes"]["hap"]["alleles"], output: hap=out + "/haplotype.hap" resources: - runtime="0:05:00" + runtime=5, log: logs + "/create_hap", benchmark: @@ -69,28 +78,24 @@ rule create_hap: script: "../scripts/create_hap_file.sh" -transform_input = rules.create_hap.output.hap if mode == "ld_range": - transform_input = rules.create_hap_ld_range.output.hap + "/ld_{ld}/haplotype.hap" - -if mode == "ld_range": - out += "/ld_{ld}" - logs += "/ld_{ld}" - bench += "/ld_{ld}" + out += "/pheno/ld_{ld}" + logs += "/pheno/ld_{ld}" + bench += "/pheno/ld_{ld}" rule transform: """ use the hap file to create a pgen of the haplotype """ input: - hap=transform_input, + hap=hap_ld_range_output if mode == "ld_range" else rules.create_hap.output.hap, pgen=config["gts_snp_panel"], pvar=Path(config["gts_snp_panel"]).with_suffix(".pvar"), psam=Path(config["gts_snp_panel"]).with_suffix(".psam"), output: - pgen=temp(out + "/transform.pgen"), - pvar=temp(out + "/transform.pvar"), - psam=temp(out + "/transform.psam"), + pgen=out + "/transform.pgen", + pvar=out + "/transform.pvar", + psam=out + "/transform.psam", resources: - runtime="0:05:00" + runtime=5, log: logs + "/transform", benchmark: @@ -113,7 +118,7 @@ rule simphenotype: output: pheno=out + "/{beta}.pheno", resources: - runtime="0:05:00" + runtime=5, log: logs + "/{beta}/simphenotype", benchmark: diff --git a/analysis/workflow/scripts/README.md b/analysis/workflow/scripts/README.md index 40f1a1b..7846ec7 100644 --- a/analysis/workflow/scripts/README.md +++ b/analysis/workflow/scripts/README.md @@ -1,7 +1,7 @@ # scripts This directory contains various scripts used by the pipeline. However, you can use most of these scripts on their own, too. Some may even be helpful in day-to-day use. -All python scripts implement the --help argument. For bash and R scripts, you can run `head