Merge pull request #43 from gymrek-lab/feat/multiple-testing

feat: finish running on geuvadis
gymrek-lab · Jun 5, 2024 · 2bf6d5f · 2bf6d5f
2 parents 7876136 + 02f36c4
commit 2bf6d5f
Show file tree

Hide file tree

Showing 69 changed files with 8,294 additions and 2,085 deletions.
diff --git a/.devcontainer.json b/.devcontainer.json
@@ -0,0 +1,41 @@
+// For format details, see https://aka.ms/devcontainer.json. For config options, see the
+// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu
+{
+	"name": "Ubuntu",
+	// Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
+	"image": "mcr.microsoft.com/devcontainers/base:jammy",
+	"features": {
+		"ghcr.io/rocker-org/devcontainer-features/miniforge:1": {
+			"version": "latest",
+			"variant": "Mambaforge"
+		}
+	},
+
+	// Features to add to the dev container. More info: https://containers.dev/features.
+	// "features": {},
+
+	// Use 'forwardPorts' to make a list of ports inside the container available locally.
+	// "forwardPorts": [],
+
+	// Use 'postCreateCommand' to run commands after the container is created.
+	"postCreateCommand": "mamba env create -n happler -f dev-env.yml && conda run -n happler poetry install",
+
+	// Configure tool-specific properties.
+	"customizations": {
+		"vscode": {
+			"extensions": ["ms-python.python"],
+			"settings": {
+				"python.condaPath": "/opt/conda/condabin/conda",
+				"python.defaultInterpreterPath": "/opt/conda/envs/happler/bin/python",
+				"python.terminal.activateEnvironment": true,
+				"python.terminal.activateEnvInCurrentTerminal": true,
+				"python.venvFolders": ["/home/vscode/.cache/pypoetry/virtualenvs"],
+				"terminal.integrated.environmentChangesRelaunch": true,
+				"terminal.integrated.hideOnStartup": "always"
+			}
+		}
+	}
+
+	// Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root.
+	// "remoteUser": "root"
+}
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
@@ -0,0 +1,13 @@
+## Checklist
+
+* [ ] I've checked to ensure there aren't already other open [pull requests](../../../pulls) for the same update/change
+* [ ] I've prefixed the title of my PR according to [the conventional commits specification](https://www.conventionalcommits.org/). If your PR fixes a bug, please prefix the PR with `fix: `. Otherwise, if it introduces a new feature, please prefix it with `feat: `. If it introduces a breaking change, please add an exclamation before the colon, like `feat!: `. If the scope of the PR changes because of a revision to it, please update the PR title, since the title will be used in our CHANGELOG.
+* [ ] At the top of the PR, I've [listed any open issues that this PR will resolve](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue#linking-a-pull-request-to-an-issue-using-a-keyword). For example, "resolves #0" if this PR resolves issue #0
+* [ ] I've explained my changes in a manner that will make it possible for both users and maintainers of happler to understand them
+* [ ] I have followed the [contributing guidelines](https://happler.readthedocs.io/en/stable/project_info/contributing.html#how-to-fix-a-bug-or-implement-a-new-feature)
+* [ ] I have adhered to the [style guidelines](https://happler.readthedocs.io/en/stable/project_info/contributing.html#style)
+* [ ] I've added tests for any new functionality. Or, if this PR fixes a bug, I've added test(s) that replicate it
+* [ ] I've updated the relevant documentation and checked that the newly built documentation is formatted properly
+* [ ] All functions, modules, classes etc. still conform to [numpy docstring standards](https://numpydoc.readthedocs.io/en/latest/format.html)
+* [ ] (if applicable) I've updated the pyproject.toml file with any changes I've made to happler's dependencies, and I've run `poetry lock --no-update` to ensure the lock file stays up to date and that our dependencies are locked to their minimum versions
+* [ ] In the body of this PR, I've included a short address to the reviewer highlighting one or two items that might deserve their focus
diff --git a/.github/workflows/constraints.txt b/.github/workflows/constraints.txt
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -1,7 +1,6 @@
 name: Tests
 
-on:
-  - pull_request
+on: [pull_request, workflow_call]
 
 jobs:
   tests:
@@ -11,13 +10,15 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - { python: "3.7", os: "ubuntu-latest", session: "lint" }
-          - { python: "3.7", os: "ubuntu-latest", session: "tests" }
+          - { python: "3.8", os: "ubuntu-latest", session: "lint" }
           - { python: "3.8", os: "ubuntu-latest", session: "tests" }
           - { python: "3.9", os: "ubuntu-latest", session: "tests" }
           - { python: "3.10", os: "ubuntu-latest", session: "tests" }
-          # - { python: "3.10", os: "windows-latest", session: "tests" }
-          # - { python: "3.10", os: "macos-latest", session: "tests" }
+          - { python: "3.11", os: "ubuntu-latest", session: "tests" }
+          - { python: "3.12", os: "ubuntu-latest", session: "tests" }
+          # - { python: "3.11", os: "windows-latest", session: "tests" }
+          # - { python: "3.9", os: "macos-latest", session: "tests" }
+          - { python: "3.8", os: "ubuntu-latest", session: "size" }
 
     env:
       NOXSESSION: ${{ matrix.session }}
@@ -26,90 +27,75 @@ jobs:
 
     steps:
       - name: Check out the repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
-      - name: Set up Python ${{ matrix.python }}
-        uses: actions/setup-python@v3
+      - name: Setup Mambaforge
+        uses: conda-incubator/setup-miniconda@v3
         with:
-          python-version: ${{ matrix.python }}
+          activate-environment: happler
+          miniforge-variant: Mambaforge
+          auto-activate-base: false
+          miniforge-version: latest
+          use-mamba: true
 
-      - name: Upgrade pip
-        run: |
-          pip install --constraint=.github/workflows/constraints.txt pip
-          pip --version
-      - name: Upgrade pip in virtual environments
-        shell: python
-        run: |
-          import os
-          import pip
-          with open(os.environ["GITHUB_ENV"], mode="a") as io:
-              print(f"VIRTUALENV_PIP={pip.__version__}", file=io)
-      - name: Install Poetry
+      - name: Get Date
+        id: get-date
+        run: echo "today=$(/bin/date -u '+%Y%m%d')" >> $GITHUB_OUTPUT
+        shell: bash
+
+      - name: Cache Conda env
+        uses: actions/cache@v3
+        with:
+          path: ${{ env.CONDA }}/envs
+          key:
+            conda-${{ runner.os }}--${{ runner.arch }}--${{ steps.get-date.outputs.today }}-${{ hashFiles('dev-env.yml') }}-${{ env.CACHE_NUMBER }}
+        env:
+          # Increase this value to reset cache if dev-env.yml has not changed
+          CACHE_NUMBER: 0
+        id: cache
+
+      - name: Install dev environment
+        run:
+          mamba env update -n happler -f dev-env.yml
+        if: steps.cache.outputs.cache-hit != 'true'
+
+      - name: Try to build happler
+        shell: bash -el {0}
         run: |
-          pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry
-          poetry --version
-      - name: Install Nox
+          poetry build --no-ansi
+
+      - name: Check distribution size
+        if: matrix.session == 'size'
         run: |
-          pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox
-          pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry
-          nox --version
-      - name: Run Nox
+          du -csh dist/*
+          # check that the generated dist/ directory does not exceed 0.3 MB
+          # if this check fails, it's because you forgot to list large files in a "tool.poetry.exclude" section of our pyproject.toml
+          # https://python-poetry.org/docs/pyproject/#include-and-exclude
+          [ $(du -b dist | cut -f1) -lt 300000 ]
+
+      - name: Run tests with nox
+        if: matrix.session != 'size'
+        shell: bash -el {0}
         run: |
-          nox --python=${{ matrix.python }}
+          nox --verbose --python=${{ matrix.python }}
+
       - name: Upload coverage data
         if: always() && matrix.session == 'tests'
         uses: "actions/upload-artifact@v3"
         with:
           name: coverage-data
           path: ".coverage.*"
+
   large-files:
     name: File sizes
     runs-on: ubuntu-latest
     steps:
       - name: Check out the repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
+
       - name: Check for large files
         uses: actionsdesk/lfs-warning@v3.2
         with:
           token: ${{ secrets.GITHUB_TOKEN }} # Optional
           filesizelimit: 500000b
           labelName: large-files
-
-  # coverage:
-  #   runs-on: ubuntu-latest
-  #   needs: tests
-  #   steps:
-  #     - name: Check out the repository
-  #       uses: actions/checkout@v3
-
-  #     - name: Set up Python
-  #       uses: actions/setup-python@v3
-  #       with:
-  #         python-version: "3.10"
-
-  #     - name: Upgrade pip
-  #       run: |
-  #         pip install --constraint=.github/workflows/constraints.txt pip
-  #         pip --version
-  #     - name: Install Poetry
-  #       run: |
-  #         pipx install --pip-args=--constraint=.github/workflows/constraints.txt poetry
-  #         poetry --version
-  #     - name: Install Nox
-  #       run: |
-  #         pipx install --pip-args=--constraint=.github/workflows/constraints.txt nox
-  #         pipx inject --pip-args=--constraint=.github/workflows/constraints.txt nox nox-poetry
-  #         nox --version
-  #     - name: Download coverage data
-  #       uses: actions/download-artifact@v3
-  #       with:
-  #         name: coverage-data
-
-  #     - name: Combine coverage data and display human readable report
-  #       run: |
-  #         nox --session=coverage
-  #     - name: Create coverage report
-  #       run: |
-  #         nox --session=coverage -- xml
-  #     - name: Upload coverage report
-  #       uses: codecov/codecov-action@v3.1.0
diff --git a/.gitignore b/.gitignore
@@ -12,7 +12,10 @@ dist/
 analysis/data
 analysis/out
 analysis/log
+analysis/Rplots.pdf
+analysis/myenv.RData
 .snakemake
 .ipynb_checkpoints
 # vscode
 .vscode
+venv/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -2,18 +2,26 @@
 # Read the Docs configuration file
 # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 
-# Note: I used https://github.com/readthedocs/readthedocs.org/issues/4912#issuecomment-664002569 for inspiration
+# Note: I used https://docs.readthedocs.io/en/stable/build-customization.html#install-dependencies-with-poetry for inspiration
 
 version: 2
 
+build:
+  os: "ubuntu-22.04"
+  tools:
+    python: "3.8"
+  jobs:
+    post_create_environment:
+      # Install poetry
+      # https://python-poetry.org/docs/#installing-manually
+      - pip install poetry
+    post_install:
+      # Install dependencies with 'docs' dependency group
+      # https://python-poetry.org/docs/managing-dependencies/#dependency-groups
+      # VIRTUAL_ENV needs to be set manually for now.
+      # See https://github.com/readthedocs/readthedocs.org/pull/11152/
+      - VIRTUAL_ENV=$READTHEDOCS_VIRTUALENV_PATH poetry install --only main,docs
+
 sphinx:
   configuration: docs/conf.py
-
-python:
-  version: 3.7
-  install:
-    - method: pip
-      path: .
-      extra_requirements:
-        - docs
-
+  fail_on_warning: true
diff --git a/analysis/README.md b/analysis/README.md
@@ -1,4 +1,4 @@
-![Snakemake](https://img.shields.io/badge/snakemake-≥6.7.0-brightgreen.svg?style=flat-square)](https://snakemake.bitbucket.io)
+[![Snakemake](https://img.shields.io/badge/snakemake-≥8.12.0-brightgreen.svg?style=flat-square)](https://snakemake.bitbucket.io)
 
 # download
 Execute the following command.
@@ -8,9 +8,9 @@ git clone https://github.com/aryarm/happler
 You can also download example data for the pipeline. See [the config file](config/config.yml) for links and instructions.
 
 # setup
-The pipeline is written as a Snakefile which can be executed via [Snakemake](https://snakemake.readthedocs.io). For reproduciblity, we recommend installing the version that we used (6.7.0):
+The pipeline is written as a Snakefile which can be executed via [Snakemake](https://snakemake.readthedocs.io). For reproducibility, we recommend installing the version that we used (8.12.0):
 ```
-conda create -n snakemake -c conda-forge --no-channel-priority 'bioconda::snakemake==6.7.0'
+conda create -n snakemake -c conda-forge --no-channel-priority 'bioconda::snakemake==8.12.0'
 ```
 `snakemake` will [automatically install all dependencies](https://snakemake.readthedocs.io/en/stable/snakefiles/deployment.html#integrated-package-management) of the pipeline upon its first execution using `conda`.
 
@@ -25,9 +25,9 @@ conda create -n snakemake -c conda-forge --no-channel-priority 'bioconda::snakem
     ```
     ./run.bash &
     ```
-    __or__ on a TORQUE cluster:
+    __or__ on a SLURM cluster:
     ```
-    qsub run.bash
+    sbatch run.bash
     ```
 ### Output
 All output of the pipeline will be placed in a new directory (`out/`, by default).

diff --git a/analysis/config/config-geuvadis.yml b/analysis/config/config-geuvadis.yml
@@ -0,0 +1,87 @@
+# This is the Snakemake configuration file that specifies paths
+# and options for the pipeline. Anybody wishing to use
+# the provided snakemake pipeline should first fill out this file with paths to
+# their own data, as the Snakefile requires it.
+# Every config option has reasonable defaults unless it is labeled as "required."
+# All paths are relative to the directory that Snakemake is executed in.
+# Note: this file is written in the YAML syntax (https://learnxinyminutes.com/docs/yaml/)
+
+
+# Paths to a SNP-STR haplotype reference panel
+# You can download this from http://gymreklab.com/2018/03/05/snpstr_imputation.html
+# If the VCFs are per-chromosome, replace the contig name in the file name with "{chr}"
+# The VCF(s) must be sorted and indexed (with a .tbi file in the same directory)
+# required!
+# ref_panel: "/projects/ps-gymreklab/resources/datasets/snpstr/1kg.snp.str.chr{chr}.vcf.gz"
+# snp_panel: "/projects/ps-gymreklab/resources/datasets/ukbiobank/array_imputed/pfile_converted/chr{chr}.pgen"
+snp_panel: "data/geuvadis/geuvadis_ensemble_phasing.pgen"
+# str_panel: "/tscc/projects/ps-gymreklab/jmargoli/ukbiobank/str_imputed/runs/first_pass/vcfs/annotated_strs/chr{chr}.vcf.gz"
+
+# Path to a list of samples to exclude from the analysis
+# There should be one sample ID per line
+# exclude_samples: data/ukb_random_samples_exclude.tsv
+
+# If SNPs are unphased, provide the path to a SHAPEIT4 map file like these:
+# https://github.com/odelaneau/shapeit4/tree/master/maps
+# The map file should use the same reference genome as the reference panel VCFs
+# phase_map: data/genetic_maps/chr{chr}.b37.gmap.gz
+
+# A "locus" is either a string with a contig name, a colon, the start position, a dash,
+# and the end position, or the path to a BED file with a ".bed" file ending
+# There are different simulation modes that you can use:
+# 1. "str" - a tandem repeat is a string with a contig name, a colon, and the start position
+# 2. "snp" - a SNP follows the same format as "str"
+# 3. "hap" - a haplotype
+# 4. "ld_range" - creates random two-SNP haplotypes with a range of LD values between the alleles of each haplotype
+# 5. "run" - execute happler on a locus without simulating anything
+# The STR and SNP positions should be contained within the locus.
+# The positions should be provided in the same coordinate system as the reference
+# genome of the reference panel VCFs
+# The contig should correspond with the contig name from the {chr} wildcard in the VCF
+# required! and unless otherwise noted, all attributes of each mode are required
+# locus: 19:45401409-46401409 # center: 45901409 (APOe4)
+locus: data/geuvadis/geuvadis_eqtl_genes.full.liftover.bed
+modes:
+ str:
+  pos: 19:45903857 # STR_691361
+ snp:
+  pos: 19:45910672 # rs1046282
+ hap:
+  alleles: [rs36046716:G, rs1046282:G] # 45892145, 45910672
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+ ld_range:
+  reps: 1
+  min_ld: 0
+  max_ld: 1
+  step: 0.1
+  min_af: 0.25
+  max_af: 0.75
+  # beta: [0.35]
+  beta: [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45]
+  alpha: [0.05]
+  random: false # whether to also produce random haplotypes
+ run:
+  pheno: data/geuvadis/phenos/{trait}.pheno
+  SVs: data/geuvadis/pangenie_hprc_hg38_all-samples_bi_SVs-missing_removed.pgen
+  # pheno_matrix: data/geuvadis/EUR_converted_expr_hg38.csv # optional
+mode: run
+
+# Covariates to use if they're needed
+# Otherwise, they're assumed to be regressed out of the phenotypes
+# Note: the finemapping methods won't be able to handle these covariates
+# covar: data/geuvadis/5PCs_sex.covar
+
+# Discard rare variants with a MAF below this number
+# Defaults to 0 if not specified
+min_maf: 0.1
+
+# Sample sizes to use
+# sample_size: [500, 1000, 1500, 2000, 2500]
+# sample_size: 777
+
+# Whether to exclude the causal variant from the set of genotypes provided to the
+# finemapping methods. Set this to true if you're interested in seeing how the
+# methods perform when the causal variant is absent from the data.
+# Defaults to false if not specified
+# You can also provide a list of booleans, if you want to test both values
+exclude_causal: [true, false]