diff --git a/.gitignore b/.gitignore index 5d70975f6..7e5dcdb40 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,7 @@ # ignore test results -tests/test/* +oldtests/test/* # toy/experimental files -*.csv -*.tsv *.pkl # ignore eggs @@ -69,10 +67,12 @@ open_pipelines/ *RESERVE* doc/ +site/ build/ dist/ looper.egg-info/ loopercli.egg-info/ +__pycache__/ *ipynb_checkpoints* diff --git a/.travis.yml b/.travis.yml index 566f8749c..738b9c945 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,6 +3,8 @@ python: - "2.7" - "3.5" - "3.6" + - "3.7" + - "3.8" os: - linux install: @@ -10,7 +12,9 @@ install: - pip install . - pip install -r requirements/requirements-dev.txt - pip install -r requirements/requirements-test.txt -script: pytest --cov=looper +script: pytest tests -x -vv --cov=looper +after_success: + - coveralls branches: only: - dev diff --git a/MANIFEST.in b/MANIFEST.in index bbe6913ba..1f8c41772 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements/* include README.md include logo_looper.svg -include looper/jinja_templates/* \ No newline at end of file +include looper/jinja_templates/* +include looper/schemas/* \ No newline at end of file diff --git a/docs/README.md b/docs/README.md index 7d5276c4e..e6abf902c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -4,16 +4,16 @@ ## What is looper? -`Looper` is a pipeline submitting engine. `Looper` deploys any command-line pipeline for each sample in a project organized in [standard PEP format](https://pepkit.github.io/docs/home/). You can think of `looper` as providing a single user interface to running, summarizing, monitoring, and otherwise managing all of your sample-intensive research projects the same way, regardless of data type or pipeline used. +Looper is a job submitting engine. Looper deploys arbitrary shell commands for each sample in a [standard PEP project](https://pepkit.github.io/docs/home/). 
You can think of looper as providing a single user interface to running, monitoring, and managing all of your sample-intensive research projects the same way, regardless of data type or pipeline used. ## What makes looper better? -`Looper`'s key strength is that it **decouples job handling from the pipeline process**. In a typical pipeline, job handling (managing how individual jobs are submitted to a cluster) is delicately intertwined with actual pipeline commands (running the actual code for a single compute job). The `looper` approach is modular, following the [the unix principle](https://en.wikipedia.org/wiki/Unix_philosophy): `looper` *only* manages job submission. This approach leads to several advantages compared with the traditional integrated approach: +Looper **decouples job handling from the pipeline process**. In a typical pipeline, job handling (managing how individual jobs are submitted to a cluster) is delicately intertwined with actual pipeline commands (running the actual code for a single compute job). In contrast, the looper approach is modular: looper *only* manages job submission. This approach leads to several advantages compared with the traditional integrated approach: -1. running a pipeline on just one or two samples/jobs is simpler, and does not require a full-blown distributed compute environment. -2. pipelines do not need to independently re-implement job handling code, which is shared. -3. every project uses a universal structure (expected folders, file names, and sample annotation format), so datasets can more easily move from one pipeline to another. -4. users must learn only a single interface that works with any of their projects for any pipeline. +1. pipelines do not need to independently re-implement job handling code, which is shared. +2. every project uses a universal structure, so datasets can move from one pipeline to another. +3. users must learn only a single interface that works with any project for any pipeline. +4. 
running just one or two samples/jobs is simpler, and does not require a distributed compute environment. @@ -24,13 +24,13 @@ Releases are posted as [GitHub releases](https://github.com/pepkit/looper/releas ```console -pip install --user loopercli +pip install --user looper ``` Update with: ```console -pip install --user --upgrade loopercli +pip install --user --upgrade looper ``` If the `looper` executable in not automatically in your `$PATH`, add the following line to your `.bashrc` or `.profile`: diff --git a/docs/changelog.md b/docs/changelog.md index 44628a8c1..2923150fa 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -2,7 +2,34 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. +## [1.2.0] - 2020-05-26 + +**This version introduced backwards-incompatible changes.** + +### Added +- Commands: + - `init`; initializes `.looper.yaml` file + - `inspect`; inspects `Project` or `Sample` objects + - `table`; writes summary stats table + - `runp`; runs project level pipelines +- Input schemas and output schemas +- `--settings` argument to specify compute resources as a YAML file +- Option to preset CLI options in a dotfile +- `--command-extra` and `--command-extra-override` arguments that append specified string to pipeline commands +- Option to specify destination of sample YAML in pipeline interface +- `--pipeline_interfaces` argument that allows pipeline interface specification via CLI + +### Changed +- `looper summarize` to `looper report` +- Pipeline interface format changed drastically +- The PyPi name changed from 'loopercli' to 'looper' +- resources section in pipeline interface replaced with `size_dependent_attributes` or `dynamic_variables_command_template`. 
+- `--compute` can be used to specify arguments other than resources +- `all_input_files` and `required_input_files` keys in pipeline interface moved to the input schema and renamed to `files` and `required_files` +- pipeline interface specification + ## [0.12.6] -- 2020-02-21 + ### Added - possibility to execute library module as a script: `python -m looper ...` diff --git a/docs/cluster-computing.md b/docs/cluster-computing.md deleted file mode 100644 index f4198dc10..000000000 --- a/docs/cluster-computing.md +++ /dev/null @@ -1,60 +0,0 @@ -# Cluster computing - -By default, `looper` will build a shell script for each sample and then run each sample serially on the local computer. This is convenient for simple cases, because it doesn't require any extra configuration. When it comes time to scale up, no problem! This is where `looper` really excels, in large projects that require submitting these jobs to a cluster resource manager (like SLURM, SGE, LFS, etc.). Starting with version `0.11` (released in 2019), `looper` uses [divvy](http://code.databio.org/divvy) to manage computing resource configuration so that projects and pipelines can easily travel among environments. - -`Divvy` uses a template system to build scripts for each job. To start, `divvy` includes a few built-in templates so you can run basic jobs without messing with anything, but the template system provides ultimate flexibility to customize your job scripts however you wish. This template system is how we can use looper to run jobs on any cluster resource manager, by simply setting up a template that fits our particular cluster manager. - -## Overview and basic example of cluster computing - -In a nutshell, to configure `looper` to use cluster computing, all you have to do is provide some information about your cluster setup. You create a `divvy` computing configuration file (`compute_config.yaml`) and point an environment variable (`DIVCFG`) to this file, and that's it! 
You then have access to any configured computing packages by using `looper --compute package`, where `package` can be any computing system you configure. - -For example, here's a `compute_config.yaml` file that works with a SLURM environment: -```yaml -compute: - default: - submission_template: templates/local_template.sub - submission_command: sh - loc: - submission_template: templates/local_template.sub - submission_command: sh - slurm: - submission_template: templates/slurm_template.sub - submission_command: sbatch - partition: queue_name -``` - -Each section within `compute` defines a "compute package" that can be activated. -By default, the package named `default` will be used, You may then choose a different compute package on the fly by specifying the `--compute` option: ``looper run --compute PACKAGE``. In this case, `PACKAGE` could be either `loc` (which would do the same thing as the default, so doesn't change anything) or `slurm`, which would run the jobs on SLURM, from queue `queue_name`. You can make as many compute packages as you wish (for example, to submit to different SLURM partitions). - -This is just an overview; when you're ready to configure your computing environment, head over to the [divvy docs](http://code.databio.org/divvy) to get the whole story. - - -## Using divvy with looper - -What is the source of values used to populate the variables? Well, they are pooled together from several sources. Divvy uses a hierarchical system to collect data values from global and local sources, which enables you to re-use settings across projects and environments. To start, there are a few built-ins: - -Built-in variables: - -- `{CODE}` is a reserved variable that refers to the actual command string that will run the pipeline. `Looper` will piece together this command individually for each sample -- `{JOBNAME}` -- automatically produced by `looper` using the `sample_name` and the pipeline name. 
-- `{LOGFILE}` -- automatically produced by `looper` using the `sample_name` and the pipeline name. - - -Other variables are not automatically created by `looper` and are specified in a few different places: - -*DIVCFG config file*. Variables that describes settings of a **compute environment** should go in the `DIVCFG` file. Any attributes in the activated compute package will be available to populate template variables. For example, the `partition` attribute is specified in many of our default `DIVCFG` files; that attribute is used to populate a template `{PARTITION}` variable. This is what enables pipelines to work in any compute environment, since we have no control over what your partitions are named. You can also use this to change SLURM queues on-the-fly. - -*pipeline_interface.yaml*. Variables that are **specific to a pipeline** can be defined in the `pipeline interface` file. Variables in two different sections are available to templates: the `compute` and `resources` sections. The difference between the two is that the `compute` section is common to all samples, while the `resources` section varies based on sample input size. As an example of a variable pulled from the `compute` section, we defined in our `pipeline_interface.yaml` a variable pointing to the singularity or docker image that can be used to run the pipeline, like this: - -``` -compute: - singularity_image: /absolute/path/to/images/image -``` - -Now, this variable will be available for use in a template as `{SINGULARITY_IMAGE}`. This makes sense to put in the `compute` section because it doesn't change for different sizes of input files. This path should probably be absolute, because a relative path will be interpreted as relative to the working directory where your job is executed (*not* relative to the pipeline interface). - -The other pipeline interface section that is available to templates is `resources`. 
This section uses a list of *resource packages* that vary based on sample input size. We use these in existing templates to adjust the amount of resources we need to request from a resource manager like SLURM. For example: `{MEM}`, `{CORES}`, and `{TIME}` are all defined in this section, and they vary for different input file sizes. - -[Read more about pipeline_interface.yaml here](pipeline-interface.md). - -*project_config.yaml*. Finally, project-level variables can also be populated from the `compute` section of a project config file. We don't recommend using this and it is not yet well documented, but it would enable you to make project-specific compute changes (such as billing a particular project to a particular SLURM resource account). diff --git a/docs/concentric-templates.md b/docs/concentric-templates.md new file mode 100644 index 000000000..efe8598d9 --- /dev/null +++ b/docs/concentric-templates.md @@ -0,0 +1,56 @@ +# Looper's concentric template system + +## Introduction + +To build job scripts, looper uses a 2-level template system consisting of an inner template wrapped by an outer template. The inner template is called a *command template*, which produces the individual commands to execute. The outer template is the *submission template*, which wraps the commands in environment handling code. This layered design allows us to decouple the computing environment from the pipeline, which improves portability. + +## The command template + +The command template is specified by a pipeline in the pipeline interface. A very basic command template could be something like this: + +```console +pipeline_command {sample.input_file} --arg +``` + +In the simplest case, looper can run the pipeline by simply running these commands. This example contains no information about computing environment, such as SLURM submission directives. 
+ +## The submission template + +To extend to submitting the commands to a cluster, it may be tempting to add these details directly to the command template, which would cause the jobs to be submitted to SLURM instead of run directly. However, this would restrict the pipeline to *only* running via SLURM, since the submission code would be tightly coupled to the command code. Instead, looper retains flexibility by introducing a second template layer, the *submission template*. The submission template is specified at the level of the computing environment. A submission template can also be as simple or complex as required. For a command to be run in a local computing environment, a basic template will suffice: + +```console +#! /usr/bin/bash + +{CODE} +``` + +A more complicated template could submit a job to a SLURM cluster: + +```console +#!/bin/bash +#SBATCH --job-name='{JOBNAME}' +#SBATCH --output='{LOGFILE}' +#SBATCH --mem='{MEM}' +#SBATCH --cpus-per-task='{CORES}' +#SBATCH --time='{TIME}' +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +srun {CODE} +``` + +## The advantages of concentric templates + +Looper first populates the command template, and then provides the output as a variable that is used to populate the `{CODE}` variable in the submission template. This decoupling provides substantial advantages: + +1. The commands can be run on any computing environment by simply switching the submission template. +2. The submission template can be used for any computing environment parameters, such as containers. +3. The submission template only has to be defined once *per environment*, so many pipelines can use them. +4. We can [group multiple individual commands](grouping-jobs.md) into a single submission script. +5. The submission template is universal and can be handled by dedicated submission template software. + +In fact, looper uses [divvy](http://divvy.databio.org) to handle submission templates. 
The divvy submission templates can be used for interactive submission of jobs, or used by other software. + +## Populating templates + +The task of running jobs can be thought of as simply populating the templates with variables. To do this, Looper provides [variables from several sources](variable-namespaces.md). diff --git a/docs/config-files.md b/docs/config-files.md index af175771a..3e700e726 100644 --- a/docs/config-files.md +++ b/docs/config-files.md @@ -6,46 +6,39 @@ We've organized these files so that each handle a different level of infrastruct - Environment - Project -- Sample - Pipeline This makes the system very adaptable and portable, but for a newcomer, it is easy to map each to its purpose. So, here's an explanation of each for you to use as a reference until you are familiar with the whole ecosystem. -Which ones you need to know about will depend on whether you're a **pipeline *user*** (running pipelines on your project) -or a **pipeline *developer*** (building your own pipeline). +Which ones you need to know about will depend on whether you're a pipeline *user* (running pipelines on your project) +or a pipeline *developer* (building your own pipeline). ## Pipeline users -Users (non-developers) of pipelines only need to be aware of one or two config files: +Users (non-developers) of pipelines only need to be aware of one or two config files. -- The [project config](define-your-project): This file is specific to each project and -contains information about the project's metadata, where the processed files should be saved, -and other variables that allow to configure the pipelines specifically for this project. -It follows the standard `looper` format (now referred to as `PEP`, or "*portable encapsulated project*" format). 
+### Project configuration -If you are planning to submit jobs to a cluster, then you need to know about a second config file: -- The [`PEPENV` config](cluster-computing.md): -After initial setup it typically requires little (if any) editing or maintenance. +[**project config**](defining-a-project.md) -- this file is specific to each project and contains information about the project's metadata, where the processed files should be saved, and other variables that allow to configure the pipelines specifically for this project. It follows the standard Portable Encapsulated Project format, or PEP for short. -That should be all you need to worry about as a pipeline user. -If you need to adjust compute resources or want to develop a pipeline or have more advanced project-level control -over pipelines, you'll need knowledge of the config files used by pipeline developers. +### Environment configuration + +[**environment config**](http://divvy.databio.org/en/latest/configuration/) -- if you are planning to submit jobs to a cluster, then you need to be aware of environment configuration. This task is farmed out to [divvy](http://divvy.databio.org/en/latest/), a computing resource configuration manager. Follow the divvy documentation to learn about ways to tweak the computing environment settings according to your needs. + +That should be all you need to worry about as a pipeline user. If you need to adjust compute resources or want to develop a pipeline or have more advanced project-level control over pipelines, you'll need knowledge of the config files used by pipeline developers. ## Pipeline developers -If you want to make pipeline compatible with `looper`, tweak the way `looper` interacts with a pipeline for a given project, -or change the default cluster resources requested by a pipeline, you need to know about a configuration file that coordinates linking pipelines to a project. 
-- The [pipeline interface file](pipeline-interface.md): -This file sas two sections" - - `protocol_mapping` tells looper which pipelines exist, and how to map each protocol (sample data type) to a pipeline - - `pipelines` describes options, arguments, and compute resources that defined how `looper` should communicate with each pipeline. +### Pipeline configuration + +If you want to make a pipeline compatible with looper, tweak the way looper interacts with a pipeline for a given project, +or change the default cluster resources requested by a pipeline, you need to know about a configuration file that coordinates linking pipelines to a project. This happens via the [pipeline interface file](pipeline-interface-specification.md). -Finally, if you're using [the `pypiper` framework](https://github.com/databio/pypiper) to develop pipelines, -it uses a pipeline-specific configuration file, which is detailed in the [`pypiper` documentation](http://pypiper.readthedocs.io/en/latest/advanced.html#pipeline-config-files). +Finally, if you're using [the pypiper framework](https://github.com/databio/pypiper) to develop pipelines, +it uses a pipeline-specific configuration file, which is detailed in the [pypiper documentation](http://pypiper.readthedocs.io/en/latest/advanced.html#pipeline-config-files). Essentially, each pipeline may provide a configuration file describing where software is, and parameters to use for tasks within the pipeline. This configuration file is by default named like pipeline name, -with a `.yaml` extension instead of `.py`. For example, by default `rna_seq.py` looks for an accompanying `rna_seq.yaml` file. -These files can be changed on a per-project level using the `pipeline_config` section of a [project configuration file](define-your-project). +with a `.yaml` extension instead of `.py`. For example, by default `rna_seq.py` looks for an accompanying `rna_seq.yaml` file. 
diff --git a/docs/containers.md b/docs/containers.md index 0a854937e..309ac673b 100644 --- a/docs/containers.md +++ b/docs/containers.md @@ -2,7 +2,7 @@ Because `looper` uses `divvy` for computing configuration, running jobs in containers is easy! `Divvy` can use the same template system to do either cluster computing or to run jobs in linux containers (for example, using `docker` or `singularity`). You can even run jobs in a container *on a cluster*. -All you need to do is follow the same instructions as in [running jobs on a cluster](cluster-computing.md), but use templates that run those jobs in containers. To see examples of how to do this, refer to the [divvy docs on running containers](http://code.databio.org/divvy/containers/). +All you need to do is follow the same instructions as in [running jobs on a cluster](running-on-a-cluster.md), but use templates that run those jobs in containers. To see examples of how to do this, refer to the [divvy docs on running containers](http://divvy.databio.org/en/latest/containers/). ## Overview @@ -61,4 +61,4 @@ srun singularity exec instance://{JOBNAME}_image {CODE} singularity instance.stop {JOBNAME}_image ``` -Notice how these values will be used to populate a template that will run the pipeline in a container. Now, to use singularity, you just need to activate this compute package in the usual way, which is using the `compute` argument: ``looper run --compute singularity_slurm``. +Notice how these values will be used to populate a template that will run the pipeline in a container. Now, to use singularity, you just need to activate this compute package in the usual way, which is using the `package` argument: ``looper run --package singularity_slurm``. 
diff --git a/docs/define-your-project.md b/docs/define-your-project.md deleted file mode 100644 index e35c463f4..000000000 --- a/docs/define-your-project.md +++ /dev/null @@ -1,5 +0,0 @@ -# How to define a project - -Most pipelines require a unique way to organize samples, but `looper` subscribes to [standard Portable Encapsulated Project (PEP) format](http://pepkit.github.io). PEP is a standardized way to represent metadata about your project and each of its samples. If you follow this format, then your project can be read not only by `looper`, but also by other software, like the [pepr R package](http://github.com/pepkit/pepr), or the [peppy python package](http://github.com/pepkit/peppy). You should read the instructions on [how to create a PEP](https://pepkit.github.io/docs/simple_example/) to use with `looper`. - -So, the first thing you should do is follow the [instructions for how to make a PEP](https://pepkit.github.io/docs/simple_example/). Once you've have a basic PEP created, the next section shows you [how to add looper-specific configuration to the PEP config file](project-config-looper.md), or you can jump ahead to [linking a project to a pipeline](linking-a-pipeline.md). diff --git a/docs/defining-a-project.md b/docs/defining-a-project.md new file mode 100644 index 000000000..1bcf39656 --- /dev/null +++ b/docs/defining-a-project.md @@ -0,0 +1,98 @@ +# How to define a project + +## 1. Start with a basic PEP + +To start, you need a project defined in the [standard Portable Encapsulated Project (PEP) format](http://pep.databio.org). Start by [creating a PEP](https://pep.databio.org/en/latest/simple_example/). + +## 2. Connect the PEP to looper + +Once you have a basic PEP, you can connect it to looper. Just provide the required looper-specific piece of information -- `output_dir`, a parent folder where you want looper to store your results. You do this by adding a `looper` section to your PEP. 
The `output_dir` key is expected in the top level of the `looper` section of the project configuration file. Here's an example: + +```yaml +looper: + output_dir: "/path/to/output_dir" +``` + + +## 3. Link a pipeline to your project + +Next, you'll need to point the PEP to the *pipeline interface* file that describes the command you want looper to run. + +### Understanding pipeline interfaces + +Looper links projects to pipelines through a file called the *pipeline interface*. Any looper-compatible pipeline must provide a pipeline interface. To link the pipeline, you simply point each sample to the pipeline interfaces for any pipelines you want to run. + +Looper pipeline interfaces can describe two types of pipeline: sample-level pipelines or project-level pipelines. Briefly, a sample-level pipeline is executed with `looper run`, which runs individually on each sample. A project-level pipeline is executed with `looper runp`, which runs a single job *per pipeline* on an entire project. Typically, you'll first be interested in the sample-level pipelines. You can read in more detail in the [pipeline tiers documentation](pipeline-tiers.md). + +### Adding a sample-level pipeline interface + +Sample pipelines are linked by adding a sample attribute called `pipeline_interfaces`. There are 2 easy ways to do this: you can simply add a `pipeline_interfaces` column in the sample table, or you can use an *append* modifier, like this: + +```yaml +sample_modifiers: + append: + pipeline_interfaces: "/path/to/pipeline_interface.yaml" +``` + +The value for the `pipeline_interfaces` key should be the *absolute* path to the pipeline interface file. The paths may also contain environment variables. Once your PEP is linked to the pipeline, you just need to make sure your project provides any sample metadata required by the pipeline. 
+ +### Adding a project-level pipeline interface + +Project pipelines are linked in the `looper` section of the project configuration file: + +``` +looper: + pipeline_interfaces: "/path/to/project_pipeline_interface.yaml" +``` + +### How to link to multiple pipelines + +Looper decouples projects and pipelines, so you can have many projects using one pipeline, or many pipelines running on the same project. If you want to run more than one pipeline on a sample, you can simply add more than one pipeline interface, like this: + +```yaml +sample_modifiers: + append: + pipeline_interfaces: ["/path/to/pipeline_interface.yaml", "/path/to/pipeline_interface2.yaml"] +``` + +Looper will submit jobs for both of these pipelines. + +If you have a project that contains samples of different types, then you can use an `imply` modifier in your PEP to select which pipelines you want to run on which samples, like this: + + +```yaml +sample_modifiers: + imply: + - if: + protocol: "RRBS" + then: + pipeline_interfaces: "/path/to/pipeline_interface.yaml" + - if: + protocol: "ATAC" + then: + pipeline_interfaces: "/path/to/pipeline_interface2.yaml" +``` + + +## 4. Customize looper + +That's all you need to get started linking your project to looper. But you can also customize things further. Under the `looper` section, you can provide a `cli` keyword to specify any command line (CLI) options from within the project config file. The subsections within this section direct the arguments to the respective `looper` subcommands. So, to specify, e.g. sample submission limit for a `looper run` command use: + +```yaml +looper: + output_dir: "/path/to/output_dir" + cli: + run: + limit: 2 +``` + +or, to pass this argument to any subcommand: + +```yaml +looper: + output_dir: "/path/to/output_dir" + cli: + all: + limit: 2 +``` + +Keys in the `cli.` section *must* match the long argument parser option strings, so `command-extra`, `limit`, `dry-run` and so on. 
For more CLI options refer to the subcommands [usage](usage.md). \ No newline at end of file diff --git a/docs/faq.md b/docs/faq.md index 9aa7e8827..3088527da 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -13,7 +13,7 @@ You can add that location to your path by appending it (`export PATH=$PATH:~/.lo ## How can I run my jobs on a cluster? -Looper uses the external package [divvy](http://code.databio.org/divvy) for cluster computing, making it flexible enough to use with any cluster resource environment. Please see the [tutorial on cluster computing with looper and divvy](cluster-computing.md). +Looper uses the external package [divvy](http://code.databio.org/divvy) for cluster computing, making it flexible enough to use with any cluster resource environment. Please see the [tutorial on cluster computing with looper and divvy](running-on-a-cluster.md). ## What's the difference between `looper` and `pypiper`? @@ -36,7 +36,7 @@ As of version `0.11`, you can use `looper rerun` to submit only jobs with a `fai You may notice that the compute config file does not specify resources to request (like memory, CPUs, or time). Yet, these are required in order to submit a job to a cluster. **Resources are not handled by the divcfg file** because they not relative to a particular computing environment; instead they vary by pipeline and sample. As such, these items should be defined at other stages. -Resources defined in the `pipeline_interface.yaml` file (`pipelines` section) that connects looper to a pipeline. The reason for this is that the pipeline developer is the most likely to know what sort of resources her pipeline requires, so she is in the best position to define the resources requested. For more information on how to adjust resources, see the `pipelines` section of the [pipeline interface page](pipeline-interface.md). If all the different configuration files seem confusing, now is a good time to review [who's who in configuration files](config-files.md). 
+Resources are defined in the `pipeline_interface.yaml` file that connects looper to a pipeline. The reason for this is that pipeline developers are the most likely to know what sort of resources their pipeline requires, so they are in the best position to define the resources requested. For more information on how to adjust resources, see the `compute` section of the [pipeline interface page](pipeline-interface-specification.md). If all the different configuration files seem confusing, now is a good time to review [who's who in configuration files](config-files.md). ## Which configuration file has which settings? diff --git a/docs/initialize.md b/docs/initialize.md new file mode 100644 index 000000000..281bbf4e9 --- /dev/null +++ b/docs/initialize.md @@ -0,0 +1,21 @@ +# How to initialize a looper repository + +*This is considered a beta feature and may change in future releases*. + +Looper provides a command `looper init` that allows you to initialize folders as looper repositories. This enables you to use `looper` without passing your PEP every time. + +```bash +looper init pep.yaml +``` + +Now, as long as you are operating from within this directory or any of the subdirectories, you can run any looper command without passing `pep.yaml`: + +```bash +looper run +``` + +The `looper init` command creates a dotfile called `.looper.yaml` in the current directory. This file simply points looper to the config file passed as a positional argument to `looper init`: + +```yaml +config_file_path: relative/path/to/pep.yaml +``` diff --git a/docs/linking-a-pipeline.md b/docs/linking-a-pipeline.md deleted file mode 100644 index 665586292..000000000 --- a/docs/linking-a-pipeline.md +++ /dev/null @@ -1,19 +0,0 @@ -# How to link a project to a pipeline - -One of the advantages of looper is that it decouples projects and pipelines, so you can have many projects that all use the same pipeline, or many pipelines running on the same project. 
This modular connection between pipelines and projects happens through a file called the `pipeline interface`. The `pipeline interface` tells `looper` how to run the pipeline. - -**If you're using one or more existing looper-compatible pipelines**, all you have to do is point your project config file at the `pipeline interface` files for any pipelines your project needs. For most casual users of pipelines, that's all you'll need to do; you'll never need to create a new `pipeline interface` file. But **if you do need to make a new pipeline looper-compatible**, you do this by creating a `pipeline interface` file, which is explained in [Writing a pipeline interface](pipeline-interface.md). - -## Pointing your PEP to an existing pipeline interface file - -Many projects will require only existing pipelines that are already looper-compatible. We maintain a (growing) list of public [looper-compatible pipelines](https://github.com/pepkit/hello_looper/blob/master/looper_pipelines.md) that will get you started. To use one of these pipelines, first clone the desired code repository. Then, use the `pipeline_interfaces` key in the `metadata` section of a project config file to point your project to that `pipeline_interface` file: - -```yaml - metadata: - pipeline_interfaces: /path/to/cloned/pipeline_interface.yaml -``` - -The value for the `pipeline_interfaces` key should be the *absolute* path to the pipeline interface file. After that, you just need to make sure your project definition provides all the necessary sample metadata required by the pipeline you want to use. For example, you will need to make sure your sample annotation sheet specifies the correct value under `protocol` that your linked pipeline understands. -Such details are specific to each pipeline and should be defined somewhere in the pipeline's documentation, e.g. in a `README` file. - -You can also [link more than one pipeline](linking-multiple-pipelines.md). 
diff --git a/docs/linking-multiple-pipelines.md b/docs/linking-multiple-pipelines.md deleted file mode 100644 index b0e1fec95..000000000 --- a/docs/linking-multiple-pipelines.md +++ /dev/null @@ -1,14 +0,0 @@ -# How to link to multiple pipelines - -If you have a project that contains samples of different types, then you may need to **link more than one pipeline** to your project. You do this by simply adding other `pipeline interface` files to a list in the `metadata.pipeline_interfaces` field, like this: - -```yaml - metadata: - pipeline_interfaces: [/path/pipeline_interface1.yaml, /path/pipeline_interface2.yaml] -``` - - -In this case, for a given sample, looper will first look in `pipeline_interface1.yaml` to see if appropriate pipeline exists for this sample type. If it finds one, it will use this pipeline (or set of pipelines, as specified in the `protocol_mappings` section of the ``pipeline_interface.yaml` file). Having submitted a suitable pipeline it will ignore the pipeline_interface2.yaml interface. However if there is no suitable pipeline in the first interface, looper will check the second and, if it finds a match, will submit that. If no suitable pipelines are found in any of the interfaces, the sample will be skipped as usual. - -If your project contains samples with different protocols, you can use this to run several different pipelines. For example, if you have ATAC-seq, RNA-seq, and ChIP-seq samples in your project, you may want to include a `pipeline interface` for 3 different pipelines, each accepting one of those protocols. In the event that more than one of the `pipeline interface` files provide pipelines for the same protocol, looper will only submit the pipeline from the first interface. Thus, this list specifies a *priority order* to pipeline repositories. 
- diff --git a/docs/parameterizing-pipelines.md b/docs/parameterizing-pipelines.md new file mode 100644 index 000000000..66cca4cd8 --- /dev/null +++ b/docs/parameterizing-pipelines.md @@ -0,0 +1,69 @@ +# How to pass extra command-line arguments + +Occasionally, a particular project needs to run a particular flavor of a pipeline. We'd like to just adjust the arguments passed for just this project. +We may be passing a completely separate config file to the pipeline, or just tweaking a command-line argument. Either way, we treat things the same way. + +Looper provides a feature called *command extras* to solve this problem. Command extras provide a way to pass arbitrary commands through looper on to the pipeline. This *extra* information can be specified on the command line, or at the sample or project level, depending on the pipeline. + +## Sample-level command extras + +For sample pipelines, there are two possibilities: 1) command line argument for `run` subcommand and 2) setting sample attribute using general PEP sample modifiers to add a `command_extra` attribute to any samples, however you wish. 
+ +You can pass extra arguments using `--command-extra` like this: + +``` +looper run project_config.yaml --command-extra="--flavor-flag" +``` + +For the PEP-based approach, for example, if your extras are the same for all samples you could just use an `append` modifier: + + +```yaml +sample_modifiers: + append: + command_extra: "--flavor-flag" +``` + +Or, if you need to modulate on the basis of some other attribute value, you could use an imply modifier: + + +```yaml +sample_modifiers: + imply: + - if: + protocol: "rrbs" + then: + command_extra: "-C flavor.yaml --epilog" +``` + +## Project-level command extras + +For *project pipelines*, you can specify command extras in the `looper` section of the PEP config: + +```yaml +looper: + output_dir: "/path/to/output_dir" + cli: + runp: + command-extra: "--flavor" +``` + +or as an argument to the `looper runp` command: + + +```bash +looper runp project_config.yaml --command-extra="--flavor-flag" +``` + + +## Overriding PEP-based command extras + +By default, the CLI extras are *appended to the command_extra specified in your PEP*. If you instead want to *override* the command extras listed in the PEP, you can instead use `--command-extra-override`. + +So, for example, make your looper call like this: + +```bash +looper run --command-extra-override="-R" +``` + +That will remove any defined command extras and append `-R` to the end of any commands created by looper. diff --git a/docs/project-config-looper.md b/docs/pipeline-config.md similarity index 94% rename from docs/project-config-looper.md rename to docs/pipeline-config.md index 655b2a15b..9975a89df 100644 --- a/docs/project-config-looper.md +++ b/docs/pipeline-config.md @@ -1,12 +1,11 @@ # Configure a PEP to work with looper -Once you have a basic [PEP config](https://pepkit.github.io/docs/project_config/) file, you can add some special sections to control `looper` features. 
In addition to the main sections, `looper` adds these sections: -### Project config section: `pipeline_config` +### Occasionally, a particular project needs to run a particular flavor of a pipeline. Rather than creating an entirely new pipeline, you can parameterize the differences with a **pipeline config** file, -and then specify that file in the **project config** file. +It used to be that you could add a `pipeline_config` section in the **project config** file. **Example**: @@ -20,7 +19,14 @@ pipeline_config: # Or you can point to a specific config to be used in this project: wgbs.py: wgbs_flavor1.yaml ``` + command_template: > + {% if project.looper.pipeline_config is defined %} -C {project.looper.pipeline_config}{% endif %} + + +Now we simply want to do: + +{} This will instruct `looper` to pass `-C wgbs_flavor1.yaml` to any invocations of wgbs.py (for this project only). Your pipelines will need to understand the config file (which will happen automatically if you use pypiper). diff --git a/docs/pipeline-interface-specification.md b/docs/pipeline-interface-specification.md new file mode 100644 index 000000000..e3efdd0c2 --- /dev/null +++ b/docs/pipeline-interface-specification.md @@ -0,0 +1,212 @@ +--- +title: Pipeline interface specification +--- + +

Pipeline interface specification

+ +Table of contents: + +[TOC] + +## Introduction + +In order to run an arbitrary pipeline, we require a formal specification for how the pipeline is to be used. We define this using a *pipeline interface* file. It maps attributes of a PEP project or sample to the pipeline CLI arguments. Thus, it defines the interface between the project metadata (the PEP) and the pipeline itself. + +If you're using *existing* `looper`-compatible pipelines, you don't need to create a new interface; just [point your project at the one that comes with the pipeline](defining-a-project.md). When creating *new* `looper`-compatible pipelines, you'll need to create a new pipeline interface file. + + + +## Overview of pipeline interface components + +A pipeline interface may contain the following keys: + +- `pipeline_name` (REQUIRED) - A string identifying the pipeline, +- `pipeline_type` (REQUIRED) - A string indicating a pipeline type: "sample" (for `run`) or "project" (for `runp`), +- `command_template` (REQUIRED) - A [Jinja2](https://jinja.palletsprojects.com/en/2.11.x/) template used to construct a pipeline command command to run. +- `path` (RECOMMENDED) - The path to the pipeline script, relative to the pipeline interface. +- `input_schema` (RECOMMENDED) - A [PEP Schema](http://eido.databio.org) formally defining *required inputs* for the pipeline +- `output_schema` (RECOMMENDED) - A schema describing the *outputs* of the pipeline +- `compute` (RECOMMENDED) - Settings for computing resources +- `sample_yaml_path` (OPTIONAL) - Path to sample yaml files produced by looper. + +The pipeline interface should define either a sample pipeline or a project pipeline. Here's a simple example: + +```yaml +pipeline_name: RRBS +pipeline_type: sample +path: path/to/rrbs.py +input_schema: path/to/rrbs_schema.yaml +command_template: {pipeline.path} --input {sample.data_path} +``` + +Pretty simple. The `pipeline_name` is arbitrary. It's used for messaging and identification. 
Ideally, it's unique to each pipeline. In this example, we define a single sample-level pipeline. + +## Details of pipeline interface components + +### pipeline_name + +The pipeline name is arbitrary. It should be unique for each pipeline. Looper uses it for a few things: + +1. to construct the `job_name` variable (accessible via `{ looper.job_name }`). See [variable namespaces](variable-namespaces.md) for more details. + +2. to check for flags. For pipelines that produce flags, looper will be aware of them and not re-submit running jobs. + +### pipeline_type + +Looper can run 2 kinds of pipeline: *sample pipelines* run once per sample; *project pipelines* run once per project. The type of pipeline must be specified in the pipeline interface as `pipeline_type: sample` or `pipeline_type: project`. + +### command_template + +The command template is the most critical part of the pipeline interface. It is a [Jinja2](https://jinja.palletsprojects.com/) template for the command to run for each sample. Within the `command_template`, you have access to variables from several sources. These variables are divided into namespaces depending on the variable source. You can access the values of these variables in the command template using the single-brace jinja2 template language syntax: `{namespace.variable}`. For example, looper automatically creates a variable called `job_name`, which you may want to pass as an argument to your pipeline. You can access this variable with `{looper.job_name}`. The available namespaces are described in detail in [looper variable namespaces](variable-namespaces.md). + +Because it's based on Jinja2, command templates are extremely flexible. 
For example, optional arguments can be accommodated using Jinja2 syntax, like this: + +``` +command_template: > + {pipeline.path} + --sample-name {sample.sample_name} + --genome {sample.genome} + --input {sample.read1} + --single-or-paired {sample.read_type} + {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} + {% if sample.peak_caller is defined %} --peak-caller {sample.peak_caller} {% endif %} + {% if sample.FRIP_ref is defined %} --frip-ref-peaks {sample.FRIP_ref} {% endif %} +``` + +Arguments wrapped in Jinja2 conditionals will only be added *if the specified attribute exists for the sample*. + +### path + +Absolute or relative path to the script or command for this pipeline. Relative paths are considered **relative to your pipeline_interface file**. We strongly recommend using relative paths where possible to keep your pipeline interface file portable. You may also use shell environment variables (like `${HOME}`) in the `path`. You can then use this variable to refer to the pipeline command to execute by using `{pipeline.path}` in the `command_template`. + +The `path` attribute is not necessary; it is possible to simply include the relative path to the pipeline inside the `command_template` directly. However, we recommend using `path` instead, and then referring to it in the command_template using `{pipeline.path}`, because this indicates more clearly what the base script of the pipeline is. + +### input_schema + +The input schema formally specifies the *input processed by this pipeline*. The input schema serves 2 related purposes: + +1. **Validation**. Looper uses the input schema to ensure that the project fulfills all pipeline requirements before submitting any jobs. Looper uses the PEP validation tool, [eido](http://eido.databio.org), to validate input data by ensuring that input samples have the attributes and input files required by the pipeline. 
Looper will only submit a sample pipeline if the sample validates against the pipeline's input schema. + +2. **Description**. The input schema is also useful to describe the inputs, including both required and optional inputs, thereby providing a standard way to describe a pipeline's inputs. In the schema, the pipeline author can describe exactly what the inputs mean, making it easier for users to learn how to structure a project for the pipeline. + +Details for how to write a schema in in [writing a schema](http://eido.databio.org/en/master/writing-a-schema/). The input schema format is an extended [PEP JSON-schema validation framework](http://pep.databio.org/en/latest/howto_validate/), which adds several capabilities, including + +- `required` (optional): A list of sample attributes (columns in the sample table) that **must be defined** +- `required_files` (optional): A list of sample attributes that point to **input files that must exist**. +- `files` (optional): A list of sample attributes that point to input files that are not necessarily required, but if they exist, should be counted in the total size calculation for requesting resources. + +If no `input_schema` is included in the pipeline interface, looper will not be able to validate the samples and will simply submit each job without validation. + +### output_schema + +The output schema formally specifies the *output produced by this pipeline*. It is used by downstream tools to that need to be aware of the products of the pipeline for further visualization or analysis. Like the input schema, it is based on the extended [PEP JSON-schema validation framework](http://pep.databio.org/en/latest/howto_schema/), but adds looper-specific capabilities. The base schema has two *properties* sections, one that pertains to the project, and one that pertains to the samples. The *properties* sections for both sample and project will recognize these attributes: + +- `title`, following the base JSON-schema spec. 
+- `description`, following the base JSON-schema spec. +- `path`, used to specify a relative path to an output file. The value in the `path` attribute is a template for a path that will be populated by sample variables. Sample variables can be used in the template using brace notation, like `{sample_attribute}`. +- `thumbnail_path`, templates similar to the `path` attribute, but used to specify a thumbnail output version. +- `type`, the data type of this output. Can be one of: link, image, file. + +The attributes added under the *Project properties* section are assumed to be project-level outputs, whereas attributes under the `samples` object are sample-level outputs. Here is an example output schema: + +``` +description: objects produced by PEPPRO pipeline. +properties: + samples: + type: array + items: + type: object + properties: + smooth_bw: + path: "aligned_{genome}/{sample_name}_smooth.bw" + type: string + description: "A smooth bigwig file" + aligned_bam: + path: "aligned_{genome}/{sample_name}_sort.bam" + type: string + description: "A sorted, aligned BAM file" + peaks_bed: + path: "peak_calling_{genome}/{sample_name}_peaks.bed" + type: string + description: "Peaks in BED format" + tss_file: + title: "TSS enrichment file" + description: "Plots TSS scores for each sample." + thumbnail_path: "summary/{name}_TSSEnrichment.png" + path: "summary/{name}_TSSEnrichment.pdf" + type: image + counts_table: + title: "Project peak coverage file" + description: "Project peak coverages: chr_start_end X sample" + path: "summary/{name}_peaks_coverage.tsv" + type: link +``` + +Looper uses the output schema in its `report` function, which produces a browsable HTML report summarizing the pipeline results. The output schema provides the relative locations to sample-level and project-level outputs produced by the pipeline, which looper can then integrate into the output results. 
If the output schema is not included, the `looper report` will be unable to locate and integrate the files produced by the pipeline and will therefore be limited to simple statistics. + +### compute + +The compute section of the pipeline interface provides a way to set compute settings at the pipeline level. These variables can then be accessed in the command template. They can also be overridden by values in the PEP config, or on the command line. See the [looper variable namespaces](variable-namespaces.md) for details. + +There are two reserved attributes under `compute` with specialized behavior: `size_dependent_variables` and `dynamic_variables_command_template`, which we'll now describe in detail. + +#### size_dependent_variables + +The `size_dependent_variables` section lets you specify variables with values that are modulated based on the total input file size for the run. This is typically used to add variables for memory, CPU, and clock time to request, if they depend on the input file size. Specify variables by providing a relative path to a `.tsv` file that defines the variables as columns, with input sizes as rows. + +The pipeline interface simply points to a `tsv` file: + +```yaml +pipeline_type: sample +path: pipelines/pepatac.py +command_template: > + {pipeline.path} ... +compute: + size_dependent_variables: resources-sample.tsv +``` + +The `resources-sample.tsv` file consists of a file with at least 1 column called `max_file_size`. Add any other columns you wish, each one will represent a new attribute added to the `compute` namespace and available for use in your command template. Here's an example: + +```tsv +max_file_size cores mem time +0.001 1 8000 00-04:00:00 +0.05 2 12000 00-08:00:00 +0.5 4 16000 00-12:00:00 +1 8 16000 00-24:00:00 +10 16 32000 02-00:00:00 +NaN 32 32000 04-00:00:00 +``` + +This example will add 3 variableS: `cores`, `mem`, and `time`, which can be accessed via `{compute.cores}`, `{compute.mem}`, and `{compute.time}`. 
Each row defines a "package" of variable values.
You can override the default by specifying a `sample_yaml_path` attribute in the pipeline interface: + +``` +sample_yaml_path: {sample.sample_name}.yaml +``` + +This attribute, like the `command_template`, has access to any of the looper namespaces, in case you want to use them in the names of your sample yaml files. + +## Validating a pipeline interface + +A pipeline interface can be validated using JSON Schema against [schema.databio.org/pipelines/pipeline_interface.yaml](http://schema.databio.org/pipelines/pipeline_interface.yaml). diff --git a/docs/pipeline-interface.md b/docs/pipeline-interface.md deleted file mode 100644 index d445fc150..000000000 --- a/docs/pipeline-interface.md +++ /dev/null @@ -1,221 +0,0 @@ -# How to write a pipeline interface - -If you're using *existing* `looper`-compatible pipelines, you don't need to create a new interface; just [point your project at the one that comes with the pipeline](linking-a-pipeline.md). When creating *new* `looper`-compatible pipelines, you'll need to create a new pipeline interface file. Regardless of what pipelines you use, you will need to tell looper how to communicate with your pipeline. -That communication is defined in a **pipeline interface**, which is a `yaml` file with two sections: - -1. `protocol_mapping` - maps sample `protocol` (the assay type, sometimes called "library" or "library strategy") to one or more pipeline program -2. `pipelines` - describes the arguments and resources required by each pipeline - -Let's start with a simple example. The pipeline interface file may look like this: - -```yaml -protocol_mapping: - RRBS: rrbs_pipeline - -pipelines: - rrbs_pipeline: - name: RRBS - path: path/to/rrbs.py - arguments: - "--sample-name": sample_name - "--input": data_path -``` - -The first section specifies that samples of protocol `RRBS` will be mapped to the pipeline specified by key `rrbs_pipeline`. 
-The second section describes where the pipeline with key `rrbs_pipeline` is located and what command-line arguments it requires. -Pretty simple. Let's go through these 2 sections in more detail: - -### Protocol mapping section - -The `protocol_mapping` section explains how looper should map from a sample protocol -(like `RNA-seq`, which is a column in your annotation sheet) to a particular pipeline (like `rnaseq.py`), or group of pipelines. -Here's how to build `protocol_mapping`: - -**Case 1:** one protocol maps to one pipeline. Example: `RNA-seq: rnaseq.py` -Any samples that list "RNA-seq" under `library` will be run using the `rnaseq.py` pipeline. -You can list as many library types as you like in the protocol mapping, -mapping to as many pipelines as you configure in your `pipelines` section. - -Example: - -```yaml -protocol_mapping: - RRBS: rrbs.py - WGBS: wgbs.py - EG: wgbs.py - ATAC: atacseq.py - ATAC-SEQ: atacseq.py - CHIP: chipseq.py - CHIP-SEQ: chipseq.py - CHIPMENTATION: chipseq.py - STARR: starrseq.py - STARR-SEQ: starrseq.py -``` - -**Case 2:** one protocol maps to multiple *independent* pipelines. - -Example: - -```yaml -protocol_mapping - Drop-seq: quality_control.py, dropseq.py -``` - -You can map multiple pipelines to a single protocol if you want samples of a type to kick off more than one pipeline run. -The basic formats for independent pipelines (i.e., they can run concurrently): - -Example A: -```yaml -protocol_mapping: - SMART-seq: > - rnaBitSeq.py -f, - rnaTopHat.py -f -``` - - -Example B: -```yaml -protocol_mapping: - PROTOCOL: [pipeline1, pipeline2, ...] -``` - -**Case 3:** a protocol runs one pipeline which depends on another. - -*Warning*: This feature (pipeline dependency) is not implemented yet. This documentation describes a protocol that may be implemented in the future, if it is necessary to have dependency among pipeline submissions. - -Use *semicolons to indicate dependency*. 
- -Example: -```yaml -protocol_mapping: - WGBSQC: > - wgbs.py; - (nnm.py, pdr.py) -``` - -### Pipelines section -The `pipelines` section defines important information about each pipeline, including its name, location on disk/web, and optional or required command-line arguments. -In addition, if you're using a cluster resource manager, it also specifies which compute resources to request. -For each pipeline, you specify values for a few specific keys. - -Let's start with a **single-pipeline example**: - -```yaml -pipelines: - pipeline_key: # this is variable (script filename) - name: pipeline_name # used for assessing pipeline flags (optional) - path: relative/path/to/pipeline_script.py - looper_args: True - arguments: - "-k" : value - "--key2" : value - "--key3" : null # value-less argument flags - resources: - default: - file_size: "0" - cores: "4" - mem: "6000" - time: "2-00:00:00" - resource_package_name: - file_size: "2" - cores: "4" - mem: "6000" - time: "2-00:00:00" -``` - -Each pipeline gets its own section (here there's just one: `pipeline_key`). -The particular keys that you may specify for each pipeline are: - -- `path` (required): Absolute or relative path to the script for this pipeline. Relative paths are considered **relative to your pipeline_interface file**. -We strongly recommend using relative paths where possible to keep your pipeline interface file portable. You may also use shell environment variables (like `${HOME}`) in the `path`. -- `arguments` (required): List of key-value pairs of arguments required by the pipeline. -The key corresponds verbatim to the string that will be passed on the command line to the pipeline (i.e., the absolute, quoted name of the argument, like `"--input"`). -The value corresponds to an attribute of the sample, which will be derived from the sample_annotation csv file. -In other words, it's a column name of your sample annotation sheet. 
Looper will find the value of this attribute for each sample and pass that to the pipeline as the value for that argument. -For flag-like arguments that lack a value, you may specify `null` as the value (e.g. `"--quiet-mode": null`). -These arguments are considered *required*, and `looper` will not submit a pipeline if a sample lacks an attribute that is specified as a value for an argument. -- `name` (recommended): Name of the pipeline. This is used to assess pipeline flags (if your pipeline employs them, like `pypiper` pipelines). -- `optional_arguments`: Any arguments listed in this section will be passed to the pipeline *if the specified attribute exists for the sample*. -These are considered optional, and so the pipeline will still be submitted if they are not provided. -- `required_input_files` (optional): A list of sample attributes (annotation sheets column names) that will point to input files that must exist. -- `all_input_files` (optional): A list of sample attributes (annotation sheet column names) that will point to input files that are not required, but if they exist, should be counted in the total size calculation for requesting resources. -- `ngs_input_files` (optional): For pipelines using sequencing data, provide a list of sample attributes (annotation sheet column names) that will point to input files to be used for automatic detection of `read_length` and `read_type` sample attributes. 
-- `looper_args` (optional): Provide `True` or `False` to specify if this pipeline understands looper args, which are then automatically added for: - - `-C`: config_file (the pipeline config file specified in the project config file; or the default config file, if it exists) - - `-P`: cores (the number of processing cores specified by the chosen resource package) - - `-M`: mem (memory limit) -- `resources` (recommended): A section outlining how much memory, CPU, and clock time to request, modulated by input file size -If the `resources` section is missing, looper will only be able to run the pipeline locally (not submit it to a cluster resource manager). -If you provide a `resources` section, you must define at least 1 option named 'default' with `file_size: "0"`. -Then, you define as many more resource "packages" or "bundles" as you want. -- `outputs`: key-value pairs in which each key is a name for a kind of output file (or group of them) that a pipeline may produce, and the value is a template template for a path that will be populated by sample variables - -**More on `resources`** - -The `resources` section can be a bit confusing--think of it like a group of steps of increasing size. -The first step (default) starts at 0, and this step will catch any files that aren't big enough to get to the next level. -Each successive step is larger. -Looper determines the size of your input file, and then iterates over the resource packages until it can't go any further; -that is, the `file_size` of the package is bigger (in gigabytes) than the input file size of the sample. -At this point, iteration stops and looper has selected the best-fit resource package for that sample--the smallest package that is still big enough. - -Add as many additional resource sets as you want, with any names. Looper will determine which resource package to use based on the `file_size` of the input file. 
-It will select the lowest resource package whose `file_size` attribute does not exceed the size of the input file. -Because the partition or queue name is relative to your environment, we don't usually specify this in the `resources` section, but rather, in the `pepenv` config. -So, `file_size: "5"` means 5 GB. This means that resource package only will be used if the input files total size is greater than 5 GB. - -**More extensive example:** - -```yaml -pipelines: - rrbs: - name: RRBS - looper_args: True - path: path/to/rrbs.py - arguments: - "--sample-name": sample_name - "--genome": genome - "--input": data_path - "--single-or-paired": read_type - resources: - default: - file_size: "0" - cores: "4" - mem: "4000" - time: "2-00:00:00" - high: - file_size: "4" - cores: "6" - mem: "4000" - time: "2-00:00:00" - - rnaBitSeq.py: - looper_args: True - arguments: - "--sample-name": sample_name - "--genome": transcriptome - "--input": data_path - "--single-or-paired": read_type - resources: - default: - file_size: "0" - cores: "6" - mem: "6000" - time: "2-00:00:00" - - atacseq.py: - arguments: - "--sample-yaml": yaml_file - "-I": sample_name - "-G": genome - looper_args: True - resources: - default: - file_size: "0" - cores: "4" - mem: "8000" - time: "08:00:00" - outputs: - smoothed_bw: "aligned_{sample.genome}/{sample.name}_smoothed.bw" - pre_smoothed_bw: "aligned_{project.prealignments}/{sample.name}_smoothed.bw" -``` diff --git a/docs/pipeline-tiers.md b/docs/pipeline-tiers.md new file mode 100644 index 000000000..13c2593b6 --- /dev/null +++ b/docs/pipeline-tiers.md @@ -0,0 +1,19 @@ +# The concept of two-tiered pipelines + +In our experience, we are typically interested in running two different types of commands: Those that operate on each sample independently, and those that operate on all samples simultaneously. Since sample-independent pipelines can be easily parallelized by sample, we distinguish these. 
+ +Looper divides pipelines into two types: *sample* pipelines and *project* pipelines. + +This philosophy is conceptually similar to the [MapReduce](https://en.wikipedia.org/wiki/MapReduce) programming model, which applies a *split-apply-combine* strategy. In the case of running pipelines on sample-intensive research projects, we *split* the project into samples and *apply* the first tier of processing (the *sample* pipeline). We then *combine* the results in the second tier of processing (the *project* pipeline). + +Looper doesn't require you to use this two-stage system; it simply makes it easy to do so. Many pipelines operate only at the sample level and leave the downstream cross-sample analysis to the user. + +## Sample pipelines + +The typical use case is sample-level pipelines. These are run with `looper run`. A pipeline interface defining a sample pipeline must include a `pipeline_type: "sample"` statement. + +## Project pipelines + +Project pipelines, identified by a `pipeline_type: "project"` statement in the pipeline interface, will be run with `looper runp` (where the *p* stands for *project*). Running a project pipeline operates in almost exactly the same way as the sample pipeline, with 2 key exceptions: First, instead of creating a separate command for every sample, `looper runp` will only create a single command per pipeline for the project. And second, the command template itself will not have access to a `sample` namespace representing a particular sample, since it's not running on a particular sample; instead, it will have access to a `samples` (plural) namespace, which contains all the attributes from all the samples. + +In a typical workflow, a user will first run the samples individually using `looper run`, and then, if the pipeline provides one, will run the project component using `looper runp` to summarize or aggregate the results into a project-level output.
diff --git a/docs/running-a-pipeline.md new file mode 100644 index 000000000..38654c9f6 --- /dev/null +++ b/docs/running-a-pipeline.md @@ -0,0 +1,19 @@ +# How to run a pipeline + +You first have to [define your project](defining-a-project.md). This will give you a PEP linked to a pipeline. Next, we'll run the pipeline. + +The basic command is `looper run`. To run your pipeline, just: + +```console +looper run project_config.yaml +``` + +This will submit a job for each sample. That's basically all there is to it; after this, there are a lot of powerful options and tweaks you can do to control your jobs. Here we'll just mention a few of them. + +- **Dry runs**. You can use `-d, --dry-run` to create the job submission scripts, but not actually run them. This is really useful for testing that everything is set up correctly before you commit to submitting hundreds of jobs. +- **Limiting the number of jobs**. You can use `-l, --limit` to test a few before running all samples. You can also use the `--sel-*` arguments to select certain samples to include or exclude. +- **Grouping jobs**. You can use `-u, --lump` or `-n, --lumpn` to group jobs. [More details on grouping jobs](grouping-jobs.md). +- **Changing compute settings**. You can use `-p, --package`, `-s, --settings`, or `-c, --compute` to change the compute templates. Read more in [running on a cluster](running-on-a-cluster.md). +- **Time delay**. You can stagger submissions to not overload a submission engine using `--time-delay`. +- **Use rerun to resubmit jobs**. To run only jobs that previously failed, try `looper rerun`. +- **Tweak the command on-the-fly**. The `--command-extra` arguments allow you to pass extra arguments to every command straight through from looper. See [parameterizing pipelines](parameterizing-pipelines.md).
\ No newline at end of file diff --git a/docs/running-on-a-cluster.md new file mode 100644 index 000000000..d00ce2421 --- /dev/null +++ b/docs/running-on-a-cluster.md @@ -0,0 +1,25 @@ +# How to submit looper jobs to a cluster + +By default, `looper` will build a shell script for each sample and then run it sequentially on the local computer. This is convenient for simple cases, but when it comes time to scale up, this is where `looper` really excels. Looper uses a powerful [concentric template system](concentric-templates.md) that enables looper to run jobs on any cluster resource manager (like SLURM, SGE, LSF, etc.) by simply setting up a template for it. The environment templates are managed by [divvy](http://code.databio.org/divvy). + +## Overview and basic example of cluster computing + +To configure `looper` for cluster computing, you just configure divvy. Divvy is automatically installed when you install looper. Briefly, first create a `divvy` computing configuration file using `divvy init`: + +```bash +export DIVCFG="divvy_config.yaml" +divvy init -c $DIVCFG +``` + +Looper will now have access to your computing configuration. You can run `divvy list` to see what compute packages are available in this file. For example, you'll start with a package called 'slurm', which you can use with looper by calling `looper --package slurm`. For many systems (SLURM, SGE, LSF, etc), the default divvy configuration will work out of the box. If you need to tweak things, the template system is flexible and you can configure it to run in any compute environment. That's all there is to it. + +Complete details on how to configure divvy are described in the [divvy documentation](http://divvy.databio.org).
+ +## Divvy config file locations + +By default, looper will look for the divvy configuration file in `$DIVCFG`, but you can override this by specifying a path to another file with the `--divvy` argument, like this: + +```bash +looper --divvy /path/to/env_cfg.yaml ... +``` + diff --git a/docs/usage.md index 126d09a49..5f7c2f61d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,363 +2,360 @@ Looper doesn't just run pipelines; it can also check and summarize the progress of your jobs, as well as remove all files created by them. -Each task is controlled by one of the five main commands `run`, `summarize`, `destroy`, `check`, `clean`, `rerun`. +Each task is controlled by one of the following commands: `run`, `rerun`, `runp`, `table`, `report`, `destroy`, `check`, `clean`, `inspect`, `init` - `looper run`: Runs pipelines for each sample, for each pipeline. This will use your `compute` settings to build and submit scripts to your specified compute environment, or run them sequentially on your local computer. -- `looper summarize`: Summarize your project results. This command parses all key-value results reported in the each sample `stats.tsv` and collates them into a large summary matrix, which it saves in the project output directory. This creates such a matrix for each pipeline type run on the project, and a combined master summary table. +- `looper runp`: Runs project-level pipelines; submits one job per pipeline for the whole project. + +- `looper rerun`: Exactly the same as `looper run`, but only runs jobs with a failed flag. + +- `looper report`: Summarize your project results in the form of browsable HTML pages. + +- `looper table`: This command parses all key-value results reported in each sample's `stats.tsv` and collates them into a large summary matrix, which it saves in the project output directory. This creates such a matrix for each pipeline type run on the project, and a combined master summary table. - `looper check`: Checks the run progress of the current project.
This will display a summary of job status; which pipelines are currently running on which samples, which have completed, which have failed, etc. - `looper destroy`: Deletes all output results for this project. -- `looper rerun`: Exactly the same as `looper run`, but only runs jobs with a failed flag. +- `looper inspect`: Display the project or sample information + +- `looper init`: Initialize a looper dotfile (`.looper.yaml`) in the current directory Here you can see the command-line usage instructions for the main looper command and for each subcommand: ## `looper --help` - ```console -version: 0.11.0 -usage: looper [-h] [-V] [--logfile LOGFILE] [--verbosity {0,1,2,3,4}] [--dbg] - [--env ENV] - {run,rerun,summarize,destroy,check,clean} ... +version: 1.2.0-dev +usage: looper [-h] [--version] [--logfile LOGFILE] [--verbosity {0,1,2,3,4}] + [--dbg] + {run,rerun,runp,table,report,destroy,check,clean,inspect,init} + ... -looper - Loop through samples and submit pipelines. +looper - A project job submission engine and project manager. positional arguments: - {run,rerun,summarize,destroy,check,clean} - run Main Looper function: Submit jobs for samples. - rerun Resubmit jobs with failed flags. - summarize Summarize statistics of project samples. - destroy Remove all files of the project. - check Checks flag status of current runs. - clean Runs clean scripts to remove intermediate files of - already processed jobs. + {run,rerun,runp,table,report,destroy,check,clean,inspect,init} + run Run or submit sample jobs. + rerun Resubmit sample jobs with failed flags. + runp Run or submit project jobs. + table Write summary stats table for project samples. + report Create browsable HTML report of project results. + destroy Remove output files of the project. + check Check flag status of current runs. + clean Run clean scripts of already processed jobs. + inspect Print information about a project. + init Initialize looper dotfile.
optional arguments: -h, --help show this help message and exit - -V, --version show program's version number and exit + --version show program's version number and exit --logfile LOGFILE Optional output file for looper logs (default: None) --verbosity {0,1,2,3,4} Choose level of verbosity (default: None) --dbg Turn on debug mode (default: False) - --env ENV Environment variable that points to the DIVCFG file. - (default: DIVCFG) For subcommand-specific options, type: 'looper -h' https://github.com/pepkit/looper ``` ## `looper run --help` - ```console -version: 0.11.0 -usage: looper run [-h] [--ignore-flags] [-t TIME_DELAY] - [--allow-duplicate-names] [--compute COMPUTE] - [--resources RESOURCES] [--limit LIMIT] [--lump LUMP] - [--lumpn LUMPN] [--file-checks] [-d] - [--selector-attribute SELECTOR_ATTRIBUTE] - [--selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - | --selector-include - [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]]] [--sp SUBPROJECT] - config_file - -Main Looper function: Submit jobs for samples. +usage: looper run [-h] [-i] [-d] [-t S] [-l N] [-x S] [-y S] [-f] [--divvy DIVCFG] [-p P] + [-s S] [-c K [K ...]] [-u X] [-n N] [-g K] [--sel-attr ATTR] + [--sel-excl [E [E ...]] | --sel-incl [I [I ...]]] [-a A [A ...]] + [config_file] + +Run or submit sample jobs. positional arguments: - config_file Project configuration file (YAML). + config_file Project configuration file (YAML) optional arguments: - -h, --help show this help message and exit - --ignore-flags Ignore run status flags? Default: False. By default, - pipelines will not be submitted if a pypiper flag file - exists marking the run (e.g. as 'running' or - 'failed'). Set this option to ignore flags and submit - the runs anyway. Default=False - -t TIME_DELAY, --time-delay TIME_DELAY - Time delay in seconds between job submissions. - --allow-duplicate-names - Allow duplicate names? Default: False. 
By default, - pipelines will not be submitted if a sample name is - duplicated, since samples names should be unique. Set - this option to override this setting. Default=False - --compute COMPUTE YAML file with looper environment compute settings. - --resources RESOURCES - Specification of individual computing resource - settings; separate setting name/key from value with - equals sign, and separate key-value pairs from each - other by comma; e.g., --resources k1=v1,k2=v2 - --limit LIMIT Limit to n samples. - --lump LUMP Maximum total input file size for a lump/batch of - commands in a single job (in GB) - --lumpn LUMPN Number of individual scripts grouped into single - submission - --file-checks Perform input file checks. Default=True. - -d, --dry-run Don't actually submit the project/subproject. - Default=False - --sp SUBPROJECT Name of subproject to use, as designated in the - project's configuration file - -select samples: - This group of arguments lets you specify samples to use by exclusion OR - inclusion of the samples attribute values. - - --selector-attribute SELECTOR_ATTRIBUTE - Specify the attribute for samples exclusion OR - inclusion - --selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - Operate only on samples that either lack this - attribute value or for which this value is not in this - collection. - --selector-include [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]] - Operate only on samples associated with these - attribute values; if not provided, all samples are - used. + -h, --help show this help message and exit + -i, --ignore-flags Ignore run status flags? Default=False + -d, --dry-run Don't actually submit the jobs. 
Default=False + -t S, --time-delay S Time delay in seconds between job submissions + -l N, --limit N Limit to n samples + -x S, --command-extra S String to append to every command + -y S, --command-extra-override S Same as command-extra, but overrides values in PEP + -f, --skip-file-checks Do not perform input file checks + -u X, --lump X Total input file size (GB) to batch into one job + -n N, --lumpn N Number of commands to batch into one job + -a A [A ...], --amend A [A ...] List of amendments to activate + +divvy arguments: + Configure divvy to change computing settings + + --divvy DIVCFG Path to divvy configuration file. Default=$DIVCFG env + variable. Currently: /Users/mstolarczyk/Uczelnia/UVA/ + code//divcfg/uva_rivanna.yaml + -p P, --package P Name of computing resource package to use + -s S, --settings S Path to a YAML settings file with compute settings + -c K [K ...], --compute K [K ...] List of key-value pairs (k1=v1) + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values ``` -## `looper summarize --help` - +## `looper runp --help` ```console -version: 0.11.0 -usage: looper summarize [-h] [--file-checks] [-d] - [--selector-attribute SELECTOR_ATTRIBUTE] - [--selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - | --selector-include - [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]]] - [--sp SUBPROJECT] - config_file +usage: looper runp [-h] [-i] [-d] [-t S] [-l N] [-x S] [-y S] [-f] [--divvy DIVCFG] [-p P] + [-s S] [-c K [K ...]] [-g K] [--sel-attr ATTR] [--sel-excl [E [E ...]] + | --sel-incl [I [I ...]]] [-a A [A ...]] + [config_file] -Summarize statistics of project samples. +Run or submit project jobs. 
positional arguments: - config_file Project configuration file (YAML). + config_file Project configuration file (YAML) optional arguments: - -h, --help show this help message and exit - --file-checks Perform input file checks. Default=True. - -d, --dry-run Don't actually submit the project/subproject. - Default=False - --sp SUBPROJECT Name of subproject to use, as designated in the - project's configuration file - -select samples: - This group of arguments lets you specify samples to use by exclusion OR - inclusion of the samples attribute values. - - --selector-attribute SELECTOR_ATTRIBUTE - Specify the attribute for samples exclusion OR - inclusion - --selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - Operate only on samples that either lack this - attribute value or for which this value is not in this - collection. - --selector-include [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]] - Operate only on samples associated with these - attribute values; if not provided, all samples are - used. + -h, --help show this help message and exit + -i, --ignore-flags Ignore run status flags? Default=False + -d, --dry-run Don't actually submit the jobs. Default=False + -t S, --time-delay S Time delay in seconds between job submissions + -l N, --limit N Limit to n samples + -x S, --command-extra S String to append to every command + -y S, --command-extra-override S Same as command-extra, but overrides values in PEP + -f, --skip-file-checks Do not perform input file checks + -a A [A ...], --amend A [A ...] List of amendments to activate + +divvy arguments: + Configure divvy to change computing settings + + --divvy DIVCFG Path to divvy configuration file. Default=$DIVCFG env + variable. Currently: /Users/mstolarczyk/Uczelnia/UVA/ + code//divcfg/uva_rivanna.yaml + -p P, --package P Name of computing resource package to use + -s S, --settings S Path to a YAML settings file with compute settings + -c K [K ...], --compute K [K ...] 
List of key-value pairs (k1=v1) + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values ``` -## `looper destroy --help` +## `looper rerun --help` +```console +usage: looper rerun [-h] [-i] [-d] [-t S] [-l N] [-x S] [-y S] [-f] [--divvy DIVCFG] + [-p P] [-s S] [-c K [K ...]] [-u X] [-n N] [-g K] [--sel-attr ATTR] + [--sel-excl [E [E ...]] | --sel-incl [I [I ...]]] [-a A [A ...]] + [config_file] +Resubmit sample jobs with failed flags. + +positional arguments: + config_file Project configuration file (YAML) + +optional arguments: + -h, --help show this help message and exit + -i, --ignore-flags Ignore run status flags? Default=False + -d, --dry-run Don't actually submit the jobs. Default=False + -t S, --time-delay S Time delay in seconds between job submissions + -l N, --limit N Limit to n samples + -x S, --command-extra S String to append to every command + -y S, --command-extra-override S Same as command-extra, but overrides values in PEP + -f, --skip-file-checks Do not perform input file checks + -u X, --lump X Total input file size (GB) to batch into one job + -n N, --lumpn N Number of commands to batch into one job + -a A [A ...], --amend A [A ...] List of amendments to activate + +divvy arguments: + Configure divvy to change computing settings + + --divvy DIVCFG Path to divvy configuration file. Default=$DIVCFG env + variable. Currently: /Users/mstolarczyk/Uczelnia/UVA/ + code//divcfg/uva_rivanna.yaml + -p P, --package P Name of computing resource package to use + -s S, --settings S Path to a YAML settings file with compute settings + -c K [K ...], --compute K [K ...] 
List of key-value pairs (k1=v1) + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values +``` + +## `looper report --help` ```console -version: 0.11.0 -usage: looper destroy [-h] [--force-yes] [--file-checks] [-d] - [--selector-attribute SELECTOR_ATTRIBUTE] - [--selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - | --selector-include - [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]]] - [--sp SUBPROJECT] - config_file +usage: looper report [-h] [-g K] [--sel-attr ATTR] [--sel-excl [E [E ...]] | --sel-incl + [I [I ...]]] [-a A [A ...]] + [config_file] -Remove all files of the project. +Create browsable HTML report of project results. positional arguments: - config_file Project configuration file (YAML). + config_file Project configuration file (YAML) optional arguments: - -h, --help show this help message and exit - --force-yes Provide upfront confirmation of destruction intent, to - skip console query. Default=False - --file-checks Perform input file checks. Default=True. - -d, --dry-run Don't actually submit the project/subproject. - Default=False - --sp SUBPROJECT Name of subproject to use, as designated in the - project's configuration file - -select samples: - This group of arguments lets you specify samples to use by exclusion OR - inclusion of the samples attribute values. - - --selector-attribute SELECTOR_ATTRIBUTE - Specify the attribute for samples exclusion OR - inclusion - --selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - Operate only on samples that either lack this - attribute value or for which this value is not in this - collection. 
- --selector-include [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]] - Operate only on samples associated with these - attribute values; if not provided, all samples are - used. + -h, --help show this help message and exit + -a A [A ...], --amend A [A ...] List of amendments to activate + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values ``` -## `looper check --help` +## `looper table --help` +```console +usage: looper table [-h] [-g K] [--sel-attr ATTR] [--sel-excl [E [E ...]] | --sel-incl + [I [I ...]]] [-a A [A ...]] + [config_file] + +Write summary stats table for project samples. + +positional arguments: + config_file Project configuration file (YAML) +optional arguments: + -h, --help show this help message and exit + -a A [A ...], --amend A [A ...] List of amendments to activate + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. 
Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values +``` + +## `looper inspect --help` ```console -version: 0.11.0 -usage: looper check [-h] [-A] [-F [FLAGS [FLAGS ...]]] [--file-checks] [-d] - [--selector-attribute SELECTOR_ATTRIBUTE] - [--selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - | --selector-include - [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]]] - [--sp SUBPROJECT] - config_file +usage: looper inspect [-h] [-n S [S ...]] [-l L] [-g K] [--sel-attr ATTR] + [--sel-excl [E [E ...]] | --sel-incl [I [I ...]]] [-a A [A ...]] + [config_file] -Checks flag status of current runs. +Print information about a project. positional arguments: - config_file Project configuration file (YAML). + config_file Project configuration file (YAML) optional arguments: - -h, --help show this help message and exit - -A, --all-folders Check status for all project's output folders, not - just those for samples specified in the config file - used. Default=False - -F [FLAGS [FLAGS ...]], --flags [FLAGS [FLAGS ...]] - Check on only these flags/status values. - --file-checks Perform input file checks. Default=True. - -d, --dry-run Don't actually submit the project/subproject. - Default=False - --sp SUBPROJECT Name of subproject to use, as designated in the - project's configuration file - -select samples: - This group of arguments lets you specify samples to use by exclusion OR - inclusion of the samples attribute values. - - --selector-attribute SELECTOR_ATTRIBUTE - Specify the attribute for samples exclusion OR - inclusion - --selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - Operate only on samples that either lack this - attribute value or for which this value is not in this - collection. 
- --selector-include [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]] - Operate only on samples associated with these - attribute values; if not provided, all samples are - used. + -h, --help show this help message and exit + -n S [S ...], --snames S [S ...] Name of the samples to inspect + -l L, --attr-limit L Number of sample attributes to display + -a A [A ...], --amend A [A ...] List of amendments to activate + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values ``` -## `looper clean --help` +## `looper init --help` +```console +usage: looper init [-h] [-f] config_file + +Initialize looper dotfile. + +positional arguments: + config_file Project configuration file (YAML) + +optional arguments: + -h, --help show this help message and exit + -f, --force Force overwrite +``` +## `looper destroy --help` ```console -version: 0.11.0 -usage: looper clean [-h] [--force-yes] [--file-checks] [-d] - [--selector-attribute SELECTOR_ATTRIBUTE] - [--selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - | --selector-include - [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]]] - [--sp SUBPROJECT] - config_file +usage: looper destroy [-h] [-d] [--force-yes] [-g K] [--sel-attr ATTR] + [--sel-excl [E [E ...]] | --sel-incl [I [I ...]]] [-a A [A ...]] + [config_file] -Runs clean scripts to remove intermediate files of already processed jobs. +Remove output files of the project. positional arguments: - config_file Project configuration file (YAML). + config_file Project configuration file (YAML) optional arguments: - -h, --help show this help message and exit - --force-yes Provide upfront confirmation of cleaning intent, to - skip console query. 
Default=False - --file-checks Perform input file checks. Default=True. - -d, --dry-run Don't actually submit the project/subproject. - Default=False - --sp SUBPROJECT Name of subproject to use, as designated in the - project's configuration file - -select samples: - This group of arguments lets you specify samples to use by exclusion OR - inclusion of the samples attribute values. - - --selector-attribute SELECTOR_ATTRIBUTE - Specify the attribute for samples exclusion OR - inclusion - --selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - Operate only on samples that either lack this - attribute value or for which this value is not in this - collection. - --selector-include [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]] - Operate only on samples associated with these - attribute values; if not provided, all samples are - used. + -h, --help show this help message and exit + -d, --dry-run Don't actually submit the jobs. Default=False + --force-yes Provide upfront confirmation of destruction intent, to + skip console query. Default=False + -a A [A ...], --amend A [A ...] List of amendments to activate + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values ``` -## `looper rerun --help` +## `looper check --help` +```console +usage: looper check [-h] [-A] [-f [F [F ...]]] [-g K] [--sel-attr ATTR] + [--sel-excl [E [E ...]] | --sel-incl [I [I ...]]] [-a A [A ...]] + [config_file] + +Check flag status of current runs. + +positional arguments: + config_file Project configuration file (YAML) + +optional arguments: + -h, --help show this help message and exit + -A, --all-folders Check status for all output folders, not just for + samples specified in the config. 
Default=False + -f [F [F ...]], --flags [F [F ...]] + Check on only these flags/status values + -a A [A ...], --amend A [A ...] List of amendments to activate + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values +``` +## `looper clean --help` ```console -version: 0.11.0 -usage: looper rerun [-h] [--ignore-flags] [-t TIME_DELAY] - [--allow-duplicate-names] [--compute COMPUTE] - [--resources RESOURCES] [--limit LIMIT] [--lump LUMP] - [--lumpn LUMPN] [--file-checks] [-d] - [--selector-attribute SELECTOR_ATTRIBUTE] - [--selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - | --selector-include - [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]]] - [--sp SUBPROJECT] - config_file - -Resubmit jobs with failed flags. +usage: looper clean [-h] [-d] [--force-yes] [-g K] [--sel-attr ATTR] + [--sel-excl [E [E ...]] | --sel-incl [I [I ...]]] [-a A [A ...]] + [config_file] + +Run clean scripts of already processed jobs. positional arguments: - config_file Project configuration file (YAML). + config_file Project configuration file (YAML) optional arguments: - -h, --help show this help message and exit - --ignore-flags Ignore run status flags? Default: False. By default, - pipelines will not be submitted if a pypiper flag file - exists marking the run (e.g. as 'running' or - 'failed'). Set this option to ignore flags and submit - the runs anyway. Default=False - -t TIME_DELAY, --time-delay TIME_DELAY - Time delay in seconds between job submissions. - --allow-duplicate-names - Allow duplicate names? Default: False. By default, - pipelines will not be submitted if a sample name is - duplicated, since samples names should be unique. Set - this option to override this setting. 
Default=False - --compute COMPUTE YAML file with looper environment compute settings. - --resources RESOURCES - Specification of individual computing resource - settings; separate setting name/key from value with - equals sign, and separate key-value pairs from each - other by comma; e.g., --resources k1=v1,k2=v2 - --limit LIMIT Limit to n samples. - --lump LUMP Maximum total input file size for a lump/batch of - commands in a single job (in GB) - --lumpn LUMPN Number of individual scripts grouped into single - submission - --file-checks Perform input file checks. Default=True. - -d, --dry-run Don't actually submit the project/subproject. - Default=False - --sp SUBPROJECT Name of subproject to use, as designated in the - project's configuration file - -select samples: - This group of arguments lets you specify samples to use by exclusion OR - inclusion of the samples attribute values. - - --selector-attribute SELECTOR_ATTRIBUTE - Specify the attribute for samples exclusion OR - inclusion - --selector-exclude [SELECTOR_EXCLUDE [SELECTOR_EXCLUDE ...]] - Operate only on samples that either lack this - attribute value or for which this value is not in this - collection. - --selector-include [SELECTOR_INCLUDE [SELECTOR_INCLUDE ...]] - Operate only on samples associated with these - attribute values; if not provided, all samples are - used. + -h, --help show this help message and exit + -d, --dry-run Don't actually submit the jobs. Default=False + --force-yes Provide upfront confirmation of destruction intent, to + skip console query. Default=False + -a A [A ...], --amend A [A ...] List of amendments to activate + +sample selection arguments: + Specify samples to include or exclude based on sample attribute values + + -g K, --toggle-key K Sample attribute specifying toggle. 
Default: toggle + --sel-attr ATTR Attribute for sample exclusion OR inclusion + --sel-excl [E [E ...]] Exclude samples with these values + --sel-incl [I [I ...]] Include only samples with these values ``` diff --git a/docs/usage.template b/docs/usage.template index 4a7bbec89..26d1ea7ff 100644 --- a/docs/usage.template +++ b/docs/usage.template @@ -2,17 +2,25 @@ Looper doesn't just run pipelines; it can also check and summarize the progress of your jobs, as well as remove all files created by them. -Each task is controlled by one of the five main commands `run`, `summarize`, `destroy`, `check`, `clean`, `rerun`. +Each task is controlled by one of the following commands: `run`, `rerun`, `runp`, `table`, `report`, `destroy`, `check`, `clean`, `inspect`, `init` - `looper run`: Runs pipelines for each sample, for each pipeline. This will use your `compute` settings to build and submit scripts to your specified compute environment, or run them sequentially on your local computer. -- `looper summarize`: Summarize your project results. This command parses all key-value results reported in the each sample `stats.tsv` and collates them into a large summary matrix, which it saves in the project output directory. This creates such a matrix for each pipeline type run on the project, and a combined master summary table. +- `looper runp`: Runs project-level pipelines; submits one job per pipeline for the whole project. + +- `looper rerun`: Exactly the same as `looper run`, but only runs jobs with a failed flag. + +- `looper report`: Summarize your project results in the form of browsable HTML pages. + +- `looper table`: This command parses all key-value results reported in each sample's `stats.tsv` and collates them into a large summary matrix, which it saves in the project output directory. This creates such a matrix for each pipeline type run on the project, and a combined master summary table. - `looper check`: Checks the run progress of the current project.
This will display a summary of job status; which pipelines are currently running on which samples, which have completed, which have failed, etc. - `looper destroy`: Deletes all output results for this project. -- `looper rerun`: Exactly the same as `looper run`, but only runs jobs with a failed flag. +- `looper inspect`: Display the Project or Sample information + +- `looper init`: Initialize a looper dotfile (`.looper.yaml`) in the current directory. Here you can see the command-line usage instructions for the main looper command and for each subcommand: diff --git a/docs/variable-namespaces.md b/docs/variable-namespaces.md new file mode 100644 index 000000000..695da06f6 --- /dev/null +++ b/docs/variable-namespaces.md @@ -0,0 +1,115 @@ +# Looper variable namespaces + +## Populating the templates + +Looper creates job scripts using [concentric templates](concentric-templates.md) consisting of a *command template* and a *submission template*. This layered design allows us to decouple the computing environment from the pipeline, which improves portability. The task of running jobs can be thought of as simply populating the templates with variables. To do this, Looper pools variables from several sources: + +1. the command line, where the user provides any on-the-fly variables for a particular run. +2. the PEP, which provides information on the project and samples. +3. the pipeline interface, which provides information on the pipeline to run. +4. the divvy config file, which provides information on the computing environment. + +Variables from these sources are used to populate the templates to construct the commands to run. To keep things organized, looper groups the variables into namespaces. These namespaces are used first to populate the command template, which produces a built command. This command is then treated as a variable in itself, which is pooled with the other variables to populate the submission template. 
Looper provides 6 variable namespaces for populating the templates: + +## 1. project +The `project` namespace contains all PEP config attributes. For example, if you have a config file like this: + +``` +pep_version: 2.0.0 +my_variable: 123 +``` + +Then `project.my_variable` would have value `123`. You can use the project namespace to refer to any information in the project. You can use `project.looper` to refer to any attributes in the `looper` section of the PEP. + +## 2. sample or samples + +For sample-level pipelines, the `sample` namespace contains all PEP post-processing sample attributes for the given sample. For project-level pipelines, looper constructs a single job for an entire project, so there is no `sample` namespace; instead, there is a `samples` (plural) namespace, which is a list of all the samples in the project. This can be useful if you need to iterate through all the samples in your command template. + +## 3. pipeline + +Everything under `pipeline` in the pipeline interface for this pipeline. This simply provides a convenient way to annotate pipeline-level variables for use in templates. + +## 4. looper + +The `looper` namespace consists of automatic variables created by looper: + +**paths:** + +- `output_dir` -- parent output directory provided in `project.looper.output_dir` in the project configuration file +- `results_subdir` -- the path to the results directory. It is a sub directory of `output_dir` called `project.looper.results_subdir` or "results_pipeline" by default +- `sample_output_folder` -- a sample-specific output folder (`results_subdir`/`sample.sample_name`) + +**others:** + +- `total_input_size` -- the sum of file sizes for all files marked as input files in the input schema +- `pipeline_config` -- renamed from `config` to disambiguate with new `pep_config` ? 
Not sure what this is +- `pep_config` -- path to the project configuration file used for this looper run +- `log_file` -- an automatically created log file path, to be stored in the looper submission subdirectory +- `command` -- the result of populating the command template +- `job_name` -- job name made by concatenating the pipeline identifier and unique sample name + +The `looper.command` value is what enables the two-layer template system, whereby the output of the command template is used as input to the submission template. + +## 5. compute + +The `compute` namespace consists of a group of variables relevant for computing resources. The `compute` namespace has a unique behavior: it aggregates variables from several sources in a priority order, overriding values with more specific ones as priority increases. The list of variable sources in priority order is: + +1. Looper CLI (`--compute` or `--settings` for on-the-fly settings) +2. PEP config, `project.looper.compute` section +3. Pipeline interface, `pipeline.compute` section +4. Activated divvy compute package (`--package` CLI argument) + +So, the compute namespace is first populated with any variables from the selected divvy compute package. It then updates this with settings given in the `compute` section of the pipeline interface. It then updates from the PEP `project.looper.compute`, and then finally anything passed to `--compute` on the looper CLI. This provides a way to modulate looper behavior at the level of a computing environment, a pipeline, a project, or a run, in that order. + + +## Mapping variables to submission templates using divvy adapters + +One remaining issue is how to map variables from the looper variable namespaces onto the variables used in divvy templates. Divvy is decoupled from looper, and its templates are completely customizable, so they do not necessarily understand how to connect to looper variables into divvy templates. 
The default divvy templates use variables like `{CODE}`, `{JOBNAME}`, and `{LOGFILE}`, among others. A user may customize or rename these, or add custom variable names in divvy templates. How do we map the looper variables onto these arbitrary divvy template variables? Through divvy adapters. + +These variables are linked to looper namespaces via *divvy adapters*. Here are the default divvy adapters: + +``` +adapters: + CODE: looper.command + JOBNAME: looper.job_name + CORES: compute.cores + LOGFILE: looper.log_file + TIME: compute.time + MEM: compute.mem + DOCKER_ARGS: compute.docker_args + DOCKER_IMAGE: compute.docker_image + SINGULARITY_IMAGE: compute.singularity_image + SINGULARITY_ARGS: compute.singularity_args +``` + +The divvy adapters section in the divvy configuration file links each divvy template variable (left side) to any other arbitrary variable name (right side). In this example, we've populated the adapters with links to the namespaced input variables provided by looper (right side). You can adjust this section in your configuration file to map any variables into your submission template. + +## Best practices on storing compute variables + +Since compute variables can be stored in several places, it can be confusing to know where you should put things. Here are some guidelines: + +### Partition or queue name + +Because the partition or queue name is relative to your environment, we don't usually specify this in the `resources` section, but rather, in the `pepenv` config. + +### DIVCFG config file + +Variables that describe settings of a **compute environment** should go in the `DIVCFG` file. Any attributes in the activated compute package will be available to populate template variables. For example, the `partition` attribute is specified in many of our default `DIVCFG` files; that attribute is used to populate a template `{PARTITION}` variable. 
This is what enables pipelines to work in any compute environment, since we have no control over what your partitions are named. You can also use this to change SLURM queues on-the-fly. + +### Pipeline interface + +Variables that are **specific to a pipeline** can be defined in the `pipeline interface` file, `compute` section. As an example of a variable pulled from the `compute` section, we defined in our `pipeline_interface.yaml` a variable pointing to the singularity or docker image that can be used to run the pipeline, like this: + +``` +compute: + singularity_image: /absolute/path/to/images/image +``` + +Now, this variable will be available for use in a template as `{SINGULARITY_IMAGE}`. This makes sense to put in the pipeline interface because it is specific to this pipeline. This path should probably be absolute, because a relative path will be interpreted as relative to the working directory where your job is executed (*not* relative to the pipeline interface). This section is also useful for adjusting the amount of resources we need to request from a resource manager like SLURM. For example: `{MEM}`, `{CORES}`, and `{TIME}` are all defined frequently in this section, and they vary for different input file sizes. + +### Project config + +Finally, project-level variables can also be populated from the `compute` section of a project config file. This would enable you to make project-specific compute changes (such as billing a particular project to a particular SLURM resource account). + + + diff --git a/docs/writing-a-pipeline-interface.md b/docs/writing-a-pipeline-interface.md new file mode 100644 index 000000000..10f0356ab --- /dev/null +++ b/docs/writing-a-pipeline-interface.md @@ -0,0 +1,32 @@ +--- +title: Pipeline interface specification +--- + +# Writing a pipeline interface + +## Introduction + +If you want to use looper to run samples in a PEP through an arbitrary shell command, you will need to write a pipeline interface. 
Here is a basic walkthrough to write a simple interface file. Once you've been through this, you can consult the formal [pipeline interface format specification](pipeline-interface-specification.md) for further details and reference. + +## Example + +Let's start with a simple example from the [hello_looper repository](https://github.com/pepkit/hello_looper): + +```yaml +pipeline_name: count_lines +pipeline_type: sample +path: count_lines.sh # relative to this pipeline_interface.yaml file +command_template: {pipeline.path} {sample.file} +``` + +You can edit this to start your own interface. + +First, think of a unique name for your pipeline and put it in `pipeline_name`. This will be used for messaging and identification. + +Next, choose a `pipeline_type`, which can be either "sample" or "project". Most likely, you're writing a sample pipeline, but you can read more about [sample and project pipelines](pipeline-tiers.md) if you like. + +Next, we need to set the `path` to our script. This path is relative to the pipeline interface file, so you need to put the pipeline interface somewhere specific relative to the pipeline; perhaps in the same folder or in a parent folder. + +Finally, populate the `command_template`. You can use the full power of Jinja2 Python templates here, but most likely you'll just need to use a few variables using curly braces. In this case, we refer to the `count_lines.sh` script with `{pipeline.path}`, which points directly to the `path` variable defined above. Then, we use `{sample.file}` to refer to the `file` column in the sample table specified in the PEP. This pipeline thus takes a single positional command-line argument. You can make the command template much more complicated and refer to any sample or project attributes, as well as a bunch of [other variables made available by looper](variable-namespaces.md). + +Now, you have a basic functional pipeline interface. 
There are many more advanced features you can use to make your pipeline more powerful, such as providing a schema to specify inputs or outputs, making input-size-dependent compute settings, and more. For complete details, consult the formal [pipeline interface format specification](pipeline-interface-specification.md). \ No newline at end of file diff --git a/docs_jupyter/hello-world.ipynb b/docs_jupyter/hello-world.ipynb index 45dc9dd49..478b7da10 100644 --- a/docs_jupyter/hello-world.ipynb +++ b/docs_jupyter/hello-world.ipynb @@ -2,10 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Hello World! example for looper\n", "\n", @@ -14,7 +11,7 @@ "## 1. Install the latest version of looper:\n", "\n", "```console\n", - "pip install --user --upgrade https://github.com/pepkit/looper/zipball/master\n", + "pip install --user --upgrade looper\n", "```\n", "\n", "## 2. Download and unzip the hello_looper repository\n", @@ -24,32 +21,28 @@ }, { "cell_type": "code", - "execution_count": 1, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 7, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2019-04-24 08:35:57-- https://github.com/pepkit/hello_looper/archive/master.zip\n", - "Resolving github.com (github.com)... 192.30.253.112, 192.30.253.113\n", - "Connecting to github.com (github.com)|192.30.253.112|:443... connected.\n", + "--2020-05-21 08:23:43-- https://github.com/pepkit/hello_looper/archive/master.zip\n", + "Resolving github.com (github.com)... 140.82.112.4\n", + "Connecting to github.com (github.com)|140.82.112.4|:443... connected.\n", "HTTP request sent, awaiting response... 
302 Found\n", "Location: https://codeload.github.com/pepkit/hello_looper/zip/master [following]\n", - "--2019-04-24 08:35:57-- https://codeload.github.com/pepkit/hello_looper/zip/master\n", - "Resolving codeload.github.com (codeload.github.com)... 192.30.253.120, 192.30.253.121\n", - "Connecting to codeload.github.com (codeload.github.com)|192.30.253.120|:443... connected.\n", + "--2020-05-21 08:23:43-- https://codeload.github.com/pepkit/hello_looper/zip/master\n", + "Resolving codeload.github.com (codeload.github.com)... 140.82.114.10\n", + "Connecting to codeload.github.com (codeload.github.com)|140.82.114.10|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: unspecified [application/zip]\n", "Saving to: ‘master.zip’\n", "\n", - "master.zip [ <=> ] 5.24K --.-KB/s in 0.005s \n", + "master.zip [ <=> ] 5.20K --.-KB/s in 0.004s \n", "\n", - "2019-04-24 08:35:57 (981 KB/s) - ‘master.zip’ saved [5366]\n", + "2020-05-21 08:23:44 (1.25 MB/s) - ‘master.zip’ saved [5328]\n", "\n" ] } @@ -60,19 +53,15 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 8, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Archive: master.zip\r\n", - "47b9584b59841d54418699aafc8d8d13f201dac3\r\n", + "c8c4088d6e14df05071fb99809dfc86b2a55d86a\r\n", " creating: hello_looper-master/\r\n", " inflating: hello_looper-master/README.md \r\n", " creating: hello_looper-master/data/\r\n", @@ -95,10 +84,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## 3. 
Run it\n", "\n", @@ -107,63 +93,46 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, - "outputs": [], - "source": [ - "!cd hello_looper-master" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "execution_count": 9, + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Command: run (Looper version: 0.11.0)\r\n", - "Traceback (most recent call last):\r\n", - " File \"/home/nsheff/.local/bin/looper\", line 10, in \r\n", - " sys.exit(main())\r\n", - " File \"/home/nsheff/.local/lib/python3.5/site-packages/looper/looper.py\", line 802, in main\r\n", - " determine_config_path(args.config_file), subproject=args.subproject,\r\n", - " File \"/home/nsheff/.local/lib/python3.5/site-packages/looper/utils.py\", line 104, in determine_config_path\r\n", - " raise ValueError(\"Path doesn't exist: {}\".format(root))\r\n", - "ValueError: Path doesn't exist: project/project_config.yaml\r\n", + "Looper version: 1.2.0-dev\r\n", + "Command: run\r\n", + "Ignoring invalid pipeline interface source: ../pipeline/pipeline_interface.yaml. 
Caught exception: FileNotFoundError(2, 'No such file or directory')\r\n", + "> Not submitted: No pipeline interfaces defined\r\n", + "> Not submitted: No pipeline interfaces defined\r\n", + "\r\n", + "Looper finished\r\n", + "Samples valid for job generation: 0 of 2\r\n", + "Commands submitted: 0 of 0\r\n", + "Jobs submitted: 0\r\n", + "\r\n", + "1 unique reasons for submission failure: No pipeline interfaces defined\r\n", + "\r\n", + "Summary of failures:\r\n", + "\u001b[91mNo pipeline interfaces defined\u001b[0m: frog_2, frog_1\r\n", "\u001b[0m" ] } ], "source": [ - "!looper run project/project_config.yaml" + "!looper run hello_looper-master/project/project_config.yaml" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Voila! You've run your very first pipeline across multiple samples using `looper`!" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "# Exploring the results\n", "\n", @@ -173,11 +142,7 @@ { "cell_type": "code", "execution_count": 5, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -203,25 +168,19 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "These are:\n", "\n", " * `/data` -- contains 2 data files for 2 samples. These input files were each passed to the pipeline.\n", " * `/pipeline` -- contains the script we want to run on each sample in our project. Our pipeline is a very simple shell script named `count_lines.sh`, which (duh!) counts the number of lines in an input file.\n", - " * `/project` -- contains 2 files that describe metadata for the project (`project_config.yaml`) and the samples (`sample_annotation.csv`). This particular project describes just two samples listed in the annotation file. 
These files together make up a [PEP](http://pepkit.github.io)-formatted project, and can therefore be read by any PEP-compatible tool, including `looper`.\n", + " * `/project` -- contains 2 files that describe metadata for the project (`project_config.yaml`) and the samples (`sample_annotation.csv`). This particular project describes just two samples listed in the annotation file. These files together make up a [PEP](http://pep.databio.org)-formatted project, and can therefore be read by any PEP-compatible tool, including `looper`.\n", "\n" ] }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "\n", "When we invoke `looper` from the command line we told it to `run project/project_config.yaml`. `looper` reads the [project/project_config.yaml](https://github.com/pepkit/hello_looper/blob/master/project/project_config.yaml) file, which points to a few things:\n", @@ -235,26 +194,19 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "\n", "\n", "## Pipeline outputs\n", "\n", - "Outputs of pipeline runs will be under the directory specified in the `output_dir` variable under the `paths` section in the project config file (see the [config files page](config-files.md)). Let's inspect that `project_config.yaml` file to see what it says under `output_dir`:\n" + "Outputs of pipeline runs will be under the directory specified in the `output_dir` variable under the `paths` section in the project config file (see [defining a project](defining-a-project.md)). 
Let's inspect that `project_config.yaml` file to see what it says under `output_dir`:\n" ] }, { "cell_type": "code", "execution_count": 6, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -273,10 +225,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "Alright, next let's explore what this pipeline stuck into our `output_dir`:\n" ] @@ -284,11 +233,7 @@ { "cell_type": "code", "execution_count": 7, - "metadata": { - "collapsed": false, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -314,10 +259,7 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "\n", "Inside of an `output_dir` there will be two directories:\n", @@ -330,15 +272,12 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "\n", "## A few more basic looper options\n", "\n", - "Looper also provides a few other simple arguments that let you adjust what it does. You can find a [complete reference of usage](usage) in the docs. Here are a few of the more common options:\n", + "Looper also provides a few other simple arguments that let you adjust what it does. You can find a [complete reference of usage](usage.md) in the docs. Here are a few of the more common options:\n", "\n", "For `looper run`:\n", "\n", @@ -356,38 +295,31 @@ }, { "cell_type": "markdown", - "metadata": { - "deletable": true, - "editable": true - }, + "metadata": {}, "source": [ "## On your own\n", "\n", - "To use `looper` on your own, you will need to prepare 2 things: a **project** (metadata that define *what* you want to process), and **pipelines** (*how* to process data). \n", - "The next sections define these:\n", - "\n", - "1. **Project**. 
To link your project to `looper`, you will need to [define your project](define-your-project.md) using PEP format. \n", - "2. **Pipelines**. You will want to either use pre-made `looper`-compatible pipelines or link your own custom-built pipelines. Read how to [connect your pipeline](linking-a-pipeline.md) to `looper`.\n" + "To use `looper` on your own, you will need to prepare 2 things: a **project** (metadata that define *what* you want to process), and **pipelines** (*how* to process data). To link your project to `looper`, you will need to [define a project](defining-a-project.md). You will want to either use pre-made `looper`-compatible pipelines or link your own custom-built pipelines. These docs will also show you how to connect your pipeline to your project.\n" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 2", + "display_name": "Python 3", "language": "python", - "name": "python2" + "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" + "pygments_lexer": "ipython3", + "version": "3.7.5" } }, "nbformat": 4, diff --git a/looper/__init__.py b/looper/__init__.py index c61430775..aab2f56bc 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -8,13 +8,14 @@ """ import argparse +import os import logging from .conductor import SubmissionConductor from .pipeline_interface import PipelineInterface from .project import Project -from .sample import Sample from ._version import __version__ from .parser_types import * +from .const import * from ubiquerg import VersionInHelpParser from divvy import DEFAULT_COMPUTE_RESOURCES_NAME, NEW_COMPUTE_KEY as COMPUTE_KEY @@ -22,15 +23,8 @@ # looper, so that other modules within this package need not worry about # the locations of some of the peppy declarations. 
Effectively, concentrate # the connection between peppy and looper here, to the extent possible. -from peppy import \ - FLAGS, IMPLICATIONS_DECLARATION, SAMPLE_INDEPENDENT_PROJECT_SECTIONS, \ - SAMPLE_NAME_COLNAME -__all__ = ["Project", "PipelineInterface", "Sample", "SubmissionConductor"] - - -GENERIC_PROTOCOL_KEY = "*" -LOGGING_LEVEL = "INFO" +__all__ = ["Project", "PipelineInterface", "SubmissionConductor"] # Descending by severity for correspondence with logic inversion. # That is, greater verbosity setting corresponds to lower logging level. @@ -40,11 +34,13 @@ class _StoreBoolActionType(argparse.Action): """ - Enables the storage of a boolean const and custom type definition needed for systematic html interface generation. - To get the _StoreTrueAction output use default=False in the add_argument function + Enables the storage of a boolean const and custom type definition needed + for systematic html interface generation. To get the _StoreTrueAction + output use default=False in the add_argument function and default=True to get _StoreFalseAction output. """ - def __init__(self, option_strings, dest, type, default, required=False, help=None): + def __init__(self, option_strings, dest, type, default, + required=False, help=None): super(_StoreBoolActionType, self).__init__( option_strings=option_strings, dest=dest, @@ -65,157 +61,211 @@ def build_parser(): :return argparse.ArgumentParser """ - # Main looper program help text messages - banner = "%(prog)s - Loop through samples and submit pipelines." - additional_description = "For subcommand-specific options, type: '%(prog)s -h'" + banner = "%(prog)s - A project job submission engine and project manager." 
+ additional_description = "For subcommand-specific options, " \ + "type: '%(prog)s -h'" additional_description += "\nhttps://github.com/pepkit/looper" - parser = VersionInHelpParser(prog="looper", description=banner, epilog=additional_description, version=__version__) - - # Logging control - parser.add_argument( - "--logfile", dest="logfile", - help="Optional output file for looper logs (default: %(default)s)") - parser.add_argument( - "--verbosity", dest="verbosity", - type=int, choices=range(len(_LEVEL_BY_VERBOSITY)), - help="Choose level of verbosity (default: %(default)s)") - parser.add_argument( - "--logging-level", dest="logging_level", - help=argparse.SUPPRESS) - parser.add_argument( - "--dbg", dest="dbg", action="store_true", - help="Turn on debug mode (default: %(default)s)") - parser.add_argument( - "--env", dest="env", - default=None, - help="Environment variable that points to the DIVCFG file. (default: DIVCFG)") - - # Individual subcommands - msg_by_cmd = { - "run": "Main Looper function: Submit jobs for samples.", - "rerun": "Resubmit jobs with failed flags.", - "summarize": "Summarize statistics of project samples.", - "destroy": "Remove all files of the project.", - "check": "Checks flag status of current runs.", - "clean": "Runs clean scripts to remove intermediate " - "files of already processed jobs."} - - subparsers = parser.add_subparsers(dest="command") - - def add_subparser(cmd): - message = msg_by_cmd[cmd] - return subparsers.add_parser(cmd, description=message, help=message) - - # Run and rerun command - run_subparser = add_subparser("run") - rerun_subparser = add_subparser("rerun") - for subparser in [run_subparser, rerun_subparser]: - subparser.add_argument( - "--ignore-flags", dest="ignore_flags", default=False, - action=_StoreBoolActionType, type=html_checkbox(checked=False), - help="Ignore run status flags? Default: False. " - "By default, pipelines will not be submitted if a pypiper " - "flag file exists marking the run (e.g. 
as " - "'running' or 'failed'). Set this option to ignore flags " - "and submit the runs anyway. Default=False") - subparser.add_argument( - "-t", "--time-delay", dest="time_delay", - type=html_range(min_val=0, max_val=30, value=0), default=0, - help="Time delay in seconds between job submissions.") - subparser.add_argument( - "--allow-duplicate-names", default=False, - action=_StoreBoolActionType, type=html_checkbox(checked=False), - help="Allow duplicate names? Default: False. " - "By default, pipelines will not be submitted if a sample name" - " is duplicated, since samples names should be unique. " - " Set this option to override this setting. Default=False") - subparser.add_argument( - "--compute-package", dest=COMPUTE_KEY, - default=DEFAULT_COMPUTE_RESOURCES_NAME, - help="Name of computing resource package to use") - subparser.add_argument( - "--resources", - help="Specification of individual computing resource settings; " - "separate setting name/key from value with equals sign, " - "and separate key-value pairs from each other by comma; " - "e.g., --resources k1=v1,k2=v2") - subparser.add_argument( - "--limit", dest="limit", default=None, - type=html_range(min_val=1, max_val="num_samples", value="num_samples"), - help="Limit to n samples.") - # Note that defaults for otherwise numeric lump parameters are set to - # null by default so that the logic that parses their values may - # distinguish between explicit 0 and lack of specification. 
- subparser.add_argument( - "--lump", default=None, - type=html_range(min_val=0, max_val=100, step=0.1, value=0), - help="Maximum total input file size for a lump/batch of commands " - "in a single job (in GB)") - subparser.add_argument( - "--lumpn", default=None, - type=html_range(min_val=1, max_val="num_samples", value=1), - help="Number of individual scripts grouped into single submission") - - # Other commands - summarize_subparser = add_subparser("summarize") - destroy_subparser = add_subparser("destroy") - check_subparser = add_subparser("check") - clean_subparser = add_subparser("clean") - - check_subparser.add_argument( - "-A", "--all-folders", action=_StoreBoolActionType, default=False, type=html_checkbox(checked=False), - help="Check status for all project's output folders, not just " - "those for samples specified in the config file used. Default=False") - check_subparser.add_argument( - "-F", "--flags", nargs='*', default=FLAGS, type=html_select(choices=FLAGS), - help="Check on only these flags/status values.") - - destroy_subparser.add_argument( - "--force-yes", action=_StoreBoolActionType, default=False, type=html_checkbox(checked=False), - help="Provide upfront confirmation of destruction intent, " - "to skip console query. Default=False") - - clean_subparser.add_argument( - "--force-yes", action=_StoreBoolActionType, default=False, type=html_checkbox(checked=False), - help="Provide upfront confirmation of cleaning intent, " - "to skip console query. Default=False") - - # Common arguments - for subparser in [run_subparser, rerun_subparser, summarize_subparser, - destroy_subparser, check_subparser, clean_subparser]: - subparser.add_argument( - "config_file", - help="Project configuration file (YAML).") - subparser.add_argument( - "--file-checks", dest="file_checks", - action=_StoreBoolActionType, default=True, type=html_checkbox(checked=True), - help="Perform input file checks. 
Default=True.") - subparser.add_argument( - "-d", "--dry-run", dest="dry_run", - action=_StoreBoolActionType, default=False, type=html_checkbox(checked=False), - help="Don't actually submit the project/subproject. Default=False") - - fetch_samples_group = \ - subparser.add_argument_group("select samples", - "This group of arguments lets you specify samples to use by " - "exclusion OR inclusion of the samples attribute values.") - fetch_samples_group.add_argument("--selector-attribute", dest="selector_attribute", - help="Specify the attribute for samples exclusion OR inclusion", - default="protocol") - protocols = fetch_samples_group.add_mutually_exclusive_group() - protocols.add_argument( - "--selector-exclude", nargs='*', dest="selector_exclude", - help="Operate only on samples that either lack this attribute value or " - "for which this value is not in this collection.") - protocols.add_argument( - "--selector-include", nargs='*', dest="selector_include", - help="Operate only on samples associated with these attribute values;" - " if not provided, all samples are used.") - subparser.add_argument( - "--sp", dest="subproject", - help="Name of subproject to use, as designated in the " - "project's configuration file") - - return parser + parser = VersionInHelpParser( + prog="looper", description=banner, epilog=additional_description, + version=__version__) + + aux_parser = VersionInHelpParser( + prog="looper", description=banner, epilog=additional_description, + version=__version__) + result = [] + for parser in [parser, aux_parser]: + # Logging control + parser.add_argument( + "--logfile", help="Optional output file for looper logs " + "(default: %(default)s)") + parser.add_argument( + "--verbosity", type=int, choices=range(len(_LEVEL_BY_VERBOSITY)), + help="Choose level of verbosity (default: %(default)s)") + parser.add_argument( + "--logging-level", help=argparse.SUPPRESS) + parser.add_argument( + "--dbg", action="store_true", + help="Turn on debug mode (default: 
%(default)s)") + # Individual subcommands + msg_by_cmd = { + "run": "Run or submit sample jobs.", + "rerun": "Resubmit sample jobs with failed flags.", + "runp": "Run or submit project jobs.", + "table": "Write summary stats table for project samples.", + "report": "Create browsable HTML report of project results.", + "destroy": "Remove output files of the project.", + "check": "Check flag status of current runs.", + "clean": "Run clean scripts of already processed jobs.", + "inspect": "Print information about a project.", + "init": "Initialize looper dotfile." + } + + subparsers = parser.add_subparsers(dest="command") + + def add_subparser(cmd): + message = msg_by_cmd[cmd] + return subparsers.add_parser(cmd, description=message, help=message, + formatter_class=lambda prog: argparse.HelpFormatter( + prog, max_help_position=37, width=90)) + + # Run and rerun command + run_subparser = add_subparser("run") + rerun_subparser = add_subparser("rerun") + collate_subparser = add_subparser("runp") + table_subparser = add_subparser("table") + report_subparser = add_subparser("report") + destroy_subparser = add_subparser("destroy") + check_subparser = add_subparser("check") + clean_subparser = add_subparser("clean") + inspect_subparser = add_subparser("inspect") + init_subparser = add_subparser("init") + + # Flag arguments + #################################################################### + for subparser in [run_subparser, rerun_subparser, collate_subparser]: + subparser.add_argument( + "-i", "--ignore-flags", default=False, + action=_StoreBoolActionType, type=html_checkbox(checked=False), + help="Ignore run status flags? Default=False") + + for subparser in [run_subparser, rerun_subparser, destroy_subparser, + clean_subparser, collate_subparser]: + subparser.add_argument( + "-d", "--dry-run", + action=_StoreBoolActionType, default=False, + type=html_checkbox(checked=False), + help="Don't actually submit the jobs. 
Default=False") + + # Parameter arguments + #################################################################### + for subparser in [run_subparser, rerun_subparser, collate_subparser]: + subparser.add_argument( + "-t", "--time-delay", metavar="S", + type=html_range(min_val=0, max_val=30, value=0), default=0, + help="Time delay in seconds between job submissions") + subparser.add_argument( + "-l", "--limit", default=None, metavar="N", + type=html_range(min_val=1, max_val="num_samples", + value="num_samples"), + help="Limit to n samples") + subparser.add_argument( + "-x", "--command-extra", default="", + metavar="S", help="String to append to every command") + subparser.add_argument( + "-y", "--command-extra-override", metavar="S", default="", + help="Same as command-extra, but overrides values in PEP") + subparser.add_argument( + "-f", "--skip-file-checks", + action=_StoreBoolActionType, default=False, + type=html_checkbox(checked=False), + help="Do not perform input file checks") + + divvy_group = \ + subparser.add_argument_group( + "divvy arguments", + "Configure divvy to change computing settings") + divvy_group.add_argument( + "--divvy", default=None, metavar="DIVCFG", + help="Path to divvy configuration file. Default=$DIVCFG env " + "variable. 
Currently: {}".format(os.getenv('DIVCFG', None) + or "not set")) + divvy_group.add_argument( + "-p", "--package", metavar="P", + help="Name of computing resource package to use") + divvy_group.add_argument( + "-s", "--settings", default="", metavar="S", + help="Path to a YAML settings file with compute settings") + divvy_group.add_argument( + "-c", "--compute", metavar="K", nargs="+", + help="List of key-value pairs (k1=v1)") + + for subparser in [run_subparser, rerun_subparser]: + subparser.add_argument( + "-u", "--lump", default=None, metavar="X", + type=html_range(min_val=0, max_val=100, step=0.1, value=0), + help="Total input file size (GB) to batch into one job") + subparser.add_argument( + "-n", "--lumpn", default=None, metavar="N", + type=html_range(min_val=1, max_val="num_samples", value=1), + help="Number of commands to batch into one job") + + inspect_subparser.add_argument( + "-n", "--snames", required=False, nargs="+", metavar="S", + help="Name of the samples to inspect") + inspect_subparser.add_argument( + "-l", "--attr-limit", required=False, type=int, default=10, + metavar="L", help="Number of sample attributes to display") + + check_subparser.add_argument( + "-A", "--all-folders", action=_StoreBoolActionType, + default=False, type=html_checkbox(checked=False), + help="Check status for all output folders, not just for " + "samples specified in the config. Default=False") + check_subparser.add_argument( + "-f", "--flags", nargs='*', default=FLAGS, + type=html_select(choices=FLAGS), metavar="F", + help="Check on only these flags/status values") + + for subparser in [destroy_subparser, clean_subparser]: + subparser.add_argument( + "--force-yes", action=_StoreBoolActionType, default=False, + type=html_checkbox(checked=False), + help="Provide upfront confirmation of destruction intent, " + "to skip console query. 
Default=False") + + init_subparser.add_argument("config_file", help="Project configuration " + "file (YAML)") + + init_subparser.add_argument("-f", "--force", help="Force overwrite", + action="store_true", default=False) + + # Common arguments + for subparser in [run_subparser, rerun_subparser, table_subparser, + report_subparser, destroy_subparser, check_subparser, + clean_subparser, collate_subparser, inspect_subparser]: + subparser.add_argument("config_file", nargs="?", default=None, + help="Project configuration file (YAML)") + # help="Path to the output directory" + subparser.add_argument("-o", "--output-dir", metavar="DIR", + help=argparse.SUPPRESS) + # "Submission subdirectory name" + subparser.add_argument("--submission-subdir", metavar="DIR", + help=argparse.SUPPRESS) + # "Results subdirectory name" + subparser.add_argument("--results-subdir", metavar="DIR", + help=argparse.SUPPRESS) + # "Sample attribute for pipeline interface sources" + subparser.add_argument("--pipeline-interfaces-key", metavar="K", + help=argparse.SUPPRESS) + # "Paths to pipeline interface files" + subparser.add_argument("--pipeline-interfaces", metavar="P", + nargs="+", action="append", + help=argparse.SUPPRESS) + + for subparser in [run_subparser, rerun_subparser, table_subparser, + report_subparser, destroy_subparser, check_subparser, + clean_subparser, collate_subparser, inspect_subparser]: + fetch_samples_group = \ + subparser.add_argument_group( + "sample selection arguments", + "Specify samples to include or exclude based on sample attribute values") + fetch_samples_group.add_argument( + "-g", "--toggle-key", metavar="K", + help="Sample attribute specifying toggle. 
Default: toggle") + fetch_samples_group.add_argument( + "--sel-attr", default="toggle", metavar="ATTR", + help="Attribute for sample exclusion OR inclusion") + protocols = fetch_samples_group.add_mutually_exclusive_group() + protocols.add_argument( + "--sel-excl", nargs='*', metavar="E", + help="Exclude samples with these values") + protocols.add_argument( + "--sel-incl", nargs='*', metavar="I", + help="Include only samples with these values") + subparser.add_argument( + "-a", "--amend", nargs="+", metavar="A", + help="List of amendments to activate") + result.append(parser) + return result diff --git a/looper/_version.py b/looper/_version.py index 8e2394f4e..c68196d1c 100644 --- a/looper/_version.py +++ b/looper/_version.py @@ -1 +1 @@ -__version__ = "0.12.6" +__version__ = "1.2.0" diff --git a/looper/conductor.py b/looper/conductor.py index a07f5f4e2..b47f962f0 100644 --- a/looper/conductor.py +++ b/looper/conductor.py @@ -2,24 +2,19 @@ import logging import os -import re import subprocess import time +from jinja2.exceptions import UndefinedError -from .const import OUTKEY -from .exceptions import JobSubmissionException -from .pipeline_interface import PL_KEY -from .utils import \ - create_looper_args_text, grab_project_data, fetch_sample_flags - -from .sample import Sample -from peppy import VALID_READ_TYPES -from peppy.sample import SAMPLE_YAML_EXT - - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" +from attmap import AttMap +from eido import read_schema, validate_inputs +from peppy.const import CONFIG_KEY, SAMPLE_YAML_EXT, SAMPLE_NAME_ATTR +from .processed_project import populate_sample_paths +from .const import * +from .exceptions import JobSubmissionException +from .utils import grab_project_data, fetch_sample_flags, \ + jinja_render_cmd_strictly _LOGGER = logging.getLogger(__name__) @@ -36,10 +31,10 @@ class SubmissionConductor(object): """ - def __init__(self, pipeline_key, pipeline_interface, cmd_base, prj, - dry_run=False, delay=0, 
sample_subtype=None, extra_args=None, - ignore_flags=False, compute_variables=None, - max_cmds=None, max_size=None, automatic=True): + def __init__(self, pipeline_interface, prj, delay=0, extra_args=None, + extra_args_override=None, ignore_flags=False, + compute_variables=None, max_cmds=None, max_size=None, + automatic=True, collate=False): """ Create a job submission manager. @@ -48,27 +43,23 @@ def __init__(self, pipeline_key, pipeline_interface, cmd_base, prj, information like resource allocation packages and which pipeline will be overseen by this instance, respectively. - :param str pipeline_key: 'Hook' into the pipeline interface, and the - datum that determines which pipeline this manager will oversee. :param PipelineInterface pipeline_interface: Collection of important data for one or more pipelines, like resource allocation packages and option/argument specifications - :param str cmd_base: Base of each command for each job, e.g. the - script path and command-line options/flags that are constant - across samples. :param prj: Project with which each sample being considered is associated (what generated each sample) - :param bool dry_run: Whether this is a dry run and thus everything - but the actual job submission should be done. :param float delay: Time (in seconds) to wait before submitting a job once it's ready - :param type sample_subtype: Extension of base Sample, for particular - pipeline for which submissions will be managed by this instance - :param list extra_args: Additional arguments to add (positionally) to - each command within each job generated + :param str extra_args: string to pass to each job generated, + for example additional pipeline arguments + :param str extra_args_override: string to pass to each job generated, + for example additional pipeline arguments. This deactivates the + 'extra' functionality that appends strings defined in + Sample.command_extra and Project.looper.command_extra to the + command template. 
:param bool ignore_flags: Whether to ignore flag files present in the sample folder for each sample considered for submission - :param str compute_variables: A dict with variables that will be made + :param dict[str] compute_variables: A dict with variables that will be made available to the compute package. For example, this should include the name of the cluster partition to which job or jobs will be submitted :param int | NoneType max_cmds: Upper bound on number of commands to @@ -77,51 +68,52 @@ def __init__(self, pipeline_key, pipeline_interface, cmd_base, prj, size of inputs used by the commands lumped into single job script. :param bool automatic: Whether the submission should be automatic once the pool reaches capacity. + :param bool collate: Whether a collate job is to be submitted (runs on + the project level, rather that on the sample level) """ super(SubmissionConductor, self).__init__() - - self.pl_key = pipeline_key + self.collate = collate + self.section_key = PROJECT_PL_KEY if self.collate else SAMPLE_PL_KEY self.pl_iface = pipeline_interface - self.pl_name = pipeline_interface.get_pipeline_name(pipeline_key) - self.cmd_base = cmd_base.rstrip(" ") - - self.dry_run = dry_run - self.delay = float(delay) - - self.sample_subtype = sample_subtype or Sample - if not issubclass(self.sample_subtype, Sample): - raise TypeError("Sample type must extend {}; got {}".format( - Sample.__name__, type(self.sample_subtype).__name__)) - + self.pl_name = self.pl_iface.pipeline_name + self.prj = prj self.compute_variables = compute_variables - self.extra_pipe_args = extra_args or [] - #self.extra_args_text = (extra_args and " ".join(extra_args)) or "" - self.uses_looper_args = \ - pipeline_interface.uses_looper_args(pipeline_key) + self.extra_pipe_args = extra_args + self.override_extra = False + if extra_args_override: + self.extra_pipe_args = extra_args_override + self.override_extra = True self.ignore_flags = ignore_flags - self.prj = prj - self.automatic = 
automatic - - if max_cmds is None and max_size is None: - self.max_cmds = 1 - elif (max_cmds is not None and max_cmds < 1) or \ - (max_size is not None and max_size < 0): - raise ValueError( - "If specified, max per-job command count must positive, " - "and max per-job total file size must be nonnegative") - else: - self.max_cmds = max_cmds - self.max_size = max_size or float("inf") - self._failed_sample_names = [] - self._pool = [] - self._curr_size = 0 - self._reset_curr_skips() - self._skipped_sample_pools = [] + self.dry_run = self.prj.dry_run + self.delay = float(delay) self._num_good_job_submissions = 0 self._num_total_job_submissions = 0 self._num_cmds_submitted = 0 + self._curr_size = 0 + self._failed_sample_names = [] + + if self.extra_pipe_args: + _LOGGER.debug("String appended to every pipeline command: " + "{}".format(self.extra_pipe_args)) + + if not self.collate: + self.automatic = automatic + if max_cmds is None and max_size is None: + self.max_cmds = 1 + elif (max_cmds is not None and max_cmds < 1) or \ + (max_size is not None and max_size < 0): + raise ValueError( + "If specified, max per-job command count must positive, " + "and max per-job total file size must be nonnegative") + else: + self.max_cmds = max_cmds + self.max_size = max_size or float("inf") + + self._pool = [] + self._reset_curr_skips() + self._skipped_sample_pools = [] @property def failed_samples(self): @@ -158,130 +150,65 @@ def add_sample(self, sample, rerun=False): :raise TypeError: If sample subtype is provided but does not extend the base Sample class, raise a TypeError. 
""" - - _LOGGER.debug("Adding {} to conductor for {}".format(sample.name, self.pl_name)) + _LOGGER.debug("Adding {} to conductor for {} to {}run".format( + sample.sample_name, self.pl_name, "re" if rerun else "")) flag_files = fetch_sample_flags(self.prj, sample, self.pl_name) - use_this_sample = True + use_this_sample = not rerun - if flag_files: + if flag_files or rerun: if not self.ignore_flags: use_this_sample = False # But rescue the sample in case rerun/failed passes failed_flag = any("failed" in x for x in flag_files) - if rerun and failed_flag: - _LOGGER.info("> Re-running failed sample '%s' for pipeline '%s'.", - sample.name, self.pl_name) - use_this_sample = True + if rerun: + if failed_flag: + _LOGGER.info("> Re-running failed sample") + use_this_sample = True + else: + use_this_sample = False if not use_this_sample: - _LOGGER.info("> Skipping sample '%s' for pipeline '%s', " - "%s found: %s", sample.name, self.pl_name, - "flags" if len(flag_files) > 1 else "flag", - ", ".join(['{}'.format( - os.path.basename(fp)) for fp in flag_files])) - _LOGGER.debug("NO SUBMISSION") - - if type(sample) != self.sample_subtype: - _LOGGER.debug( - "Building {} from {}".format(self.sample_subtype, type(sample))) - sample = self.sample_subtype(sample.to_dict()) - else: - _LOGGER.debug( - "{} is already of type {}".format(sample.name, self.sample_subtype)) - _LOGGER.debug("Created %s instance: '%s'", - self.sample_subtype.__name__, sample.name) + msg = "> Skipping sample" + if flag_files: + msg += ". Flags found: {}".format(flag_files) + _LOGGER.info(msg) + + if self.prj.toggle_key in sample \ + and int(sample[self.prj.toggle_key]) == 0: + _LOGGER.warning( + "> Skiping sample ({}: {})". + format(self.prj.toggle_key, sample[self.prj.toggle_key]) + ) + use_this_sample = False + sample.prj = grab_project_data(self.prj) skip_reasons = [] - - try: - # Add pipeline-specific attributes. 
- sample.set_pipeline_attributes( - self.pl_iface, pipeline_name=self.pl_key) - except AttributeError: - # TODO: inform about WHICH missing attributes? - fail_message = "Pipeline required attribute missing" - _LOGGER.warning("> Not submitted: %s", fail_message) - use_this_sample and skip_reasons.append(fail_message) - + sample.setdefault("input_file_size", 0) # Check for any missing requirements before submitting. _LOGGER.debug("Determining missing requirements") - error_type, missing_reqs_general, missing_reqs_specific = \ - sample.determine_missing_requirements() - if missing_reqs_general: - missing_reqs_msg = "{}: {}".format( - missing_reqs_general, missing_reqs_specific) - if self.prj.permissive: - _LOGGER.warning("> Not submitted: %s", missing_reqs_msg) - else: - raise error_type(missing_reqs_msg) - use_this_sample and skip_reasons.append(missing_reqs_general) - - # Check if single_or_paired value is recognized. - if hasattr(sample, "read_type"): - # Drop "-end", "_end", or "end" from end of the column value. - rtype = re.sub('[_\\-]?end$', '', - str(sample.read_type)) - sample.read_type = rtype.lower() - if sample.read_type not in VALID_READ_TYPES: - _LOGGER.debug( - "Invalid read type: '{}'".format(sample.read_type)) - use_this_sample and skip_reasons.append( - "read_type must be in {}".format(VALID_READ_TYPES)) - - # Append arguments for this pipeline - # Sample-level arguments are handled by the pipeline interface. - try: - argstring = self.pl_iface.get_arg_string( - pipeline_name=self.pl_key, sample=sample, - submission_folder_path=self.prj.submission_folder) - except AttributeError: - argstring = None - # TODO: inform about which missing attribute. 
- fail_message = "Required attribute missing " \ - "for pipeline arguments string" - _LOGGER.warning("> Not submitted: %s", fail_message) - use_this_sample and skip_reasons.append(fail_message) - use_this_sample = False - - this_sample_size = float(sample.input_file_size) + schema_source = self.pl_iface.get_pipeline_schemas() + if schema_source and self.prj.file_checks: + missing = validate_inputs(sample, read_schema(schema_source)) + if missing: + missing_reqs_msg = "{}: {}".format("Missing files", missing) + _LOGGER.warning(NOT_SUB_MSG.format(missing_reqs_msg)) + use_this_sample and skip_reasons.append("Missing files") if _use_sample(use_this_sample, skip_reasons): - _check_argstring(argstring, sample.name) - self._pool.append((sample, argstring)) - self._curr_size += this_sample_size + self._pool.append(sample) + self._curr_size += float(sample.input_file_size) if self.automatic and self._is_full(self._pool, self._curr_size): self.submit() - elif argstring is not None: - self._curr_skip_size += this_sample_size - self._curr_skip_pool.append((sample, argstring)) + else: + self._curr_skip_size += float(sample.input_file_size) + self._curr_skip_pool.append(sample) if self._is_full(self._curr_skip_pool, self._curr_skip_size): - self._skipped_sample_pools.append( - (self._curr_skip_pool, self._curr_skip_size)) + self._skipped_sample_pools.append((self._curr_skip_pool, + self._curr_skip_size)) self._reset_curr_skips() return skip_reasons - def _get_settings_looptext_prjtext(self, size): - """ - Determine settings, looper argstring, and project argstring. 
- - :param int | float size: size of submission, used to select the proper - resource package from the pipeline interface - :return dict, str, str: collection of settings, looper argstring, and - project argstring - """ - settings = self.pl_iface.choose_resource_package(self.pl_key, size) - settings.update(self.compute_variables or {}) - if self.uses_looper_args: - settings.setdefault("cores", 1) - looper_argtext = \ - create_looper_args_text(self.pl_key, settings, self.prj) - else: - looper_argtext = "" - prj_argtext = self.prj.get_arg_string( - self.pl_key, {x for x in self.extra_pipe_args if x.startswith("-")}) - return settings, looper_argtext, prj_argtext - def submit(self, force=False): """ Submit one or more commands as a job. @@ -295,44 +222,25 @@ def submit(self, force=False): :return bool: Whether a job was submitted (or would've been if not for dry run) """ - + submitted = False if not self._pool: _LOGGER.debug("No submission (no pooled samples): %s", self.pl_name) - submitted = False - - elif force or self._is_full(self._pool, self._curr_size): - # Ensure that each sample is individually represented on disk, - # specific to subtype as applicable (should just be a single - # subtype for each submission conductor, but some may just be - # the base Sample while others are the single valid subtype.) 
- pipe_data = self.pl_iface[PL_KEY][self.pl_key] - try: - outputs = pipe_data[OUTKEY] - except KeyError: - _LOGGER.debug("No outputs for pipeline '{}'".format(self.pl_key)) - add_outputs = lambda _: None - else: - def add_outputs(s): - s[OUTKEY] = outputs - for s, _ in self._pool: - if not _is_base_sample(s): - subtype_name = s.__class__.__name__ - _LOGGER.debug("Writing %s representation to disk: '%s'", - subtype_name, s.name) - add_outputs(s) - yaml_path = s.to_yaml(subs_folder_path=self.prj.submission_folder) - _LOGGER.debug("Wrote sample YAML: {}".format(yaml_path)) - + # submitted = False + elif self.collate or force or self._is_full(self._pool, self._curr_size): + if not self.collate: + for s in self._pool: + schemas = self.prj.get_schemas(self.prj.get_sample_piface( + s[SAMPLE_NAME_ATTR]), OUTPUT_SCHEMA_KEY) + [populate_sample_paths(s, read_schema(schema)) + for schema in schemas] + s.to_yaml(self._get_sample_yaml_path(s)) script = self.write_script(self._pool, self._curr_size) - - self._num_total_job_submissions += 1 - # Determine whether to actually do the submission. - _LOGGER.info("Job script (n=%d; %.2f Gb): %s", - len(self._pool), self._curr_size, script) + _LOGGER.info("Job script (n={0}; {1:.2f}Gb): {2}". 
+ format(len(self._pool), self._curr_size, script)) if self.dry_run: _LOGGER.info("Dry run, not submitted") - else: + elif self._rendered_ok: sub_cmd = self.prj.dcc.compute.submission_command submission_command = "{} {}".format(sub_cmd, script) # Capture submission command return value so that we can @@ -340,26 +248,50 @@ def add_outputs(s): try: subprocess.check_call(submission_command, shell=True) except subprocess.CalledProcessError: - self._failed_sample_names.extend( - [s.name for s in self._samples]) + fails = "" if self.collate \ + else [s.sample_name for s in self._samples] + self._failed_sample_names.extend(fails) self._reset_pool() raise JobSubmissionException(sub_cmd, script) time.sleep(self.delay) # Update the job and command submission tallies. _LOGGER.debug("SUBMITTED") - submitted = True - self._num_good_job_submissions += 1 - self._num_cmds_submitted += len(self._pool) + if self._rendered_ok: + submitted = True + self._num_cmds_submitted += len(self._pool) self._reset_pool() else: _LOGGER.debug("No submission (pool is not full and submission " "was not forced): %s", self.pl_name) - submitted = False + # submitted = False return submitted + def _get_sample_yaml_path(self, sample): + """ + Generate path to the sample YAML target location. + + Render path template defined in the pipeline section + (relative to the pipeline output directory). + If no template defined, output to the submission directory. 
+ + :param peppy.Sample sample: sample to generate yaml path for + :return str: path to yaml file + """ + if SAMPLE_YAML_PATH_KEY not in self.pl_iface: + return os.path.join(self.prj.submission_folder, + "{}{}".format(sample.sample_name, + SAMPLE_YAML_EXT[0])) + pth_templ = self.pl_iface[SAMPLE_YAML_PATH_KEY] + namespaces = {"sample": sample, + "project": self.prj.prj[CONFIG_KEY], + "pipeline": self.pl_iface} + path = jinja_render_cmd_strictly(pth_templ, namespaces) + return path if os.path.isabs(path) \ + else os.path.join(self.prj.output_dir, path) + def _is_full(self, pool, size): """ Determine whether it's time to submit a job for the pool of commands. @@ -381,69 +313,140 @@ def _samples(self): :return Iterable[str]: collection of samples currently in the active pool for this submission conductor """ - return [s for s, _ in self._pool] + return [s for s in self._pool] - def _jobname(self, pool): - """ Create the name for a job submission. """ + def _sample_lump_name(self, pool): + """ Determine how to refer to the 'sample' for this submission. """ + if self.collate: + return "collate" if 1 == self.max_cmds: assert 1 == len(pool), \ - "If there's a single-command limit on job submission, jobname " \ - "must be determined with exactly one sample in the pool, but " \ - "there is/are {}.".format(len(pool)) - sample, _ = pool[0] - name = sample.name + "If there's a single-command limit on job submission, jobname" \ + " must be determined with exactly one sample in the pool," \ + " but there is/are {}.".format(len(pool)) + sample = pool[0] + return sample.sample_name else: # Note the order in which the increment of submission count and # the call to this function can influence naming. Make the jobname # generation call (this method) before incrementing the # submission counter, but add 1 to the index so that we get a # name concordant with 1-based, not 0-based indexing. 
- name = "lump{}".format(self._num_total_job_submissions + 1) - return "{}_{}".format(self.pl_key, name) + return "lump{}".format(self._num_total_job_submissions + 1) - def _cmd_text_extra(self, size): - _LOGGER.debug("Determining submission settings for pool of size %.2f Gb", size) - settings, ltext, ptext = self._get_settings_looptext_prjtext(size) - from_cli = " ".join(self.extra_pipe_args) if self.extra_pipe_args else "" - return settings, " ".join([t for t in [ptext, ltext, from_cli] if t]) + def _jobname(self, pool): + """ Create the name for a job submission. """ + return "{}_{}".format(self.pl_iface.pipeline_name, + self._sample_lump_name(pool)) + + def _set_looper_namespace(self, pool, size): + """ + Compile a dictionary of looper/submission related settings for use in + the command templates and in submission script creation + in divvy (via adapters). Accessible via: {looper.attrname} + + :param Iterable[peppy.Sample] pool: collection of sample instances + :param float size: cumulative size of the given pool + :return dict: looper/submission related settings + """ + settings = AttMap() + settings.pep_config = self.prj.config_file + settings.results_subdir = self.prj.results_folder + settings.submission_subdir = self.prj.submission_folder + settings.output_dir = self.prj.output_dir + settings.sample_output_folder = \ + os.path.join(self.prj.results_folder, self._sample_lump_name(pool)) + settings.job_name = self._jobname(pool) + settings.total_input_size = size + settings.log_file = \ + os.path.join(self.prj.submission_folder, settings.job_name) + ".log" + if hasattr(self.prj, "pipeline_config"): + # Make sure it's a file (it could be provided as null.) 
+ pl_config_file = self.prj.pipeline_config + if pl_config_file: + if not os.path.isfile(pl_config_file): + _LOGGER.error("Pipeline config file specified " + "but not found: %s", pl_config_file) + raise IOError(pl_config_file) + _LOGGER.info("Found config file: %s", pl_config_file) + # Append arg for config file if found + settings.pipeline_config = pl_config_file + return settings def write_script(self, pool, size): """ Create the script for job submission. - :param Iterable[(peppy.Sample, str)] pool: collection of pairs in which - first component is a sample instance and second is command/argstring + :param Iterable[peppy.Sample] pool: collection of sample instances :param float size: cumulative size of the given pool :return str: Path to the job submission script created. """ - - template_values, extra_parts_text = self._cmd_text_extra(size) - - def get_final_cmd(c): - return "{} {}".format(c, extra_parts_text) if extra_parts_text else c - - def get_base_cmd(argstr): - b = self.cmd_base - return (argstr and "{} {}".format(b, argstr.strip(" "))) or b - - # Create the individual commands to lump into this job. 
- commands = [get_final_cmd(get_base_cmd(argstring)) for _, argstring in pool] - - jobname = self._jobname(pool) - submission_base = os.path.join( - self.prj.submission_folder, jobname) - logfile = submission_base + ".log" - template_values["JOBNAME"] = jobname - template_values["CODE"] = "\n".join(commands) - template_values["LOGFILE"] = logfile - submission_script = submission_base + ".sub" - - _LOGGER.debug("> Creating submission script; command count: %d", len(commands)) - return self.prj.dcc.write_script(submission_script, template_values) + # looper settings determination + if self.collate: + pool = [None] + looper = self._set_looper_namespace(pool, size) + commands = [] + namespaces = dict(project=self.prj[CONFIG_KEY], + looper=looper, + pipeline=self.pl_iface) + templ = self.pl_iface["command_template"] + if not self.override_extra: + extras_template = EXTRA_PROJECT_CMD_TEMPLATE if self.collate \ + else EXTRA_SAMPLE_CMD_TEMPLATE + templ += extras_template + for sample in pool: + # cascading compute settings determination: + # divcfg < pipeline interface < config < CLI + cli = self.compute_variables or {} # CLI + if sample: + namespaces.update({"sample": sample}) + else: + namespaces.update({"samples": self.prj.samples}) + res_pkg = self.pl_iface.choose_resource_package(namespaces, size or 0) # config + res_pkg.update(cli) + self.prj.dcc.compute.update(res_pkg) # divcfg + namespaces.update({"compute": self.prj.dcc.compute}) + self._rendered_ok = False + try: + argstring = jinja_render_cmd_strictly(cmd_template=templ, + namespaces=namespaces) + except UndefinedError as jinja_exception: + _LOGGER.warning(NOT_SUB_MSG.format(str(jinja_exception))) + except KeyError as e: + exc = "pipeline interface is missing {} section".format(str(e)) + _LOGGER.warning(NOT_SUB_MSG.format(exc)) + else: + commands.append("{} {}".format(argstring, self.extra_pipe_args)) + self._rendered_ok = True + self._num_good_job_submissions += 1 + self._num_total_job_submissions += 1 + 
looper.command = "\n".join(commands) + if self.collate: + _LOGGER.debug("samples namespace:\n{}".format(self.prj.samples)) + else: + _LOGGER.debug("sample namespace:\n{}".format(sample)) + _LOGGER.debug("project namespace:\n{}".format(self.prj[CONFIG_KEY])) + _LOGGER.debug("pipeline namespace:\n{}".format(self.pl_iface)) + _LOGGER.debug("compute namespace:\n{}".format(self.prj.dcc.compute)) + _LOGGER.debug("looper namespace:\n{}".format(looper)) + subm_base = os.path.join(self.prj.submission_folder, looper.job_name) + return self.prj.dcc.write_script(output_path=subm_base + ".sub", + extra_vars=[{"looper": looper}]) def write_skipped_sample_scripts(self): - """ For any sample skipped during initial processing, write submission script. """ - return [self.write_script(pool, size) for pool, size in self._skipped_sample_pools] + """ + For any sample skipped during initial processingwrite submission script + """ + if self._curr_skip_pool: + # move any hanging samples from current skip pool to the main pool + self._skipped_sample_pools.append( + (self._curr_skip_pool, self._curr_skip_size) + ) + if self._skipped_sample_pools: + _LOGGER.info("Writing {} submission scripts for skipped samples". 
+ format(len(self._skipped_sample_pools))) + [self.write_script(pool, size) + for pool, size in self._skipped_sample_pools] def _reset_pool(self): """ Reset the state of the pool of samples """ @@ -455,14 +458,5 @@ def _reset_curr_skips(self): self._curr_skip_size = 0 -def _check_argstring(argstring, sample_name): - assert argstring is not None, \ - "Failed to create argstring for sample: {}".format(sample_name) - - -def _is_base_sample(s): - return type(s) is Sample - - def _use_sample(flag, skips): return flag and not skips diff --git a/looper/const.py b/looper/const.py index 9264a03ff..cdbbdd77e 100644 --- a/looper/const.py +++ b/looper/const.py @@ -1,12 +1,28 @@ """ Shared project constants """ +import os + __author__ = "Databio lab" __email__ = "nathan@code.databio.org" +__all__ = [ + "BUTTON_APPEARANCE_BY_FLAG", "TABLE_APPEARANCE_BY_FLAG", + "ID_COLNAME", "NO_DATA_PLACEHOLDER", "OUTKEY", "ALL_SUBCMD_KEY", + "OUTDIR_KEY", "LOOPER_KEY", "COMPUTE_KEY", "PIPELINE_INTERFACES_KEY", + "SIZE_DEP_VARS_KEY", "FLAGS", "DYN_VARS_KEY", "SAMPLE_YAML_PATH_KEY", + "RESOURCES_KEY", "NOT_SUB_MSG", "EXTRA_KEY", "DEFAULT_CFG_PATH", + "PIFACE_SCHEMA_SRC", "RESULTS_SUBDIR_KEY", "SUBMISSION_SUBDIR_KEY", + "TEMPLATES_DIRNAME", "FILE_SIZE_COLNAME", "COMPUTE_PACKAGE_KEY", + "INPUT_SCHEMA_KEY", "OUTPUT_SCHEMA_KEY", "EXAMPLE_COMPUTE_SPEC_FMT", + "SAMPLE_PL_KEY", "PROJECT_PL_KEY", "CFG_ENV_VARS", "LOGGING_LEVEL", + "PIFACE_KEY_SELECTOR", "SUBMISSION_FAILURE_MESSAGE", "IMAGE_EXTS", + "PROFILE_COLNAMES", "SAMPLE_TOGGLE_ATTR", "TOGGLE_KEY_SELECTOR", + "LOOPER_DOTFILE_NAME", "POSITIONAL", "EXTRA_PROJECT_CMD_TEMPLATE", + "EXTRA_SAMPLE_CMD_TEMPLATE", "SELECTED_COMPUTE_PKG", "CLI_PROJ_ATTRS", + "DOTFILE_CFG_PTH_KEY", "DRY_RUN_KEY", "FILE_CHECKS_KEY", "CLI_KEY" +] -__all__ = ["BUTTON_APPEARANCE_BY_FLAG", "TABLE_APPEARANCE_BY_FLAG", "NO_DATA_PLACEHOLDER", "OUTKEY", - "PIPELINE_INTERFACES_KEY", "PIPELINE_REQUIREMENTS_KEY", - "RESULTS_SUBDIR_KEY", "SUBMISSION_SUBDIR_KEY", "TEMPLATES_DIRNAME"] 
+FLAGS = ["completed", "running", "failed", "waiting", "partial"] APPEARANCE_BY_FLAG = { "completed": { @@ -34,9 +50,10 @@ def _get_apperance_dict(type, templ=APPEARANCE_BY_FLAG): """ - Based on the type of the HTML element provided construct the appearence mapping using the template + Based on the type of the HTML element provided construct the appearence + mapping using the template - :param dict templ: appearance templete to populate + :param dict templ: appearance template to populate :param str type: type of HTML element to populate template with :return dict: populated appearance template """ @@ -48,14 +65,57 @@ def _get_apperance_dict(type, templ=APPEARANCE_BY_FLAG): return ret +LOGGING_LEVEL = "INFO" +CFG_ENV_VARS = ["LOOPER"] TABLE_APPEARANCE_BY_FLAG = _get_apperance_dict("table") BUTTON_APPEARANCE_BY_FLAG = _get_apperance_dict("btn btn") NO_DATA_PLACEHOLDER = "NA" +PIFACE_KEY_SELECTOR = "pipeline_interfaces_key" PIPELINE_INTERFACES_KEY = "pipeline_interfaces" -PIPELINE_REQUIREMENTS_KEY = "required_executables" +RESOURCES_KEY = "resources" +SAMPLE_PL_KEY = "sample_pipeline" +PROJECT_PL_KEY = "project_pipeline" +PIFACE_SCHEMA_SRC = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "schemas", "pipeline_interface_schema_{}.yaml") +EXTRA_SAMPLE_CMD_TEMPLATE = "{%- if sample.command_extra is defined %} {sample.command_extra} {% endif -%}" +EXTRA_PROJECT_CMD_TEMPLATE = "{%- if project.looper.command_extra is defined %} {project.looper.command_extra}{% endif -%}" +DOTFILE_CFG_PTH_KEY = "config_file_path" +INPUT_SCHEMA_KEY = "input_schema" +OUTPUT_SCHEMA_KEY = "output_schema" +SAMPLE_YAML_PATH_KEY = "sample_yaml_path" +TOGGLE_KEY_SELECTOR = "toggle_key" +SAMPLE_TOGGLE_ATTR = "toggle" OUTKEY = "outputs" +COMPUTE_KEY = "compute" +COMPUTE_PACKAGE_KEY = "package" +SIZE_DEP_VARS_KEY = "size_dependent_variables" +DYN_VARS_KEY = "dynamic_variables_command_template" +TEMPLATES_DIRNAME = "jinja_templates" +NOT_SUB_MSG = "> Not submitted: {}" +IMAGE_EXTS = ('.png', 
'.jpg', '.jpeg', '.svg', '.gif') +PROFILE_COLNAMES = ['pid', 'hash', 'cid', 'runtime', 'mem', 'cmd', 'lock'] # this strongly depends on pypiper's profile.tsv format + +PIPE_ARGS_SECTION = "pipeline_args" +CLI_KEY = "cli" +LOOPER_KEY = "looper" +OUTDIR_KEY = "output_dir" RESULTS_SUBDIR_KEY = "results_subdir" SUBMISSION_SUBDIR_KEY = "submission_subdir" -TEMPLATES_DIRNAME = "jinja_templates" +DRY_RUN_KEY = "dry_run" +FILE_CHECKS_KEY = "skip_file_checks" +EXAMPLE_COMPUTE_SPEC_FMT = "k1=v1 k2=v2" +SUBMISSION_FAILURE_MESSAGE = "Cluster resource failure" +LOOPER_DOTFILE_NAME = "." + LOOPER_KEY + ".yaml" +POSITIONAL = ["config_file", "command"] +SELECTED_COMPUTE_PKG = "package" +EXTRA_KEY = "_cli_extra" +ALL_SUBCMD_KEY = "all" +DEFAULT_CFG_PATH = os.path.join(os.getcwd(), LOOPER_DOTFILE_NAME) +CLI_PROJ_ATTRS = [OUTDIR_KEY, TOGGLE_KEY_SELECTOR, SUBMISSION_SUBDIR_KEY, PIPELINE_INTERFACES_KEY, + RESULTS_SUBDIR_KEY, PIFACE_KEY_SELECTOR, COMPUTE_PACKAGE_KEY, DRY_RUN_KEY, FILE_CHECKS_KEY] + +# resource package TSV-related consts +ID_COLNAME = "id" +FILE_SIZE_COLNAME = "max_file_size" IMAGE_EXTS = ('.png', '.jpg', '.jpeg', '.svg', '.gif') PROFILE_COLNAMES = ['pid', 'hash', 'cid', 'runtime', 'mem', 'cmd', 'lock'] # this strongly depends on pypiper's profile.tsv format diff --git a/looper/exceptions.py b/looper/exceptions.py index 995b72c7b..56ee54e26 100644 --- a/looper/exceptions.py +++ b/looper/exceptions.py @@ -12,7 +12,8 @@ "LooperError", "MissingPipelineConfigurationException", "PipelineInterfaceConfigError", - "PipelineInterfaceRequirementsError"] + "PipelineInterfaceRequirementsError", + "MisconfigurationException"] class LooperError(Exception): @@ -20,6 +21,12 @@ class LooperError(Exception): __metaclass__ = ABCMeta +class MisconfigurationException(LooperError): + """ Duplication of pipeline identifier precludes unique pipeline ref. 
""" + def __init__(self, key): + super(MisconfigurationException, self).__init__(key) + + class DuplicatePipelineKeyException(LooperError): """ Duplication of pipeline identifier precludes unique pipeline ref. """ def __init__(self, key): @@ -34,7 +41,6 @@ def __init__(self, reason): class JobSubmissionException(LooperError): """ Error type for when job submission fails. """ - def __init__(self, sub_cmd, script): self.script = script reason = "Error for command {} and script '{}'".\ diff --git a/looper/html_reports.py b/looper/html_reports.py index cafaa51e6..9491ea5f9 100644 --- a/looper/html_reports.py +++ b/looper/html_reports.py @@ -10,7 +10,11 @@ from warnings import warn from datetime import timedelta from ._version import __version__ as v -from .const import TEMPLATES_DIRNAME, BUTTON_APPEARANCE_BY_FLAG, TABLE_APPEARANCE_BY_FLAG, NO_DATA_PLACEHOLDER, IMAGE_EXTS, PROFILE_COLNAMES +from .const import * +from .processed_project import get_project_outputs +from .utils import get_file_for_project +from peppy.const import * +from eido import read_schema from copy import copy as cp _LOGGER = logging.getLogger("looper") @@ -27,20 +31,26 @@ def __init__(self, prj): super(HTMLReportBuilder, self).__init__() self.prj = prj self.j_env = get_jinja_env() - self.reports_dir = get_reports_dir(self.prj) - self.index_html_path = get_index_html_path(self.prj) + self.reports_dir = get_file_for_project(self.prj, "reports") + self.index_html_path = get_file_for_project(self.prj, "summary.html") self.index_html_filename = os.path.basename(self.index_html_path) + self._outdir = self.prj.output_dir _LOGGER.debug("Reports dir: {}".format(self.reports_dir)) def __call__(self, objs, stats, columns): """ Do the work of the subcommand/program. 
""" # Generate HTML report - navbar = self.create_navbar(self.create_navbar_links(objs=objs, stats=stats, wd=self.prj.metadata.output_dir), - self.index_html_filename) - navbar_reports = self.create_navbar(self.create_navbar_links(objs=objs, stats=stats, wd=self.reports_dir), - os.path.join("..", self.index_html_filename)) - index_html_path = self.create_index_html(objs, stats, columns, footer=self.create_footer(), navbar=navbar, - navbar_reports=navbar_reports) + navbar = self.create_navbar(self.create_navbar_links( + objs=objs, stats=stats, + wd=self._outdir), + self.index_html_filename) + navbar_reports = self.create_navbar( + self.create_navbar_links( + objs=objs, stats=stats, wd=self.reports_dir), + os.path.join("..", self.index_html_filename)) + index_html_path = self.create_index_html( + objs, stats, columns, footer=self.create_footer(), + navbar=navbar, navbar_reports=navbar_reports) return index_html_path def create_object_parent_html(self, objs, navbar, footer): @@ -204,28 +214,28 @@ def create_object_html(self, single_object, navbar, footer): err_msg = ("Sample: {} | " + "Missing valid object path for: {}") # Report the sample that fails, if that information exists if str(row['sample_name']) and str(row['filename']): - _LOGGER.warn(err_msg.format(row['sample_name'], row['filename'])) + _LOGGER.warning(err_msg.format(row['sample_name'], row['filename'])) else: - _LOGGER.warn(err_msg.format("Unknown sample")) + _LOGGER.warning(err_msg.format("Unknown sample")) object_relpath = "" # Set the PATH to the image/file. Catch any errors. 
# Check if the object is an HTML document + if not str(row['anchor_image']).lower().endswith(IMAGE_EXTS): image_path = object_path else: try: image_path = os.path.join(self.prj.results_folder, row['sample_name'], row['anchor_image']) except AttributeError: - _LOGGER.warn(str(row)) + _LOGGER.warning(str(row)) err_msg = ("Sample: {} | " + "Missing valid image path for: {}") # Report the sample that fails, if that information exists if str(row['sample_name']) and str(row['filename']): - _LOGGER.warn(err_msg.format(row['sample_name'], row['filename'])) + _LOGGER.warning(err_msg.format(row['sample_name'], row['filename'])) else: - _LOGGER.warn(err_msg.format("Unknown", "Unknown")) + _LOGGER.warning(err_msg.format("Unknown", "Unknown")) image_path = "" - # Check for the presence of both the file and thumbnail if os.path.isfile(image_path) and os.path.isfile(object_path): image_relpath = os.path.relpath(image_path, self.reports_dir) @@ -263,7 +273,7 @@ def create_sample_html(self, objs, sample_name, sample_stats, navbar, footer): """ html_filename = sample_name + ".html" html_page = os.path.join(self.reports_dir, html_filename.replace(' ', '_').lower()) - sample_page_relpath = os.path.relpath(html_page, self.prj.metadata.output_dir) + sample_page_relpath = os.path.relpath(html_page, self._outdir) single_sample = _pd.DataFrame() if objs.empty else objs[objs['sample_name'] == sample_name] if not os.path.exists(os.path.dirname(html_page)): os.makedirs(os.path.dirname(html_page)) @@ -371,59 +381,79 @@ def create_status_html(self, status_table, navbar, footer): :return str: rendered status HTML file """ _LOGGER.debug("Building status page...") - template_vars = dict(status_table=status_table, navbar=navbar, footer=footer) + template_vars = dict(status_table=status_table, navbar=navbar, + footer=footer) return render_jinja_template("status.html", self.j_env, template_vars) def create_project_objects(self): - """ Render available project level summaries as additional 
figures/links """ - _LOGGER.debug("Building project object...") - all_protocols = [sample.protocol for sample in self.prj.samples] - + """ + Render available project level outputs defined in the + pipeline output schemas + """ + _LOGGER.debug("Building project objects section...") + figures = [] + links = [] + warnings = [] # For each protocol report the project summarizers' results - for protocol in set(all_protocols): - _LOGGER.debug("Creating project objects for protocol:{}".format(protocol)) - figures = [] - links = [] - warnings = [] - ifaces = self.prj.get_interfaces(protocol) - - # Check the interface files for summarizers - for iface in ifaces: - pl = iface.fetch_pipelines(protocol) - summary_results = iface.get_attribute(pl, "summary_results") - - # Build the HTML for each summary result - if summary_results is not None: - for result in summary_results: - result.setdefault('caption', "No caption") - caption = str(result['caption']) - result_file = str(result['path']).replace('{name}', str(self.prj.name)) - result_img = str(result['thumbnail_path']).replace('{name}', str(self.prj.name)) - search = os.path.join(self.prj.metadata.output_dir, '{}'.format(result_file)) + self.prj.populate_pipeline_outputs() + ifaces = self.prj.project_pipeline_interfaces + # Check the interface files for summarizers + for iface in ifaces: + schema_paths = \ + iface.get_pipeline_schemas(OUTPUT_SCHEMA_KEY) + if schema_paths is not None: + if isinstance(schema_paths, str): + schema_paths = [schema_paths] + for output_schema_path in schema_paths: + results = get_project_outputs( + self.prj, read_schema(output_schema_path)) + for name, result in results.items(): + title = str(result.setdefault('title', "No caption")) + result_type = str(result['type']) + result_file = str(result['path']) + result_img = \ + str(result.setdefault('thumbnail_path', None)) + if result_img and not os.path.isabs(result_file): + result_img = os.path.join( + self._outdir, result_img) + if not 
os.path.isabs(result_file): + result_file = os.path.join( + self._outdir, result_file) + _LOGGER.debug("Looking for project file: {}". + format(result_file)) # Confirm the file itself was produced - if glob.glob(search): - file_path = str(glob.glob(search)[0]) - file_relpath = os.path.relpath(file_path, self.prj.metadata.output_dir) - search = os.path.join(self.prj.metadata.output_dir, '{}'.format(result_img)) - - # Add as a figure if thumbnail exists - if glob.glob(search): - img_path = str(glob.glob(search)[0]) - img_relpath = os.path.relpath(img_path, self.prj.metadata.output_dir) - figures.append([file_relpath, caption, img_relpath]) + if glob.glob(result_file): + file_path = str(glob.glob(result_file)[0]) + file_relpath = \ + os.path.relpath(file_path, self._outdir) + if result_type == "image": + # Add as a figure, find thumbnail + search = os.path.join(self._outdir, result_img) + if glob.glob(search): + img_path = str(glob.glob(search)[0]) + img_relpath = \ + os.path.relpath(img_path, self._outdir) + figures.append( + [file_relpath, title, img_relpath]) # add as a link otherwise + # TODO: add more fine-grained type support? + # not just image and link else: - links.append([caption, file_relpath]) - + links.append([title, file_relpath]) else: - warnings.append("{} ({})".format(caption, result_file)) - else: - _LOGGER.debug("No custom summarizers were found for this pipeline. Proceeded with default only.") - if warnings: - _LOGGER.warning("Summarizer was unable to find: " + ', '.join(str(x) for x in warnings)) - + warnings.append("{} ({})".format(title, + result_file)) + else: + _LOGGER.debug("No project-level outputs defined in " + "schema: {}".format(schema_paths)) + if warnings: + _LOGGER.warning("Not found: {}". 
+ format([str(x) for x in warnings])) + _LOGGER.debug("collected project-level figures: {}".format(figures)) + _LOGGER.debug("collected project-level links: {}".format(links)) template_vars = dict(figures=figures, links=links) - return render_jinja_template("project_object.html", self.j_env, template_vars) + return render_jinja_template("project_object.html", self.j_env, + template_vars) def create_index_html(self, objs, stats, col_names, navbar, footer, navbar_reports=None): """ @@ -432,7 +462,7 @@ def create_index_html(self, objs, stats, col_names, navbar, footer, navbar_repor :param pandas.DataFrame objs: project level dataframe containing any reported objects for all samples - :param list stats[dict]: a summary file of pipeline statistics for each + :param list[dict] stats: a summary file of pipeline statistics for each analyzed sample :param list col_names: all unique column names used in the stats file :param str navbar: HTML to be included as the navbar in the main summary page @@ -441,6 +471,7 @@ def create_index_html(self, objs, stats, col_names, navbar, footer, navbar_repor """ # set default encoding when running in python2 if sys.version[0] == '2': + from importlib import reload reload(sys) sys.setdefaultencoding("utf-8") _LOGGER.debug("Building index page...") @@ -452,14 +483,14 @@ def create_index_html(self, objs, stats, col_names, navbar, footer, navbar_repor if not objs.dropna().empty: objs.drop_duplicates(keep='last', inplace=True) # Generate parent index.html page path - index_html_path = get_index_html_path(self.prj) + index_html_path = get_file_for_project(self.prj, "summary.html") # Add stats_summary.tsv button link - stats_file_name = os.path.join(self.prj.metadata.output_dir, self.prj.name) + stats_file_name = os.path.join(self._outdir, self.prj.name) if hasattr(self.prj, "subproject") and self.prj.subproject: stats_file_name += '_' + self.prj.subproject stats_file_name += '_stats_summary.tsv' - stats_file_path = 
os.path.relpath(stats_file_name, self.prj.metadata.output_dir) + stats_file_path = os.path.relpath(stats_file_name, self._outdir) # Add stats summary table to index page and produce individual # sample pages if os.path.isfile(stats_file_name): @@ -510,30 +541,6 @@ def create_index_html(self, objs, stats, col_names, navbar, footer, navbar_repor return index_html_path -def get_reports_dir(prj): - """ - Get the reports directory path depending on the subproject activation status - - :param looper.Project prj: the project to determine the reports directory for - :return str: path to the reports directory - """ - rep_dir_name = "reports" if prj.subproject is None else "reports_" + prj.subproject - return os.path.join(prj.metadata.output_dir, rep_dir_name) - - -def get_index_html_path(prj): - """ - Get the index HTML path depending on the subproject activation status - - :param looper.Project prj: the project to determine the index HTML path for - :return str: path to the index HTML - """ - index_html_root = os.path.join(prj.metadata.output_dir, prj.name) - if prj.subproject is not None: - index_html_root += "_" + prj.subproject - return index_html_root + "_summary.html" - - def render_jinja_template(name, jinja_env, args=dict()): """ Render template in the specified jinja environment using the provided args @@ -747,13 +754,14 @@ def uniqify(seq): def create_status_table(prj, final=True): """ Creates status table, the core of the status page. - It is abstracted into a function so that it can be used in other software packages. - It can produce a table of two types. With links to the samples/log files and without. - The one without can be used to render HTMLs for on-th-fly job status inspection + It is abstracted into a function so that it can be used in other software + packages. It can produce a table of two types. With links to the + samples/log files and without. The one without can be used to render HTMLs + for on-th-fly job status inspection. 
:param looper.Project prj: project to create the status table for - :param bool final: if the status table is created for a finalized looper run. In such a case, - links to samples and log files will be provided + :param bool final: if the status table is created for a finalized looper + run. In such a case, links to samples and log files will be provided :return str: rendered status HTML file """ status_warning = False @@ -794,23 +802,29 @@ def create_status_table(prj, final=True): row_classes.append(button_class) # get first column data (sample name/link) page_name = sample_name + ".html" - page_path = os.path.join(get_reports_dir(prj), page_name.replace(' ', '_').lower()) - page_relpath = os.path.relpath(page_path, get_reports_dir(prj)) + page_path = os.path.join(get_file_for_project(prj, "reports"), + page_name.replace(' ', '_').lower()) + page_relpath = os.path.relpath(page_path, + get_file_for_project(prj, "reports")) sample_paths.append(page_relpath) sample_link_names.append(sample_name) # get second column data (status/flag) flags.append(flag) # get third column data (log file/link) - log_name = _match_file_for_sample(sample_name, "log.md", prj.results_folder) - log_file_link = _get_relpath_to_file(log_name, sample_name, prj.results_folder, - get_reports_dir(prj)) + log_name = _match_file_for_sample(sample_name, "log.md", + prj.results_folder) + log_file_link = \ + _get_relpath_to_file(log_name, sample_name, prj.results_folder, + get_file_for_project(prj, "reports")) log_link_names.append(log_name) log_paths.append(log_file_link) # get fourth column data (runtime) and fifth column data (memory) - profile_file_path = _match_file_for_sample(sample.sample_name, 'profile.tsv', prj.results_folder, - full_path=True) + profile_file_path = \ + _match_file_for_sample(sample.sample_name, 'profile.tsv', + prj.results_folder, full_path=True) if os.path.exists(profile_file_path): - df = _pd.read_csv(profile_file_path, sep="\t", comment="#", names=PROFILE_COLNAMES) + df 
= _pd.read_csv(profile_file_path, sep="\t", comment="#", + names=PROFILE_COLNAMES) df['runtime'] = _pd.to_timedelta(df['runtime']) times.append(_get_runtime(df)) mems.append(_get_maxmem(df)) @@ -824,24 +838,21 @@ def create_status_table(prj, final=True): # Alert the user to any warnings generated if status_warning: - warn("The stats table is incomplete, likely because " + - "one or more jobs either failed or is still running.") + _LOGGER.warning("The stats table is incomplete, likely because one or " + "more jobs either failed or is still running.") if sample_warning: - if len(sample_warning) == 1: - warn("{} is not present in {}".format( - ''.join(str(sample) for sample in sample_warning), - prj.results_folder)) - else: - warn_msg = "The following samples are not present in {}: {}" - warn(warn_msg.format( - prj.results_folder, - ' '.join(str(sample) for sample in sample_warning))) - template_vars = dict(sample_link_names=sample_link_names, row_classes=row_classes, flags=flags, times=times, + _LOGGER.warning("{} samples not present in {}: {}".format( + len(sample_warning), prj.results_folder, + str([sample for sample in sample_warning]))) + template_vars = dict(sample_link_names=sample_link_names, + row_classes=row_classes, flags=flags, times=times, mems=mems) template_name = "status_table_no_links.html" if final: template_name = "status_table.html" - template_vars.update(dict(sample_paths=sample_paths, log_link_names=log_link_names, log_paths=log_paths)) + template_vars.update(dict(sample_paths=sample_paths, + log_link_names=log_link_names, + log_paths=log_paths)) return render_jinja_template(template_name, get_jinja_env(), template_vars) diff --git a/looper/jinja_templates/index.html b/looper/jinja_templates/index.html index 64db99fad..38d809770 100644 --- a/looper/jinja_templates/index.html +++ b/looper/jinja_templates/index.html @@ -261,7 +261,7 @@

Looper {{ project_name }} summary
-
+

Plot a column

diff --git a/looper/looper.py b/looper/looper.py index e4e0278a5..69021920a 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -5,7 +5,6 @@ import abc import csv -from collections import defaultdict import glob import logging import os @@ -16,35 +15,34 @@ else: from collections.abc import Mapping import yaml +import pandas as _pd +from collections import defaultdict # Need specific sequence of actions for colorama imports? from colorama import init init() from colorama import Fore, Style from shutil import rmtree -import pandas as _pd +from jsonschema import ValidationError +from copy import copy -from . import FLAGS, GENERIC_PROTOCOL_KEY, LOGGING_LEVEL, __version__, \ - build_parser, _LEVEL_BY_VERBOSITY +from . import __version__, build_parser, _LEVEL_BY_VERBOSITY from .conductor import SubmissionConductor from .const import * -from .exceptions import JobSubmissionException -from .html_reports import HTMLReportBuilder, get_index_html_path, get_reports_dir -from .pipeline_interface import RESOURCES_KEY -from .project import Project -from .utils import determine_config_path, fetch_flag_files, sample_folder +from .exceptions import JobSubmissionException, MisconfigurationException +from .html_reports import HTMLReportBuilder +from .project import Project, ProjectContext +from .utils import * +from .looper_config import * -from divvy import DEFAULT_COMPUTE_RESOURCES_NAME, NEW_COMPUTE_KEY as COMPUTE_KEY +from divvy import DEFAULT_COMPUTE_RESOURCES_NAME, select_divvy_config from logmuse import init_logger -from peppy import ProjectContext, METADATA_KEY, SAMPLE_EXECUTION_TOGGLE +from peppy.const import * +from eido import validate_sample, validate_config, inspect_project +from ubiquerg.cli_tools import query_yes_no +from ubiquerg.collection import uniqify -from ubiquerg import query_yes_no -SUBMISSION_FAILURE_MESSAGE = "Cluster resource failure" - - -_FAIL_DISPLAY_PROPORTION_THRESHOLD = 0.5 -_MAX_FAIL_SAMPLE_DISPLAY = 20 _PKGNAME = "looper" _LOGGER = 
logging.getLogger(_PKGNAME) @@ -112,8 +110,9 @@ def __call__(self, flags=None, all_folders=False, max_file_count=30): for flag in flags: try: files = files_by_flag[flag] - except: - # No files for flag. + except Exception as e: + _LOGGER.debug("No files for {} flag. Caught exception: {}". + format(flags, getattr(e, 'message', repr(e)))) continue # If checking on a specific flag, do not limit the number of # reported filepaths, but do not report empty file lists @@ -139,13 +138,14 @@ def __call__(self, args, preview_flag=True): :param argparse.Namespace args: command-line options and arguments :param bool preview_flag: whether to halt before actually removing files """ - _LOGGER.info("Files to clean:") - for sample in self.prj.samples: - _LOGGER.info(self.counter.show(sample.sample_name, sample.protocol)) + _LOGGER.info(self.counter.show(sample.sample_name)) sample_output_folder = sample_folder(self.prj, sample) cleanup_files = glob.glob(os.path.join(sample_output_folder, "*_cleanup.sh")) + if not cleanup_files: + _LOGGER.info("Nothing to clean.") + continue if preview_flag: # Preview: Don't actually clean, just show what will be cleaned. _LOGGER.info("Files to clean: %s", ", ".join(cleanup_files)) @@ -153,22 +153,18 @@ def __call__(self, args, preview_flag=True): for f in cleanup_files: _LOGGER.info(f) subprocess.call(["sh", f]) - if not preview_flag: _LOGGER.info("Clean complete.") return 0 - if args.dry_run: _LOGGER.info("Dry run. 
No files cleaned.") return 0 - - if not args.force_yes and not query_yes_no("Are you sure you want to permanently delete all " - "intermediate pipeline results for this project?"): + if not args.force_yes and not \ + query_yes_no("Are you sure you want to permanently delete all " + "intermediate pipeline results for this project?"): _LOGGER.info("Clean action aborted by user.") return 1 - self.counter.reset() - return self(args, preview_flag=False) @@ -185,7 +181,7 @@ def __call__(self, args, preview_flag=True): _LOGGER.info("Removing results:") for sample in self.prj.samples: - _LOGGER.info(self.counter.show(sample.sample_name, sample.protocol)) + _LOGGER.info(self.counter.show(sample.sample_name)) sample_output_folder = sample_folder(self.prj, sample) if preview_flag: # Preview: Don't actually delete, just show files. @@ -204,8 +200,9 @@ def __call__(self, args, preview_flag=True): _LOGGER.info("Dry run. No files destroyed.") return 0 - if not args.force_yes and not query_yes_no("Are you sure you want to permanently delete all pipeline " - "results for this project?"): + if not args.force_yes and not query_yes_no( + "Are you sure you want to permanently delete all pipeline " + "results for this project?"): _LOGGER.info("Destroy action aborted by user.") return 1 @@ -215,72 +212,68 @@ def __call__(self, args, preview_flag=True): return self(args, preview_flag=False) -def process_protocols(prj, protocols, resource_setting_kwargs=None, **kwargs): - """ - Create submission conductors and collect by protocol the mapped pipelines. 
- - :param looper.Project prj: project definition - :param Iterable[str] protocols: names of protocols mapped to pipelines - for which conductors are to be created - :param Mapping resource_setting_kwargs: key-value pairs collection storing - specific compute resource settings - :return Mapping[str, looper.conductor.SubmissionConductor], Mapping[str, list[str]]: - mapping from pipeline key to submission conductor, and mapping from - protocol name to collection of keys for pipelines for that protocol - :raise TypeError: if the project's computing configuration instance isn't - a mapping - """ - # Job submissions are managed on a per-pipeline basis so that - # individual commands (samples) may be lumped into a single job. - submission_conductors = {} - pipe_keys_by_protocol = defaultdict(list) - - if resource_setting_kwargs: - if not isinstance(resource_setting_kwargs, Mapping): - raise TypeError( - "Resource settings argument must be mapping; got {} ({})". - format(resource_setting_kwargs, type(resource_setting_kwargs))) - else: - resource_setting_kwargs = {} +class Collator(Executor): + """" Submitter for project-level pipelines """ + def __init__(self, prj): + """ + Initializes an instance - try: - comp_vars = prj.dcc[COMPUTE_KEY].to_map() - except AttributeError: - if not isinstance(prj.dcc[COMPUTE_KEY], Mapping): - raise TypeError("Project's computing config isn't a mapping: {} ({})". 
- format(prj.dcc[COMPUTE_KEY], type(prj.dcc[COMPUTE_KEY]))) - from copy import deepcopy - comp_vars = deepcopy(prj.dcc[COMPUTE_KEY]) - comp_vars.update(resource_setting_kwargs or {}) - - _LOGGER.info("Known protocols: {}".format( - ", ".join(prj.interfaces.protocols))) - - for proto in set(protocols) | {GENERIC_PROTOCOL_KEY}: - _LOGGER.debug("Determining sample type, script, and flags for " - "pipeline(s) associated with protocol: %s", proto) - submission_bundles = prj.build_submission_bundles(proto) - if not submission_bundles: - if proto != GENERIC_PROTOCOL_KEY: + :param Project prj: Project with which to work/operate on + """ + super(Executor, self).__init__() + self.prj = prj + + def __call__(self, args, **compute_kwargs): + """ + Matches collators by protocols, creates submission scripts + and submits them + + :param argparse.Namespace args: parsed command-line options and + arguments, recognized by looper + """ + jobs = 0 + project_pifaces = self.prj.project_pipeline_interface_sources + if not project_pifaces: + raise MisconfigurationException( + "Looper requires a pointer to at least one project pipeline. " + "Please refer to the documentation on linking project to a " + "pipeline: " + "http://looper.databio.org/en/latest/defining-a-project") + self.counter = LooperCounter(len(project_pifaces)) + for project_piface in project_pifaces: + try: + project_piface_object = \ + PipelineInterface(project_piface, pipeline_type="project") + except (IOError, ValidationError) as e: _LOGGER.warning( - "No valid pipelines for protocol '{}'".format(proto)) - continue - for pl_iface, sample_subtype, pl_key, script_with_flags in \ - submission_bundles: - _LOGGER.debug("%s: %s", pl_key, sample_subtype.__name__) + "Ignoring invalid pipeline interface source: {}. " + "Caught exception: {}". 
+ format(project_piface, getattr(e, 'message', repr(e)))) + continue + _LOGGER.info(self.counter.show( + name=self.prj.name, type="project", + pipeline_name=project_piface_object.pipeline_name)) conductor = SubmissionConductor( - pl_key, pl_iface, script_with_flags, prj, - sample_subtype=sample_subtype, - compute_variables=comp_vars, **kwargs) - submission_conductors[pl_key] = conductor - pipe_keys_by_protocol[proto].append(pl_key) - return submission_conductors, pipe_keys_by_protocol + pipeline_interface=project_piface_object, + prj=self.prj, + compute_variables=compute_kwargs, + delay=args.time_delay, + extra_args=args.command_extra, + extra_args_override=args.command_extra_override, + ignore_flags=args.ignore_flags, + collate=True + ) + conductor._pool = [None] + conductor.submit() + jobs += conductor.num_job_submissions + _LOGGER.info("\nLooper finished") + _LOGGER.info("Jobs submitted: {}".format(jobs)) class Runner(Executor): """ The true submitter of pipelines """ - def __call__(self, args, remaining_args, rerun=False, **compute_kwargs): + def __call__(self, args, rerun=False, **compute_kwargs): """ Do the Sample submission. @@ -291,147 +284,106 @@ def __call__(self, args, remaining_args, rerun=False, **compute_kwargs): :param bool rerun: whether the given sample is being rerun rather than run for the first time """ - - if not self.prj.interfaces: - pipe_locs = getattr(self.prj[METADATA_KEY], PIPELINE_INTERFACES_KEY, []) - # TODO: should these cases be handled as equally exceptional? - # That is, should they either both raise errors, or both log errors? - if len(pipe_locs) == 0: - raise AttributeError( - "Looper requires at least one pointer to pipeline(s), set " - "with the pipeline_interfaces key in the metadata section " - "of a project config file") - else: - _LOGGER.error("No protocols found; does the PEP point to at " - "least one pipeline interface that exists? " - " Pipeline interfaces: {}". 
- format(", ".format(pipe_locs))) - return - - protocols = {s.protocol for s in self.prj.samples - if hasattr(s, "protocol")} + max_cmds = sum(list(map(len, self.prj._samples_by_interface.values()))) + self.counter.total = max_cmds failures = defaultdict(list) # Collect problems by sample. processed_samples = set() # Enforce one-time processing. - - _LOGGER.info("Finding pipelines for protocol(s): {}". - format(", ".join(self.prj.protocols))) - - submission_conductors, pipe_keys_by_protocol = process_protocols( - self.prj, protocols, compute_kwargs, dry_run=args.dry_run, - delay=args.time_delay, extra_args=remaining_args, - ignore_flags=args.ignore_flags, - max_cmds=args.lumpn, max_size=args.lump) - mapped_protos = set(pipe_keys_by_protocol.keys()) + submission_conductors = {} + try: + comp_vars = self.prj.dcc[COMPUTE_KEY].to_map() + except AttributeError: + if not isinstance(self.prj.dcc[COMPUTE_KEY], Mapping): + raise TypeError("Project's computing config isn't a mapping: {}" + " ({})".format(self.prj.dcc[COMPUTE_KEY], + type(self.prj.dcc[COMPUTE_KEY]))) + from copy import deepcopy + comp_vars = deepcopy(self.prj.dcc[COMPUTE_KEY]) + comp_vars.update(compute_kwargs or {}) # Determine number of samples eligible for processing. num_samples = len(self.prj.samples) if args.limit is None: upper_sample_bound = num_samples elif args.limit < 0: - raise ValueError( - "Invalid number of samples to run: {}".format(args.limit)) + raise ValueError("Invalid number of samples to run: {}".format(args.limit)) else: upper_sample_bound = min(args.limit, num_samples) - _LOGGER.debug("Limiting to %d of %d samples", - upper_sample_bound, num_samples) + _LOGGER.debug("Limiting to {} of {} samples". 
+ format(upper_sample_bound, num_samples)) num_commands_possible = 0 failed_submission_scripts = [] - for sample in self.prj.samples[:upper_sample_bound]: - # First, step through the samples and determine whether any - # should be skipped entirely, based on sample attributes alone - # and independent of anything about any of its pipelines. - - # Start by displaying the sample index and a fresh collection - # of sample-skipping reasons. - _LOGGER.info(self.counter.show( - sample.sample_name, sample.protocol)) - skip_reasons = [] + # config validation (samples excluded) against all schemas defined + # for every pipeline matched for this project + [validate_config(self.prj, schema_file, True) + for schema_file in self.prj.get_schemas(self.prj.pipeline_interfaces)] - # Don't submit samples with duplicate names unless suppressed. - if sample.sample_name in processed_samples: - if args.allow_duplicate_names: - _LOGGER.warning("Duplicate name detected, but submitting anyway") - else: - skip_reasons.append("Duplicate sample name") - - # Check if sample should be run. - if sample.is_dormant(): - skip_reasons.append( - "Inactive status (via '{}' column/attribute)". - format(SAMPLE_EXECUTION_TOGGLE)) + for piface in self.prj.pipeline_interfaces: + conductor = SubmissionConductor( + pipeline_interface=piface, + prj=self.prj, + compute_variables=comp_vars, + delay=args.time_delay, + extra_args=args.command_extra, + extra_args_override=args.command_extra_override, + ignore_flags=args.ignore_flags, + max_cmds=args.lumpn, + max_size=args.lump + ) + submission_conductors[piface.pipe_iface_file] = conductor - # Get the base protocol-to-pipeline mappings. 
- try: - protocol = sample.protocol - except AttributeError: - skip_reasons.append("Sample has no protocol") - else: - if protocol not in mapped_protos and \ - GENERIC_PROTOCOL_KEY not in mapped_protos: - skip_reasons.append("No pipeline for protocol") + for sample in self.prj.samples[:upper_sample_bound]: + pl_fails = [] + skip_reasons = [] + sample_pifaces = self.prj.get_sample_piface(sample[SAMPLE_NAME_ATTR]) + if not sample_pifaces: + skip_reasons.append("No pipeline interfaces defined") if skip_reasons: - _LOGGER.warning( - "> Not submitted: {}".format(", ".join(skip_reasons))) - failures[sample.name] = skip_reasons + _LOGGER.warning(NOT_SUB_MSG.format(", ".join(skip_reasons))) + failures[sample.sample_name] = skip_reasons continue - # Processing preconditions have been met. - # Add this sample to the processed collection. - processed_samples.add(sample.sample_name) + # single sample validation against a single schema + # (from sample's piface) + [validate_sample(self.prj, sample.sample_name, schema_file, True) + for schema_file in self.prj.get_schemas(sample_pifaces)] - # At this point, we have a generic Sample; write that to disk - # for reuse in case of many jobs (pipelines) using base Sample. - # Do a single overwrite here, then any subsequent Sample can be sure - # that the file is fresh, with respect to this run of looper. - sample.to_yaml(subs_folder_path=self.prj.submission_folder) + processed_samples.add(sample[SAMPLE_NAME_ATTR]) - pipe_keys = pipe_keys_by_protocol.get(sample.protocol) \ - or pipe_keys_by_protocol.get(GENERIC_PROTOCOL_KEY) - _LOGGER.debug("Considering {} pipeline(s): {}". - format(len(pipe_keys), ", ".join(pipe_keys))) - - pl_fails = [] - for pl_key in pipe_keys: + for sample_piface in sample_pifaces: + _LOGGER.info( + self.counter.show(name=sample.sample_name, + pipeline_name=sample_piface.pipeline_name) + ) num_commands_possible += 1 - # TODO: of interest to track failures by pipeline? 
- conductor = submission_conductors[pl_key] - # TODO: check return value from add() to determine whether - # TODO (cont.) to grow the failures list. + cndtr = submission_conductors[sample_piface.pipe_iface_file] try: - curr_pl_fails = conductor.add_sample(sample, rerun=rerun) + curr_pl_fails = cndtr.add_sample(sample, rerun=rerun) except JobSubmissionException as e: failed_submission_scripts.append(e.script) else: pl_fails.extend(curr_pl_fails) if pl_fails: - failures[sample.name].extend(pl_fails) + failures[sample.sample_name].extend(pl_fails) job_sub_total = 0 cmd_sub_total = 0 - for conductor in submission_conductors.values(): + + for piface, conductor in submission_conductors.items(): conductor.submit(force=True) job_sub_total += conductor.num_job_submissions cmd_sub_total += conductor.num_cmd_submissions - skipped_sample_scripts = conductor.write_skipped_sample_scripts() - if skipped_sample_scripts: - _LOGGER.debug( - "{} script(s) for skipped samples:\n{}". - format(len(skipped_sample_scripts), - "\n".join(skipped_sample_scripts))) + conductor.write_skipped_sample_scripts() # Report what went down. - max_samples = min(len(self.prj.samples), args.limit or float("inf")) _LOGGER.info("\nLooper finished") - _LOGGER.info("Samples valid for job generation: %d of %d", - len(processed_samples), max_samples) - _LOGGER.info("Successful samples: %d of %d", - max_samples - len(failures), max_samples) - _LOGGER.info("Commands submitted: %d of %d", - cmd_sub_total, num_commands_possible) - _LOGGER.info("Jobs submitted: %d", job_sub_total) + _LOGGER.info("Samples valid for job generation: {} of {}". + format(len(processed_samples), num_samples)) + _LOGGER.info("Commands submitted: {} of {}". + format(cmd_sub_total, max_cmds)) + _LOGGER.info("Jobs submitted: {}".format(job_sub_total)) if args.dry_run: _LOGGER.info("Dry run. 
No jobs were actually submitted.") @@ -442,20 +394,18 @@ def __call__(self, args, remaining_args, rerun=False, **compute_kwargs): for f in failures: samples_by_reason[f].add(sample) # Collect samples by pipeline with submission failure. - failed_samples_by_pipeline = defaultdict(set) - for pl_key, conductor in submission_conductors.items(): + for piface, conductor in submission_conductors.items(): # Don't add failure key if there are no samples that failed for # that reason. if conductor.failed_samples: fails = set(conductor.failed_samples) samples_by_reason[SUBMISSION_FAILURE_MESSAGE] |= fails - failed_samples_by_pipeline[pl_key] |= fails failed_sub_samples = samples_by_reason.get(SUBMISSION_FAILURE_MESSAGE) if failed_sub_samples: - _LOGGER.info("\n{} samples with at least one failed job submission: {}". - format(len(failed_sub_samples), - ", ".join(failed_sub_samples))) + _LOGGER.info("\n{} samples with at least one failed job submission:" + " {}".format(len(failed_sub_samples), + ", ".join(failed_sub_samples))) # If failure keys are only added when there's at least one sample that # failed for that reason, we can display information conditionally, @@ -463,66 +413,47 @@ def __call__(self, args, remaining_args, rerun=False, **compute_kwargs): if samples_by_reason: _LOGGER.info("\n{} unique reasons for submission failure: {}".format( len(samples_by_reason), ", ".join(samples_by_reason.keys()))) - full_fail_msgs = [create_failure_message(reason, samples) + full_fail_msgs = [_create_failure_message(reason, samples) for reason, samples in samples_by_reason.items()] _LOGGER.info("\nSummary of failures:\n{}". format("\n".join(full_fail_msgs))) -class Summarizer(Executor): - """ Project/Sample output summarizer """ - def __init__(self, prj): - # call the inherited initialization - super(Summarizer, self).__init__(prj) - # pull together all the fits and stats from each sample into project-combined spreadsheets. 
- self.stats, self.columns = _create_stats_summary(self.prj, self.counter) - self.objs = _create_obj_summary(self.prj, self.counter) - - def __call__(self): - """ Do the summarization. """ - run_custom_summarizers(self.prj) +class Report(Executor): + """ Combine project outputs into a browsable HTML report """ + def __call__(self, args): # initialize the report builder report_builder = HTMLReportBuilder(self.prj) + + # Do the stats and object summarization. + table = Table(self.prj)() # run the report builder. a set of HTML pages is produced - report_path = report_builder(self.objs, self.stats, uniqify(self.columns)) - _LOGGER.info("HTML Report (n=" + str(len(self.stats)) + "): " + report_path) + report_path = report_builder(table.objs, table.stats, + uniqify(table.columns)) + _LOGGER.info("HTML Report (n=" + str(len(table.stats)) + "): " + + report_path) -def run_custom_summarizers(project): - """ - Run custom summarizers if any are defined - :param looper.Project project: the project to be summarized - """ - summarizers_to_run = set() - pipelines = [] - all_protocols = [sample.protocol for sample in project.samples] - for protocol in set(all_protocols): - try: - ifaces = project.get_interfaces(protocol) - except KeyError: - _LOGGER.warning("No interface for protocol '{}', skipping summary".format(protocol)) - continue - for iface in ifaces: - pl = iface.fetch_pipelines(protocol) - pipelines.append(pl) - if pipelines is not None: - for pl in set(pipelines): - pl_summarizers = iface.get_attribute(pl, "summarizers") - if pl_summarizers is not None: - for summarizer in pl_summarizers: - if not os.path.isabs(summarizer): - summarizer = os.path.join(os.path.dirname(iface.pipe_iface_file), summarizer) - try: - _LOGGER.debug("Running custom summarizer: {}".format(summarizer)) - subprocess.call([summarizer, project.config_file]) - except OSError: - _LOGGER.warning("Summarizer was unable to run: " + str(summarizer)) +class Table(Executor): + """ Project/Sample statistics 
and table output generator """ + def __init__(self, prj): + # call the inherited initialization + super(Table, self).__init__(prj) + self.prj = prj + + def __call__(self): + # pull together all the fits and stats from each sample into + # project-combined spreadsheets. + self.stats, self.columns = _create_stats_summary(self.prj, self.counter) + self.objs = _create_obj_summary(self.prj, self.counter) + return self def _create_stats_summary(project, counter): """ - Create stats spreadsheet and columns to be considered in the report, save the spreadsheet to file + Create stats spreadsheet and columns to be considered in the report, save + the spreadsheet to file :param looper.Project project: the project to be summarized :param looper.LooperCounter counter: a counter object @@ -531,7 +462,7 @@ def _create_stats_summary(project, counter): columns = [] stats = [] project_samples = project.samples - missing_files = 0 + missing_files = [] _LOGGER.info("Creating stats summary...") for sample in project_samples: _LOGGER.info(counter.show(sample.sample_name, sample.protocol)) @@ -543,9 +474,10 @@ def _create_stats_summary(project, counter): # Version 0.3 standardized all stats into a single file stats_file = os.path.join(sample_output_folder, "stats.tsv") if not os.path.isfile(stats_file): - missing_files += 1 + missing_files.append(stats_file) continue - t = _pd.read_csv(stats_file, sep="\t", header=None, names=['key', 'value', 'pl']) + t = _pd.read_csv(stats_file, sep="\t", header=None, + names=['key', 'value', 'pl']) t.drop_duplicates(subset=['key', 'pl'], keep='last', inplace=True) t.loc[:, 'plkey'] = t['pl'] + ":" + t['key'] dupes = t.duplicated(subset=['key'], keep=False) @@ -553,16 +485,19 @@ def _create_stats_summary(project, counter): sample_stats.update(t.set_index('key')['value'].to_dict()) stats.append(sample_stats) columns.extend(t.key.tolist()) + if missing_files: + _LOGGER.warning("Stats files missing for {} samples: {}". 
+ format(len(missing_files),missing_files)) tsv_outfile_path = get_file_for_project(project, 'stats_summary.tsv') - if missing_files > 0: - _LOGGER.warning("Stats files missing for {} samples".format(missing_files)) tsv_outfile = open(tsv_outfile_path, 'w') - tsv_writer = csv.DictWriter(tsv_outfile, fieldnames=uniqify(columns), delimiter='\t', extrasaction='ignore') + tsv_writer = csv.DictWriter(tsv_outfile, fieldnames=uniqify(columns), + delimiter='\t', extrasaction='ignore') tsv_writer.writeheader() for row in stats: tsv_writer.writerow(row) tsv_outfile.close() - _LOGGER.info("Summary (n=" + str(len(stats)) + "): " + tsv_outfile_path) + _LOGGER.info("Statistics summary (n=" + str(len(stats)) + "): " + + tsv_outfile_path) counter.reset() return stats, uniqify(columns) @@ -578,58 +513,33 @@ def _create_obj_summary(project, counter): _LOGGER.info("Creating objects summary...") objs = _pd.DataFrame() # Create objects summary file - missing_files = 0 + missing_files = [] for sample in project.samples: # Process any reported objects _LOGGER.info(counter.show(sample.sample_name, sample.protocol)) sample_output_folder = sample_folder(project, sample) objs_file = os.path.join(sample_output_folder, "objects.tsv") if not os.path.isfile(objs_file): - missing_files += 1 + missing_files.append(objs_file) continue t = _pd.read_csv(objs_file, sep="\t", header=None, - names=['key', 'filename', 'anchor_text', 'anchor_image', 'annotation']) - t['sample_name'] = sample.name + names=['key', 'filename', 'anchor_text', + 'anchor_image', 'annotation']) + t['sample_name'] = sample.sample_name objs = objs.append(t, ignore_index=True) - if missing_files > 0: - _LOGGER.warning("Object files missing for {} samples".format(missing_files)) + if missing_files: + _LOGGER.warning("Object files missing for {} samples: {}". 
+ format(len(missing_files), missing_files)) # create the path to save the objects file in - objs.to_csv(get_file_for_project(project, 'objs_summary.tsv'), sep="\t") + objs_file = get_file_for_project(project, 'objs_summary.tsv') + objs.to_csv(objs_file, sep="\t") + _LOGGER.info("Objects summary (n=" + + str(len(project.samples) - len(missing_files)) + "): " + + objs_file) return objs -def get_file_for_project(prj, appendix): - """ - Create a path to the file for the current project. Takes the possibility of subproject being activated at the time - :param looper.Project prj: project object - :param str appendix: the appendix of the file to create the path for, like 'objs_summary.tsv' for objects summary file - :return str: path to the file - """ - fp = os.path.join(prj.metadata.output_dir, prj.name) - if hasattr(prj, "subproject") and prj.subproject: - fp += '_' + prj.subproject - fp += '_' + appendix - return fp - - -def aggregate_exec_skip_reasons(skip_reasons_sample_pairs): - """ - Collect the reasons for skipping submission/execution of each sample - - :param Iterable[(Iterable[str], str)] skip_reasons_sample_pairs: pairs of - collection of reasons for which a sample was skipped for submission, - and the name of the sample itself - :return Mapping[str, Iterable[str]]: mapping from explanation to - collection of names of samples to which it pertains - """ - samples_by_skip_reason = defaultdict(list) - for skip_reasons, sample in skip_reasons_sample_pairs: - for reason in set(skip_reasons): - samples_by_skip_reason[reason].append(sample) - return samples_by_skip_reason - - -def create_failure_message(reason, samples): +def _create_failure_message(reason, samples): """ Explain lack of submission for a single reason, 1 or more samples. 
""" color = Fore.LIGHTRED_EX reason_text = color + reason + Style.RESET_ALL @@ -639,10 +549,12 @@ def create_failure_message(reason, samples): def _remove_or_dry_run(paths, dry_run=False): """ - Remove file or directory or just inform what would be removed in case of dry run + Remove file or directory or just inform what would be removed in + case of dry run :param list|str paths: list of paths to files/dirs to be removed - :param bool dry_run: logical indicating whether the files should remain untouched and massage printed + :param bool dry_run: logical indicating whether the files should remain + untouched and massage printed """ paths = paths if isinstance(paths, list) else [paths] for path in paths: @@ -654,7 +566,7 @@ def _remove_or_dry_run(paths, dry_run=False): if os.path.isfile(path): os.remove(path) else: - rmtree(path) + rmtree(path, ignore_errors=True) else: _LOGGER.info(path + " does not exist.") @@ -663,18 +575,10 @@ def destroy_summary(prj, dry_run=False): """ Delete the summary files if not in dry run mode """ - _remove_or_dry_run([get_index_html_path(prj), get_file_for_project(prj, 'stats_summary.tsv'), - get_file_for_project(prj, 'objs_summary.tsv'), get_reports_dir(prj)], dry_run) - - -def uniqify(seq): - """ - Fast way to uniqify while preserving input order. - """ - # http://stackoverflow.com/questions/480214/ - seen = set() - seen_add = seen.add - return [x for x in seq if not (x in seen or seen_add(x))] + _remove_or_dry_run([get_file_for_project(prj, "summary.html"), + get_file_for_project(prj, 'stats_summary.tsv'), + get_file_for_project(prj, 'objs_summary.tsv'), + get_file_for_project(prj, "reports")], dry_run) class LooperCounter(object): @@ -689,7 +593,7 @@ def __init__(self, total): self.count = 0 self.total = total - def show(self, name, protocol): + def show(self, name, type="sample", pipeline_name=None): """ Display sample counts status for a particular protocol type. 
@@ -697,13 +601,14 @@ def show(self, name, protocol): and as a side-effect of the call, the running count is incremented. :param str name: name of the sample - :param str protocol: name of the protocol + :param str pipeline_name: name of the pipeline :return str: message suitable for logging a status update """ self.count += 1 - return _submission_status_text( - curr=self.count, total=self.total, sample_name=name, - sample_protocol=protocol, color=Fore.CYAN) + return _submission_status_text(type=type, + curr=self.count, total=self.total, name=name, + pipeline_name=pipeline_name, color=Fore.CYAN + ) def reset(self): self.count = 0 @@ -712,52 +617,80 @@ def __str__(self): return "LooperCounter of size {}".format(self.total) -def _submission_status_text(curr, total, sample_name, sample_protocol, color): - return color + \ - "## [{n} of {N}] {sample} ({protocol})".format( - n=curr, N=total, sample=sample_name, protocol=sample_protocol) + \ - Style.RESET_ALL +def _submission_status_text(curr, total, name, pipeline_name=None, + type="sample", color=Fore.CYAN): + """ Generate submission sample text for run or collate """ + txt = color + "## [{n} of {t}] {type}: {name}".\ + format(n=curr, t=total, type=type, name=name) + if pipeline_name: + txt += "; pipeline: {}".format(pipeline_name) + return txt + Style.RESET_ALL -def _proc_resources_spec(spec): +def _proc_resources_spec(args): """ - Process CLI-specified itemized compute resource setting specification. + Process CLI-sources compute setting specification. 
There are two sources + of compute settings in the CLI alone: + * YAML file (--settings argument) + * itemized compute settings (--compute argument) + + The itemized compute specification is given priority - :param str | NoneType spec: itemized resource specification from CLI + :param argparse.Namespace: arguments namespace :return Mapping[str, str]: binding between resource setting name and value :raise ValueError: if interpretation of the given specification as encoding of key-value pairs fails """ + spec = getattr(args, "compute", None) + try: + settings_data = read_yaml_file(args.settings) or {} + except yaml.YAMLError: + _LOGGER.warning("Settings file ({}) does not follow YAML format," + " disregarding".format(args.settings)) + settings_data = {} if not spec: - return {} - kvs = spec.strip().split(",") - pairs = [(kv, kv.split("=")) for kv in kvs] - bads, data = [], {} + return settings_data + pairs = [(kv, kv.split("=")) for kv in spec] + bads = [] for orig, pair in pairs: try: k, v = pair except ValueError: bads.append(orig) else: - data[k] = v + settings_data[k] = v if bads: - raise ValueError("Could not completely parse itemized resource " - "specification; these failed as key-value pairs; " - "please check usage: {}".format(", ".join(bads))) - return data + raise ValueError( + "Could not correctly parse itemized compute specification. 
" + "Correct format: " + EXAMPLE_COMPUTE_SPEC_FMT) + return settings_data def main(): """ Primary workflow """ - - parser = build_parser() + global _LOGGER + parsers = build_parser() + parser = parsers[0] + aux_parser = parsers[1] + aux_parser.suppress_defaults() args, remaining_args = parser.parse_known_args() - - try: - conf_file = args.config_file - except AttributeError: + if args.command is None: parser.print_help(sys.stderr) sys.exit(1) + if args.config_file is None: + m = "No project config defined" + try: + setattr(args, "config_file", read_cfg_from_dotfile()) + except OSError: + print(m + " and dotfile does not exist: {}".format(dotfile_path())) + parser.print_help(sys.stderr) + sys.exit(1) + else: + print(m + ", using: {}. Read from dotfile ({}).". + format(read_cfg_from_dotfile(), dotfile_path())) + if args.command == "init": + sys.exit(int(not init_dotfile(dotfile_path(), args.config_file, args.force))) + args = enrich_args_via_cfg(args, aux_parser) # Set the logging level. if args.dbg: @@ -771,68 +704,80 @@ def main(): level = LOGGING_LEVEL # Establish the project-root logger and attach one for this module. - logger_kwargs = {"level": level, "logfile": args.logfile, "devmode": args.dbg} + logger_kwargs = {"level": level, + "logfile": args.logfile, + "devmode": args.dbg} init_logger(name="peppy", **logger_kwargs) - global _LOGGER + init_logger(name="divvy", **logger_kwargs) + init_logger(name="eido", **logger_kwargs) _LOGGER = init_logger(name=_PKGNAME, **logger_kwargs) + # lc = LooperConfig(select_looper_config(filename=args.looper_config)) + # _LOGGER.debug("Determined genome config: {}".format(lc)) + + _LOGGER.info("Looper version: {}\nCommand: {}". + format(__version__, args.command)) + if len(remaining_args) > 0: - _LOGGER.debug("Remaining arguments passed to pipelines: {}". + _LOGGER.warning("Unrecognized arguments: {}". format(" ".join([str(x) for x in remaining_args]))) - _LOGGER.info("Command: {} (Looper version: {})". 
- format(args.command, __version__)) - # Initialize project + divcfg = select_divvy_config(filepath=args.divvy) \ + if hasattr(args, "divvy") else None - # Although the value of args.env might be None, the actual env variable used will be DIVCFG or PEPENV - # (checked in this very order), which is implemented in divvy, the underlying package for - # computing environment configuration - if getattr(args, 'env', None) is None: - _LOGGER.debug("compute_env_file: DIVCFG or PEPENV") - else: - _LOGGER.debug("compute_env_file: " + str(getattr(args, 'env', None))) + # Initialize project _LOGGER.debug("Building Project") try: - prj = Project( - determine_config_path(conf_file), subproject=args.subproject, - file_checks=args.file_checks, compute_env_file=getattr(args, 'env', None)) + p = Project(config_file=args.config_file, + amendments=args.amend, + divcfg_path=divcfg, + runp=args.command == "runp", + **{attr: getattr(args, attr) for attr in CLI_PROJ_ATTRS if attr in args}) except yaml.parser.ParserError as e: _LOGGER.error("Project config parse failed -- {}".format(e)) sys.exit(1) - compute_cli_spec = getattr(args, COMPUTE_KEY, None) - if compute_cli_spec and compute_cli_spec != DEFAULT_COMPUTE_RESOURCES_NAME: - prj.dcc.activate_package(compute_cli_spec) + selected_compute_pkg = p.selected_compute_package \ + or DEFAULT_COMPUTE_RESOURCES_NAME + if p.dcc is not None and not p.dcc.activate_package(selected_compute_pkg): + _LOGGER.info("Failed to activate '{}' computing package. 
" + "Using the default one".format(selected_compute_pkg)) - _LOGGER.debug("Results subdir: " + prj.results_folder) - - with ProjectContext(prj, - selector_attribute=args.selector_attribute, - selector_include=args.selector_include, - selector_exclude=args.selector_exclude) as prj: + with ProjectContext(prj=p, + selector_attribute=args.sel_attr, + selector_include=args.sel_incl, + selector_exclude=args.sel_excl) as prj: if args.command in ["run", "rerun"]: run = Runner(prj) try: - compute_kwargs = _proc_resources_spec( - getattr(args, RESOURCES_KEY, "")) - run(args, remaining_args, - rerun=(args.command == "rerun"), **compute_kwargs) + compute_kwargs = _proc_resources_spec(args) + run(args, rerun=(args.command == "rerun"), **compute_kwargs) except IOError: - _LOGGER.error("{} pipeline_interfaces: '{}'".format( - prj.__class__.__name__, prj.metadata.pipeline_interfaces)) + _LOGGER.error("{} pipeline_interfaces: '{}'". + format(prj.__class__.__name__, + prj.pipeline_interface_sources)) raise + if args.command == "runp": + compute_kwargs = _proc_resources_spec(args) + collate = Collator(prj) + collate(args, **compute_kwargs) + if args.command == "destroy": return Destroyer(prj)(args) - if args.command == "summarize": - Summarizer(prj)() + if args.command == "table": + Table(prj)() + + if args.command == "report": + Report(prj)(args) if args.command == "check": - # TODO: hook in fixed samples once protocol differentiation is - # TODO (continued) figured out (related to #175). 
Checker(prj)(flags=args.flags) if args.command == "clean": return Cleaner(prj)(args) + + if args.command == "inspect": + inspect_project(p, args.snames, args.attr_limit) diff --git a/looper/looper_config.py b/looper/looper_config.py new file mode 100644 index 000000000..f55fa0bc1 --- /dev/null +++ b/looper/looper_config.py @@ -0,0 +1,65 @@ +""" Looper configuration file manager """ +import os +from yacman import YacAttMap, select_config +from logging import getLogger +from ubiquerg import expandpath, is_url + +from .const import * +from .pipeline_interface import PipelineInterface + +_LOGGER = getLogger(__name__) + + +class LooperConfig(YacAttMap): + def __init__(self, filepath=None, entries=None): + """ + + :param str filepath: + :param Mapping entries: + """ + super(LooperConfig, self).__init__(filepath=filepath, entries=entries) + + def get_pipeline_interface(self, protocol, raw=False): + """ + + :param str protocol: + :param bool raw: + :return PipelineInterface: pipeline interface object matched + by the specified protocol + """ + if PROTOMAP_KEY in self: + if protocol in self[PROTOMAP_KEY]: + return self[PROTOMAP_KEY][protocol] if raw else \ + PipelineInterface(config=self[PROTOMAP_KEY][protocol]) + return None + + def add_protocol_mapping(self, protocol, loc): + """ + + :param str protocol: protocol key + :param str loc: path to an existing pipeline interface file + """ + path = expandpath(loc) + if not os.path.exists(path): + if not is_url(loc): + _LOGGER.warning("Ignoring nonexistent pipeline interface " + "location: {}".format(loc)) + return + else: + if protocol in self[PROTOMAP_KEY]: + _LOGGER.info("Overwriting existing protocol mapping with: " + "{}:{}".format(protocol, loc)) + self[PROTOMAP_KEY].update({protocol: loc}) + + +def select_looper_config(filename=None, conf_env_vars=CFG_ENV_VARS, **kwargs): + """ + Get path to looper configuration file. 
+ + :param str filename: name/path of looper configuration file + :param Iterable[str] conf_env_vars: names of environment + variables to consider, a prioritized search list + :return str: path to looper configuration file + """ + return select_config(filename, conf_env_vars, **kwargs) + diff --git a/looper/parser_types.py b/looper/parser_types.py index b5741f6b8..3a2760114 100644 --- a/looper/parser_types.py +++ b/looper/parser_types.py @@ -27,9 +27,11 @@ def html_checkbox(caravel=False, checked=False): :return callable: argument to the type parameter of an argparse.ArgumentParser's add_argument method. """ - caravel_data = PathExAttMap({"element_type": "checkbox", "element_args": {}}) + caravel_data = \ + PathExAttMap({"element_type": "checkbox", "element_args": {}}) if checked: caravel_data.add_entries({"element_args": {"checked": True}}) + def fun(x=None, caravel_data=caravel_data, caravel=caravel): return caravel_data if caravel else eval(x) return fun @@ -47,9 +49,11 @@ def html_select(choices, caravel=False): """ if not isinstance(choices, list): raise TypeError( - "Argument to choices parameter must be list, got {}.".format(type(choices))) + "Argument to choices parameter must be list, got {}.". + format(type(choices))) caravel_data = PathExAttMap( {"element_type": "select", "element_args": {"option": choices}}) + def fun(x=None, caravel_data=caravel_data, caravel=caravel): return caravel_data if caravel else x return fun diff --git a/looper/pipeline_interface.py b/looper/pipeline_interface.py index c68627e28..5b09b21c5 100644 --- a/looper/pipeline_interface.py +++ b/looper/pipeline_interface.py @@ -1,36 +1,25 @@ """ Model the connection between a pipeline and a project or executor. 
""" -from collections import Iterable, Mapping, OrderedDict -import inspect -import logging import os -import warnings +import jsonschema +import pandas as pd -import yaml -from yaml import SafeLoader +from collections import Mapping +from logging import getLogger -from .const import PIPELINE_REQUIREMENTS_KEY -from .exceptions import InvalidResourceSpecificationException, \ - MissingPipelineConfigurationException, PipelineInterfaceConfigError, \ - PipelineInterfaceRequirementsError -from .pipereqs import create_pipeline_requirement, RequiredExecutable -from .sample import Sample -from .utils import get_logger from attmap import PathExAttMap as PXAM -from divvy import DEFAULT_COMPUTE_RESOURCES_NAME, NEW_COMPUTE_KEY as COMPUTE_KEY -from divvy.const import OLD_COMPUTE_KEY +from eido import read_schema from peppy import utils as peputil -from peppy.sample import SAMPLE_YAML_FILE_KEY -from ubiquerg import expandpath, is_command_callable +from ubiquerg import expandpath, is_url +from yacman import load_yaml +from .const import * +from .exceptions import InvalidResourceSpecificationException -_LOGGER = get_logger(__name__) +__author__ = "Michal Stolarczyk" +__email__ = "michal@virginia.edu" - -PL_KEY = "pipelines" -PROTOMAP_KEY = "protocol_mapping" -RESOURCES_KEY = "resources" -SUBTYPE_MAPPING_SECTION = "sample_subtypes" +_LOGGER = getLogger(__name__) @peputil.copy @@ -43,758 +32,239 @@ class PipelineInterface(PXAM): :param str | Mapping config: path to file from which to parse configuration data, or pre-parsed configuration data. + :param str pipeline_type: type of the pipeline, + must be either 'sample' or 'project'. 
""" - - REQUIRED_SECTIONS = [PL_KEY, PROTOMAP_KEY] - - def __init__(self, config): + def __init__(self, config, pipeline_type=None): super(PipelineInterface, self).__init__() if isinstance(config, Mapping): self.pipe_iface_file = None self.source = None else: - _LOGGER.debug("Parsing '%s' for %s config data", - config, self.__class__.__name__) + _LOGGER.debug("Reading {} from: {}". + format(self.__class__.__name__, config)) self.pipe_iface_file = config self.source = config - try: - with open(config, 'r') as f: - config = yaml.load(f, SafeLoader) - except yaml.parser.ParserError: - with open(config, 'r') as f: - _LOGGER.error( - "Failed to parse YAML from {}:\n{}". - format(config, "".join(f.readlines()))) - raise - - # Check presence of 2 main sections (protocol mapping and pipelines). - missing = [s for s in self.REQUIRED_SECTIONS if s not in config] - if missing: - raise PipelineInterfaceConfigError(missing) + config = load_yaml(config) + self.update(config) + self._validate(PIFACE_SCHEMA_SRC, flavor=pipeline_type) + self._expand_pipeline_paths() - # Format and add the protocol mappings and individual interfaces. - config = expand_pl_paths(config) - assert PROTOMAP_KEY in config, \ - "For protocol mapping standardization, pipeline interface data " \ - "must contain key '{}'".format(PROTOMAP_KEY) - - for k, v in config.items(): - if k in ["pipe_iface_file", "source"]: - continue - assert k not in self, \ - "Interface key already mapped: {} ({})".format(k, self[k]) - self[k] = v - - def __repr__(self): - """ String representation """ - source = self.pipe_iface_file or "Mapping" - num_pipelines = len(self.pipelines) - # TODO: could use 'name' here - pipelines = ", ".join(self.pipelines.keys()) - return "{} from {}, with {} pipeline(s): {}".format( - self.__class__.__name__, source, num_pipelines, pipelines) + def get_pipeline_schemas(self, schema_key=INPUT_SCHEMA_KEY): + """ + Get path to the pipeline schema. 
- def __setitem__(self, key, value): - if key == PIPELINE_REQUIREMENTS_KEY: - super(PipelineInterface, self).__setitem__( - key, read_pipe_reqs(value), finalize=False) - elif key == PL_KEY: - assert isinstance(value, Mapping) or not value, \ - "If non-null, value for key '{}' in interface specification " \ - "must be a mapping; got {}".format(key, type(value).__name__) - m = PXAM() - for k, v in value.items(): - assert isinstance(v, Mapping), \ - "Value for pipeline {} is {}, not mapping".\ - format(k, type(v).__name__) - m_sub = PXAM() - for k_sub, v_sub in v.items(): - if k_sub == PIPELINE_REQUIREMENTS_KEY: - m_sub.__setitem__(k_sub, read_pipe_reqs(v_sub), finalize=False) - else: - m_sub.__setitem__(k_sub, v_sub, finalize=True) - m.__setitem__(k, m_sub, finalize=False) - super(PipelineInterface, self).__setitem__(key, m) - else: - super(PipelineInterface, self).__setitem__(key, value) + :param str schema_key: where to look for schemas in the pipeline iface + :return str: absolute path to the pipeline schema file + """ + schema_source = None + if schema_key in self: + schema_source = self[schema_key] + if schema_source: + _LOGGER.debug("Got schema source: {}".format(schema_source)) + if is_url(schema_source): + return schema_source + elif not os.path.isabs(schema_source): + schema_source = os.path.join( + os.path.dirname(self.pipe_iface_file), schema_source) + return schema_source - def choose_resource_package(self, pipeline_name, file_size): + def choose_resource_package(self, namespaces, file_size): """ Select resource bundle for given input file size to given pipeline. - :param str pipeline_name: Name of pipeline. :param float file_size: Size of input data (in gigabytes). 
+ :param Mapping[Mapping[str]] namespaces: namespaced variables to pass + as a context for fluid attributes command rendering :return MutableMapping: resource bundle appropriate for given pipeline, for given input file size :raises ValueError: if indicated file size is negative, or if the file size value specified for any resource package is negative - :raises _InvalidResourceSpecificationException: if no default + :raises InvalidResourceSpecificationException: if no default resource package specification is provided """ - - # Ensure that we have a numeric value before attempting comparison. - file_size = float(file_size) - - if file_size < 0: - raise ValueError("Attempted selection of resource package for " - "negative file size: {}".format(file_size)) - - def notify(msg): - msg += " for pipeline {}".format(pipeline_name) - if self.pipe_iface_file is not None: - msg += " in interface {}".format(self.pipe_iface_file) - _LOGGER.debug(msg) - - pl = self.select_pipeline(pipeline_name) - - try: - universal_compute = pl[COMPUTE_KEY] - except KeyError: - notify("No compute settings (by {})".format(COMPUTE_KEY)) - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=DeprecationWarning) - try: - universal_compute = pl[OLD_COMPUTE_KEY] - except KeyError: - universal_compute = PXAM() - else: - warnings.warn( - "To declare pipeline compute section, use {} rather " - "than {}".format(COMPUTE_KEY, OLD_COMPUTE_KEY), - DeprecationWarning) - _LOGGER.debug("Universal compute (for {}): {}". 
- format(pipeline_name, universal_compute)) - - try: - resources = universal_compute[RESOURCES_KEY] - except KeyError: - try: - resources = pl[RESOURCES_KEY] - except KeyError: - notify("No resources") - return {} - else: - if RESOURCES_KEY in pl: - _LOGGER.warning( - "{rk} section found in both {c} section and top-level " - "pipelines section of pipeline interface; {c} section " - "version will be used".format(rk=RESOURCES_KEY, c=COMPUTE_KEY)) - - # Require default resource package specification. - try: - default_resource_package = \ - resources[DEFAULT_COMPUTE_RESOURCES_NAME] - except KeyError: - raise InvalidResourceSpecificationException( - "Pipeline resources specification lacks '{}' section". - format(DEFAULT_COMPUTE_RESOURCES_NAME)) - - # Parse min file size to trigger use of a resource package. - def file_size_ante(name, data): + def _file_size_ante(name, data): # Retrieve this package's minimum file size. # Retain backwards compatibility while enforcing key presence. try: - fsize = data["min_file_size"] + fsize = float(data[FILE_SIZE_COLNAME]) except KeyError: - fsize = data["file_size"] - fsize = float(fsize) + raise InvalidResourceSpecificationException( + "Required column '{}' does not exist in resource " + "specification TSV.".format(FILE_SIZE_COLNAME)) # Negative file size is illogical and problematic for comparison. if fsize < 0: - raise ValueError( - "Negative file size threshold for resource package " - "'{}': {}".format(name, fsize)) + raise InvalidResourceSpecificationException( + "Found negative value () in '{}' column; package '{}'". + format(fsize, FILE_SIZE_COLNAME, name) + ) return fsize - # Enforce default package minimum of 0. - if "file_size" in default_resource_package: - del default_resource_package["file_size"] - resources[DEFAULT_COMPUTE_RESOURCES_NAME]["min_file_size"] = 0 - - try: - # Sort packages by descending file size minimum to return first - # package for which given file size satisfies the minimum. 
- resource_packages = sorted( - resources.items(), - key=lambda name_and_data: file_size_ante(*name_and_data), - reverse=True) - except ValueError: - _LOGGER.error("Unable to use file size to prioritize " - "resource packages: {}".format(resources)) - raise - - # "Descend" packages by min file size, choosing minimally-sufficient. - for rp_name, rp_data in resource_packages: - size_ante = file_size_ante(rp_name, rp_data) - if file_size >= size_ante: - _LOGGER.debug( - "Selected '{}' package with min file size {} Gb for file " - "of size {} Gb.".format(rp_name, size_ante, file_size)) - rp_data.update(universal_compute) - return rp_data - - def finalize_pipeline_key_and_paths(self, pipeline_key): - """ - Determine pipeline's full path, arguments, and strict key. - - This handles multiple ways in which to refer to a pipeline (by key) - within the mapping that contains the data that defines a - PipelineInterface. It also ensures proper handling of the path to the - pipeline (i.e., ensuring that it's absolute), and that the text for - the arguments are appropriately dealt parsed and passed. - - :param str pipeline_key: the key in the pipeline interface file used - for the protocol_mappings section. Previously was the script name. - :return (str, str, str): more precise version of input key, along with - absolute path for pipeline script, and full script path + options - - """ - - # The key may contain extra command-line flags; split key from flags. - # The strict key was previously the script name itself, something like - # "ATACseq.py", but now is typically just something like "atacseq". 
- strict_pipeline_key, _, pipeline_key_args = pipeline_key.partition(' ') - - full_pipe_path = \ - self.get_attribute(strict_pipeline_key, "path") - - if full_pipe_path: - script_path_only = os.path.expanduser( - os.path.expandvars(full_pipe_path[0].strip())) - if os.path.isdir(script_path_only): - script_path_only = os.path.join(script_path_only, pipeline_key) - script_path_with_flags = \ - "{} {}".format(script_path_only, pipeline_key_args) - else: - # backwards compatibility w/ v0.5 - script_path_only = strict_pipeline_key - script_path_with_flags = pipeline_key - - # Clear trailing whitespace. - script_path_only = script_path_only.rstrip() - - # TODO: determine how to deal with pipelines_path (i.e., could be null) - if not os.path.isabs(script_path_only) and not \ - is_command_callable(script_path_only): - _LOGGER.whisper("Expanding non-absolute script path: '%s'", - script_path_only) - script_path_only = os.path.join( - self.pipelines_path, script_path_only) - _LOGGER.whisper("Absolute script path: '%s'", script_path_only) - script_path_with_flags = os.path.join( - self.pipelines_path, script_path_with_flags) - _LOGGER.whisper("Absolute script path with flags: '%s'", - script_path_with_flags) - - return strict_pipeline_key, script_path_only, script_path_with_flags - - def get_arg_string(self, pipeline_name, sample, - submission_folder_path="", **null_replacements): - """ - For a given pipeline and sample, return the argument string. - - :param str pipeline_name: Name of pipeline. - :param Sample sample: current sample for which job is being built - :param str submission_folder_path: path to folder in which files - related to submission of this sample will be placed. 
- :param dict null_replacements: mapping from name of Sample attribute - name to value to use in arg string if Sample attribute's value - is null - :return str: command-line argument string for pipeline - """ - - def update_argtext(argtext, option, argument): - if argument is None or "" == argument: - _LOGGER.debug("Skipping null/empty argument for option " - "'{}': {}".format(option, type(argument))) - return argtext - _LOGGER.debug("Adding argument for pipeline option '{}': {}". - format(option, argument)) - return "{} {} {}".format(argtext, option, argument) - - default_filepath = os.path.join( - submission_folder_path, sample.generate_filename()) - _LOGGER.debug("Default sample filepath: '%s'", default_filepath) - proxies = {SAMPLE_YAML_FILE_KEY: default_filepath} - proxies.update(null_replacements) - - _LOGGER.debug("Building arguments string") - config = self.select_pipeline(pipeline_name) - argstring = "" - - if "arguments" not in config: - _LOGGER.info("No arguments found for '%s' in '%s'", - pipeline_name, self.pipe_iface_file) - return argstring - - args = config["arguments"] - for pipe_opt, sample_attr in args.iteritems(): - if sample_attr is None: - _LOGGER.debug("Option '%s' is not mapped to a sample " - "attribute, so it will be added to the pipeline " - "argument string as a flag-like option.", - str(pipe_opt)) - argstring += " {}".format(pipe_opt) - continue + def _notify(msg): + msg += " for pipeline" + if self.pipe_iface_file is not None: + msg += " in interface {}".format(self.pipe_iface_file) + _LOGGER.debug(msg) - try: - arg = getattr(sample, sample_attr) - except AttributeError: + def _load_fluid_attrs(pipeline): + """ + Render command string (jinja2 template), execute it in a subprocess + and return its result (JSON object) as a dict + + :param Mapping pipeline: pipeline dict + :return Mapping: a dict with attributes returned in the JSON + by called command + """ + def _log_raise_latest(): + """ Log error info and raise latest handled 
exception """ _LOGGER.error( - "Error (missing attribute): '%s' requires sample " - "attribute '%s' for option '%s'", - pipeline_name, sample_attr, pipe_opt) + "Could not retrieve JSON via command: '{}'".format( + pipeline[COMPUTE_KEY][DYN_VARS_KEY])) raise - - # It's undesirable to put a null value in the argument string. - if arg is None: - _LOGGER.debug("Null value for sample attribute: '%s'", - sample_attr) - try: - arg = proxies[sample_attr] - except KeyError: - reason = "No default for null sample attribute: '{}'".\ - format(sample_attr) - raise ValueError(reason) - _LOGGER.debug("Found default for '{}': '{}'". - format(sample_attr, arg)) - - argstring = update_argtext( - argstring, option=pipe_opt, argument=arg) - - # Add optional arguments - if "optional_arguments" in config: - _LOGGER.debug("Processing options") - args = config["optional_arguments"] - missing_optional_args = [] - for pipe_opt, sample_attr in args.iteritems(): - _LOGGER.debug("Option '%s' maps to sample attribute '%s'", - pipe_opt, sample_attr) - if sample_attr is None or sample_attr == "": - _LOGGER.debug("Null/empty sample attribute name for " - "pipeline option '{}'".format(pipe_opt)) - continue + json = None + if COMPUTE_KEY in pipeline \ + and DYN_VARS_KEY in pipeline[COMPUTE_KEY]: + from subprocess import check_output, CalledProcessError + from json import loads + from .utils import jinja_render_cmd_strictly try: - arg = getattr(sample, sample_attr) - except AttributeError: - missing_optional_args.append((pipeline_name, sample_attr, pipe_opt)) - continue - argstring = update_argtext( - argstring, option=pipe_opt, argument=arg) - - if len(missing_optional_args) > 0: - warning_msg = {} - for pipeline_name, sample_attr, pipe_opt in missing_optional_args: - msg = "{arg}: '{attr}';".format(attr=sample_attr, - arg=pipe_opt) - if not pipeline_name in warning_msg.keys(): - warning_msg[pipeline_name] = [msg] - else: - warning_msg[pipeline_name].append(msg) - - for pipeline_name, msg in 
warning_msg.items(): - n_missing = len(msg) - if n_missing > 5: - _LOGGER.info( - "> NOTE: {} missing optional attributes for pipeline '{}'.".format(n_missing, - pipeline_name)) - else: - _LOGGER.info( - "> NOTE: {} missing optional attributes for pipeline '{}': {}".format(n_missing, - pipeline_name, " ".join(msg))) - - _LOGGER.debug("Script args: '%s'", argstring) - - return argstring - - def fetch_pipelines(self, protocol): - """ - Fetch the mapping for a particular protocol, null if unmapped. - - :param str protocol: name/key for the protocol for which to fetch the - pipeline(s) - :return str | Iterable[str] | NoneType: pipeline(s) to which the given - protocol is mapped, otherwise null - """ - return self.protocol_mapping.get(protocol) - - def fetch_sample_subtype( - self, protocol, strict_pipe_key, full_pipe_path): - """ - Determine the interface and Sample subtype for a protocol and pipeline. - - :param str protocol: name of the relevant protocol - :param str strict_pipe_key: key for specific pipeline in a pipeline - interface mapping declaration; this must exactly match a key in - the PipelineInterface (or the Mapping that represent it) - :param str full_pipe_path: (absolute, expanded) path to the - pipeline script - :return type: Sample subtype to use for jobs for the given protocol, - that use the pipeline indicated - :raises KeyError: if given a pipeline key that's not mapped in the - pipelines section of this PipelineInterface - """ - - subtype = None - - this_pipeline_data = self.pipelines[strict_pipe_key] - - try: - subtypes = this_pipeline_data[SUBTYPE_MAPPING_SECTION] - except KeyError: - _LOGGER.debug("Configuration (from %s) doesn't define section '%s' " - "for pipeline '%s'", self.source, - SUBTYPE_MAPPING_SECTION, strict_pipe_key) - # Without a subtypes section, if pipeline module defines a single - # Sample subtype, we'll assume that type is to be used when in - # this case, when the interface section for this pipeline lacks - # an explicit 
subtypes section specification. - subtype_name = None - else: - if subtypes is None: - # Designate lack of need for import attempt and provide - # class with name to format message below. - subtype = Sample - _LOGGER.debug("Null %s subtype(s) section specified for " - "pipeline: '%s'; using base %s type", - subtype.__name__, strict_pipe_key, - subtype.__name__) - elif isinstance(subtypes, str): - subtype_name = subtypes - _LOGGER.debug("Single subtype name for pipeline '%s' " - "in interface from '%s': '%s'", subtype_name, - strict_pipe_key, self.source) + cmd = jinja_render_cmd_strictly( + cmd_template=pipeline[COMPUTE_KEY][DYN_VARS_KEY], + namespaces=namespaces + ) + json = loads(check_output(cmd, shell=True)) + except CalledProcessError as e: + print(e.output) + _log_raise_latest() + except Exception: + _log_raise_latest() + else: + _LOGGER.debug( + "Loaded resources from JSON returned by a command for" + " pipeline '{}':\n{}".format(self.pipeline_name, json)) + return json + + def _load_size_dep_vars(piface): + """ + Read the resources from a TSV provided in the pipeline interface + + :param looper.PipelineInterface piface: currently processed piface + :param str section: section of pipeline interface to process + :return pandas.DataFrame: resources + """ + df = None + if COMPUTE_KEY in piface \ + and SIZE_DEP_VARS_KEY in piface[COMPUTE_KEY]: + resources_tsv_path = piface[COMPUTE_KEY][SIZE_DEP_VARS_KEY] + if not os.path.isabs(resources_tsv_path): + resources_tsv_path = os.path.join( + os.path.dirname(piface.pipe_iface_file), + resources_tsv_path) + df = pd.read_csv(resources_tsv_path, sep='\t', header=0).\ + fillna(float("inf")) + df[ID_COLNAME] = df.index + df.set_index(ID_COLNAME) + _LOGGER.debug("Loaded resources ({}) for pipeline '{}':\n{}". 
+ format(resources_tsv_path, piface.pipeline_name, df)) else: - try: - subtype_name = subtypes[protocol] - except KeyError: - # Designate lack of need for import attempt and provide - # class with name to format message below. - subtype = Sample - _LOGGER.debug("No %s subtype specified in interface from " - "'%s': '%s', '%s'; known: %s", - subtype.__name__, self.source, - strict_pipe_key, protocol, - ", ".join(subtypes.keys())) - - # subtype_name is defined if and only if subtype remained null. - # The import helper function can return null if the import attempt - # fails, so provide the base Sample type as a fallback. - subtype = subtype or \ - _import_sample_subtype(full_pipe_path, subtype_name) or \ - Sample - _LOGGER.debug("Using Sample subtype: %s", subtype.__name__) - return subtype - - def get_attribute(self, pipeline_name, attribute_key, path_as_list=True): - """ - Return the value of the named attribute for the pipeline indicated. - - :param str pipeline_name: name of the pipeline of interest - :param str attribute_key: name of the pipeline attribute of interest - :param bool path_as_list: whether to ensure that a string attribute - is returned as a list; this is useful for safe iteration over - the returned value. - """ - config = self.select_pipeline(pipeline_name) - value = config.get(attribute_key) - return [value] if isinstance(value, str) and path_as_list else value - - def get_pipeline_name(self, pipeline): - """ - Translate a pipeline name (e.g., stripping file extension). - - :param str pipeline: Pipeline name or script (top-level key in - pipeline interface mapping). 
- :return str: translated pipeline name, as specified in config or by - stripping the pipeline's file extension - """ - config = self.select_pipeline(pipeline) - try: - return config["name"] - except KeyError: - _LOGGER.debug("No 'name' for pipeline '{}'".format(pipeline)) - return os.path.splitext(pipeline)[0] - - def iterpipes(self): - """ - Iterate over pairs of pipeline key and interface data. - - :return iterator of (str, Mapping): Iterator over pairs of pipeline - key and interface data - """ - return iter(self.pipelines.items()) - - def missing_requirements(self, pipeline): - """ - Determine which requirements--if any--declared by a pipeline are unmet. + _notify("No '{}' defined".format(SIZE_DEP_VARS_KEY)) + return df - :param str pipeline: key for pipeline for which to determine unmet reqs - :return Iterable[looper.PipelineRequirement]: unmet requirements - """ - reqs_data = {name: req for name, req in - self.get(PIPELINE_REQUIREMENTS_KEY, {}).items()} - reqs_data.update(self.select_pipeline(pipeline).get(PIPELINE_REQUIREMENTS_KEY, {})) - return [v.req for v in reqs_data.values() if not v.satisfied] - - @property - def pipeline_names(self): - """ - Names of pipelines about which this interface is aware. - - :return Iterable[str]: names of pipelines about which this - interface is aware - """ - # TODO: could consider keying on name. - return list(self.pipelines.keys()) - - @property - def pipelines_path(self): - """ - Path to pipelines folder. - - :return str | None: Path to pipelines folder, if configured with - file rather than with raw mapping. - """ - try: - return os.path.dirname(self.pipe_iface_file) - except (AttributeError, TypeError): - return None - - @property - def pipe_iface(self): - """ - Old-way access to pipeline key-to-interface mapping - - :return Mapping: Binding between pipeline key and interface data - """ - warnings.warn("On {} pi, use pi.pipelines instead of pi.pipe_iface " - "to access mapping from pipeline key to interface.". 
- format(self.__class__.__name__), DeprecationWarning) - return self.pipelines - - @property - def protomap(self): - """ - Access protocol mapping portion of this composite interface. - - :return Mapping: binding between protocol name and pipeline key. - """ - warnings.warn("Protomap access is deprecated; please use {}" - .format(PROTOMAP_KEY), DeprecationWarning) - return self.protocol_mapping - - def select_pipeline(self, pipeline_name): - """ - Check to make sure that pipeline has an entry and if so, return it. + # Ensure that we have a numeric value before attempting comparison. + file_size = float(file_size) + assert file_size >= 0, ValueError("Attempted selection of resource " + "package for negative file size: {}". + format(file_size)) + + fluid_resources = _load_fluid_attrs(self) + if fluid_resources is not None: + return fluid_resources + resources_df = _load_size_dep_vars(self) + resources_data = {} + if resources_df is not None: + resources = resources_df.to_dict('index') + try: + # Sort packages by descending file size minimum to return first + # package for which given file size satisfies the minimum. + resource_packages = sorted( + resources.items(), + key=lambda name_and_data: _file_size_ante(*name_and_data), + reverse=False) + except ValueError: + _LOGGER.error("Unable to use file size to prioritize " + "resource packages: {}".format(resources)) + raise - :param str pipeline_name: Name of pipeline. - :return Mapping: configuration data for pipeline indicated - :raises MissingPipelineConfigurationException: if there's no - configuration data for the indicated pipeline + # choose minimally-sufficient package + for rp_name, rp_data in resource_packages: + size_ante = _file_size_ante(rp_name, rp_data) + if file_size <= size_ante: + _LOGGER.debug( + "Selected '{}' package with file size {}Gb for file " + "of size {}Gb.".format(rp_name, size_ante, file_size)) + _LOGGER.debug("Selected resource package data:\n{}". 
+ format(rp_data)) + resources_data = rp_data + break + + if COMPUTE_KEY in self: + resources_data.update(self[COMPUTE_KEY]) + + project = namespaces["project"] + if LOOPER_KEY in project and COMPUTE_KEY in project[LOOPER_KEY] \ + and RESOURCES_KEY in project[LOOPER_KEY][COMPUTE_KEY]: + # overwrite with values from project.looper.compute.resources + resources_data.\ + update(project[LOOPER_KEY][COMPUTE_KEY][RESOURCES_KEY]) + return resources_data + + def _expand_pipeline_paths(self): + """ + Expand path to each pipeline in pipelines and collators subsection + of pipeline interface """ try: - # For unmapped pipeline, Return empty interface instead of None. - return self[PL_KEY][pipeline_name] or dict() + raw_path = self["path"] except KeyError: - names = ["'{}'".format(p) for p in self.pipelines.keys()] - _LOGGER.error( - "Missing pipeline description: '{}' not found ({} known: {})". - format(pipeline_name, len(names), ", ".join(names))) - # TODO: use defaults or force user to define this? - raise MissingPipelineConfigurationException(pipeline_name) - - def uses_looper_args(self, pipeline_name): - """ - Determine whether indicated pipeline accepts looper arguments. - - :param str pipeline_name: Name of pipeline to check for looper - argument acceptance. - :return bool: Whether indicated pipeline accepts looper arguments. - """ - config = self.select_pipeline(pipeline_name) - return "looper_args" in config and config["looper_args"] + return + split_path = raw_path.split(" ") + if len(split_path) > 1: + _LOGGER.warning( + "Pipeline path ({}) contains spaces. Use command_template " + "section to construct the pipeline command. 
Using the first" + " part as path: {}".format(raw_path, split_path[0])) + path = split_path[0] + pipe_path = expandpath(path) + if not os.path.isabs(pipe_path) and self.pipe_iface_file: + abs = os.path.join(os.path.dirname( + self.pipe_iface_file), pipe_path) + if os.path.exists(abs): + _LOGGER.debug( + "Pipeline path relative to pipeline interface" + " made absolute: {}".format(abs)) + self["path"] = abs + return + _LOGGER.debug("Expanded path: {}".format(pipe_path)) + self["path"] = pipe_path - def validate(self, pipeline): + def _validate(self, schema_src, exclude_case=False, flavor=None): """ - Determine whether any declared requirements are unmet. + Generic function to validate object against a schema - :param str pipeline: key for the pipeline to validate - :return bool: whether any declared requirements are unmet - :raise MissingPipelineConfigurationException: if the requested pipeline - is not defined in this interface + :param str schema_src: schema source to validate against, URL or path + :param bool exclude_case: whether to exclude validated objects + from the error. Useful when used ith large projects + :param str flavor: type of the pipeline schema to use """ - return not self.missing_requirements(pipeline) - - -def expand_pl_paths(piface): - """ - Expand path to each pipeline in a declared mapping - - :param Mapping piface: Key-value mapping in which one value is a collection - of pipeline manifests, i.e. 
in the pipelines section of a pipeline - interface config file - :return Mapping: Same as input, but with any pipeline path expanded - """ - assert PL_KEY in piface, "For pipeline path expansion, pipeline interface" \ - "data must contain key '{}'".format(PL_KEY) - for pipe_data in piface[PL_KEY].values(): - if "path" in pipe_data: - pipe_path = pipe_data["path"] - _LOGGER.whisper("Expanding path: '%s'", pipe_path) - pipe_path = expandpath(pipe_path) - _LOGGER.whisper("Expanded: '%s'", pipe_path) - pipe_data["path"] = pipe_path - return piface - - -def read_pipe_reqs(reqs_data): - """ - Read/parse a requirements section or subsection of a pipeline interface config. - - :param Mapping reqs_data: the data to parse; this should be a collection - of strings (names/paths of executables), or a mapping of requirements - declarations, keyed on name/path with each key mapping to a string - that indicates the kind of requirement (file, folder, executable). - If nothing's specified (list rather than dict) of requirements, or if - the value for a requirement is empty/null, the requirement is assumed - to be the declaration of an executable. - :return attmap.PathExAttMap[str, looper.pipereqs.PipelineRequirement]: a - binding between requirement name/path and validation instance - """ - reqs_data = reqs_data or {} - if isinstance(reqs_data, str): - reqs_data = [reqs_data] - if isinstance(reqs_data, Mapping): - newval, errors = OrderedDict(), {} - for r, t in reqs_data.items(): + schema_source = schema_src.format(flavor if flavor else "generic") + schemas = read_schema(schema_source) + for schema in schemas: try: - newval[r] = create_pipeline_requirement(r, typename=t) - except ValueError: - errors[r] = t - if errors: - raise PipelineInterfaceRequirementsError(errors) - elif isinstance(reqs_data, Iterable): - newval = OrderedDict([(r, RequiredExecutable(r)) for r in reqs_data]) - else: - raise TypeError( - "Non-iterable pipeline requirements (key '{}'): {}". 
- format(PIPELINE_REQUIREMENTS_KEY, type(reqs_data).__name__)) - return PXAM(newval) - - -def _import_sample_subtype(pipeline_filepath, subtype_name=None): - """ - Import a particular Sample subclass from a Python module. - - :param str pipeline_filepath: path to file to regard as Python module - :param str subtype_name: name of the target class (which must derive from - the base Sample class in order for it to be used), optional; if - unspecified, if the module defines a single subtype, then that will - be used; otherwise, the base Sample type will be used. - :return type: the imported class, defaulting to base Sample in case of - failure with the import or other logic - """ - base_type = Sample - - _, ext = os.path.splitext(pipeline_filepath) - if ext != ".py": - return base_type - - try: - _LOGGER.debug("Attempting to import module defined by {}". - format(pipeline_filepath)) - - # TODO: consider more fine-grained control here. What if verbose - # TODO: logging is only to file, not to stdout/err? - - # Redirect standard streams during the import to prevent noisy - # error messaging in the shell that may distract or confuse a user. - if _LOGGER.getEffectiveLevel() > logging.DEBUG: - with open(os.devnull, 'w') as temp_standard_streams: - with peputil.standard_stream_redirector(temp_standard_streams): - pipeline_module = peputil.import_from_source(pipeline_filepath) - else: - pipeline_module = peputil.import_from_source(pipeline_filepath) - - except SystemExit: - # SystemExit would be caught as BaseException, but SystemExit is - # particularly suggestive of an a script without a conditional - # check on __main__, and as such warrant a tailored message. - _LOGGER.warning("'%s' appears to attempt to run on import; " - "does it lack a conditional on '__main__'? 
" - "Using base type: %s", - pipeline_filepath, base_type.__name__) - return base_type - - except (BaseException, Exception) as e: - _LOGGER.debug("Can't import subtype from '%s', using base %s: %r", - pipeline_filepath, base_type.__name__, e) - return base_type - - else: - _LOGGER.debug("Successfully imported pipeline module '%s', " - "naming it '%s'", pipeline_filepath, - pipeline_module.__name__) - - def class_names(cs): - return ", ".join([c.__name__ for c in cs]) - - # Find classes from pipeline module and determine which derive from Sample. - classes = _fetch_classes(pipeline_module) - _LOGGER.debug("Found %d classes: %s", len(classes), class_names(classes)) - - # Base Sample could be imported; we want the true subtypes. - proper_subtypes = _proper_subtypes(classes, base_type) - _LOGGER.debug("%d proper %s subtype(s): %s", len(proper_subtypes), - base_type.__name__, class_names(proper_subtypes)) - - # Determine course of action based on subtype request and number found. - if not subtype_name: - _LOGGER.debug("No specific subtype is requested from '%s'", - pipeline_filepath) - if len(proper_subtypes) == 1: - # No specific request and single subtype --> use single subtype. - subtype = proper_subtypes[0] - _LOGGER.debug("Single %s subtype found in '%s': '%s'", - base_type.__name__, pipeline_filepath, - subtype.__name__) - return subtype - else: - # We can't arbitrarily select from among 0 or multiple subtypes. - # Note that this text is used in the tests, as validation of which - # branch of the code in this function is being hit in order to - # return the base Sample type. If it changes, the corresponding - # tests will also need to change. - _LOGGER.debug("%s subtype cannot be selected from %d found in " - "'%s'; using base type", base_type.__name__, - len(proper_subtypes), pipeline_filepath) - return base_type - else: - # Specific subtype request --> look for match. 
- for st in proper_subtypes: - if st.__name__ == subtype_name: - _LOGGER.debug("Successfully imported %s from '%s'", - subtype_name, pipeline_filepath) - return st - raise ValueError( - "'{}' matches none of the {} {} subtype(s) defined " - "in '{}': {}".format(subtype_name, len(proper_subtypes), - base_type.__name__, pipeline_filepath, - class_names(proper_subtypes))) - - -def _fetch_classes(mod): - """ Return the classes defined in a module. """ - try: - _, classes = zip(*inspect.getmembers( - mod, lambda o: inspect.isclass(o))) - except ValueError: - return [] - return list(classes) - - -def _proper_subtypes(types, supertype): - """ Determine the proper subtypes of a supertype. """ - return list(filter( - lambda t: issubclass(t, supertype) and t != supertype, types)) + jsonschema.validate(self, schema) + _LOGGER.debug("Successfully validated {} against schema: {}". + format(self.__class__.__name__, schema_source)) + except jsonschema.exceptions.ValidationError as e: + if not exclude_case: + raise e + raise jsonschema.exceptions.ValidationError(e.message) \ No newline at end of file diff --git a/looper/pipereqs.py b/looper/pipereqs.py deleted file mode 100644 index 8fc1310ef..000000000 --- a/looper/pipereqs.py +++ /dev/null @@ -1,130 +0,0 @@ -""" Pipeline requirements declaration """ - -import os -from ubiquerg import expandpath, is_command_callable - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - -__all__ = ["create_pipeline_requirement", "PipelineRequirement", - "RequiredExecutable", "RequiredPath"] - - -KEY_EXEC_REQ = "executable" -KEY_FILE_REQ = "file" -KEY_FOLDER_REQ = "folder" - - -class PipelineRequirement(object): - """ Requirement that must be satisfied for a pipeline to run. """ - - def __init__(self, req, check): - """ - Create the requirement by specifying name/path and validation function. 
- - :param str req: the requirement to eventually verify - :param function(str) check: how to perform the verification - """ - def _checkattr(trait_attr, trait_name): - if not hasattr(check, trait_attr): - raise TypeError("Validator isn't {} ({})". - format(trait_name, type(check).__name__)) - self.req = req - _checkattr("__call__", "callable") - _checkattr("__hash__", "hashable") - self.check = check - - def __eq__(self, other): - """ Equality treats each instance as a product type. """ - return type(self) is type(other) and \ - self.req == other.req and self.check == other.check - - def __hash__(self): - """ Hash as for product type. """ - return hash((self.req, self.check)) - - def __repr__(self): - """ Print type and requirement value> """ - return "{}: {}".format(type(self).__name__, self.req) - - def _finalize_for_check(self): - """ Expand any user or env vars in requirement. """ - return expandpath(self.req) - - @property - def satisfied(self): - """ - Determine whether the requirement is satisfied acc. to the validation. - - :return bool: whether the requirement is satisfied acc. to the validation - """ - return self.check(self._finalize_for_check()) - - -class RequiredPath(PipelineRequirement): - """ A single file or folder requirement """ - - def __init__(self, p, check=None, folder=None): - """ - Create the path requirement by specifying the path and how to verify. 
- - :param str p: the path on which to base the requirement - :param function(str) -> bool check: how to verify the requirement; - required if and only if no folder flag is given - :param bool folder: whether the path is a folder (not file); - required if and only if no validation function is provided - :raise ValueError: if no validation strategy is specified, and no - argument to folder parameter is given - :raise TypeError: if no validation strategy is specified, and the - argument to the folder parameter is not a Boolean - """ - if (check is not None and folder is not None) or \ - (check is None and folder is None): - raise ValueError( - "Either validation function or folder flag--but not both--must " - "be provided") - if check is None: - if type(folder) is not bool: - raise TypeError("Folder flag must be boolean; got {}". - format(type(folder).__name__)) - check = os.path.isdir if folder else os.path.isfile - super(RequiredPath, self).__init__(p, check) - - -class RequiredExecutable(PipelineRequirement): - """ A requirement that should be executable as a command """ - - def __init__(self, cmd, check=None): - """ - Create the requirement by specifying the command and validation. - - :param str cmd: the command requirement to validate as executable - :param function(str) -> bool check: how to verify that the command - requirement is in fact satisfied by executability; defaults to - the callability function in ubiquerg - """ - super(RequiredExecutable, self).__init__(cmd, check or is_command_callable) - - -def create_pipeline_requirement(req, typename, **kwargs): - """ - Create a single requirement instance for a pipeline - - :param str req: name/path that specifices the requirement, e.g. 
samtools - :param str typename: keyword indicating the kind of requirement to be - created - :param dict kwargs: variable keyword arguments to the RequiredExecutable - constructor - :return looper.pipereqs.PipelineRequirement: requirement as named, and - typed according to the keyword provided - :raise ValueError: if the given typename is unrecognized, raise ValueError. - """ - typename = typename or KEY_EXEC_REQ - if typename == KEY_EXEC_REQ: - return RequiredExecutable(req, **kwargs) - if typename == KEY_FILE_REQ: - return RequiredPath(req, folder=False) - elif typename == KEY_FOLDER_REQ: - return RequiredPath(req, folder=True) - else: - raise ValueError("Invalid requirement typename: '{}'".format(typename)) diff --git a/looper/processed_project.py b/looper/processed_project.py new file mode 100644 index 000000000..707185aa9 --- /dev/null +++ b/looper/processed_project.py @@ -0,0 +1,131 @@ +""" +Processed Project manipulation functions. +Will be moved to a separate package +""" +import os +from logging import getLogger + +from eido.const import * +from eido.exceptions import * + +from peppy.sample import Sample +from peppy.project import Project + +__author__ = "Michal Stolarczyk" +__email__ = "michal@virginia.edu" + +_LOGGER = getLogger(__name__) +PATH_KEY = "path" +THUMB_PATH_KEY = "thumbnail_path" +PATH_LIKE = [PATH_KEY, THUMB_PATH_KEY] + + +def _get_path_sect_keys(mapping, keys=[PATH_KEY]): + """ + Get names of subsections in a mapping that contain collection of keys + + :param Mapping mapping: schema subsection to search for paths + :param Iterable[str] keys: collection of keys to check for + :return Iterable[str]: collection of keys to path-like sections + """ + return [k for k, v in mapping.items() if bool(set(keys) & set(mapping[k]))] + + +def _populate_paths(object, schema, check_exist): + """ + Populate path-like object attributes with other object attributes + based on a defined template, e.g. 
'/Users/x/test_{name}/{genome}_file.txt' + + :param Mapping object: object with attributes to populate path template with + :param dict schema: schema with path attributes defined, e.g. + output of read_schema function + :param bool check_exist: whether the paths should be check for existence + :return Mapping: object with path templates populated + """ + if PROP_KEY not in schema: + raise EidoSchemaInvalidError("Schema is missing properties section.") + missing = [] + s = schema[PROP_KEY] + path_sects = _get_path_sect_keys(s) + for ps in path_sects: + templ = s[ps][PATH_KEY] + try: + populated = templ.format(**dict(object.items())) + except Exception as e: + _LOGGER.warning("Caught exception: {}.\n" + "Could not populate path: {}". + format(getattr(e, 'message', repr(e)), templ)) + else: + setattr(object, ps, populated) + _LOGGER.debug("Path set to: {}".format(object[ps])) + if check_exist and not os.path.exists(object[ps]): + missing.append(object[ps]) + if missing: + raise PathAttrNotFoundError("Path attributes not found:\n- {}". + format("\n- ".join(missing))) + + +def populate_sample_paths(sample, schema, check_exist=False): + """ + Populate path-like Sample attributes with other object attributes + based on a defined template, e.g. '/Users/x/test_{name}/{genome}_file.txt' + + :param peppy.Sample sample: sample to populate paths in + :param Iterable[dict] schema: schema with path attributes defined, e.g. 
+ output of read_schema function + :param bool check_exist: whether the paths should be check for existence + :return Mapping: Sample with path templates populated + """ + if not isinstance(sample, Sample): + raise TypeError("Can only populate paths in peppy.Sample objects") + # schema = schema[-1] # use only first schema, in case there are imports + if PROP_KEY in schema and "samples" in schema[PROP_KEY]: + _populate_paths(sample, schema[PROP_KEY]["samples"]["items"], + check_exist) + + +def populate_project_paths(project, schema, check_exist=False): + """ + Populate path-like Project attributes with other object attributes + based on a defined template, e.g. '/Users/x/test_{name}/{genome}_file.txt' + + :param peppy.Project project: project to populate paths in + :param dict schema: schema with path attributes defined, e.g. + output of read_schema function + :param bool check_exist: whether the paths should be check for existence + :return Mapping: Project with path templates populated + """ + if not isinstance(project, Project): + raise TypeError("Can only populate paths in peppy.Project objects") + _populate_paths(project, schema, check_exist) + + +def get_project_outputs(project, schema): + """ + Get project level outputs with path-like attributes populated with + project attributes + + :param peppy.Project project: + :param Iterable[dict] schema: + :return attmap.PathExAttMap: mapping with populated path-like attributes + """ + from attmap import PathExAttMap + # if not any([isinstance(project, Project), + # issubclass(type(project), Project)]): + # raise TypeError("Can only populate paths in peppy.Project " + # "objects or it subclasses") + schema = schema[-1] # use only first schema, in case there are imports + if PROP_KEY not in schema: + raise EidoSchemaInvalidError("Schema is missing properties section.") + res = {} + s = schema[PROP_KEY] + path_sects = _get_path_sect_keys(s, keys=PATH_LIKE) + for ps in path_sects: + res[ps] = s[ps] + for p in PATH_LIKE: 
+ try: + res[ps][p] = s[ps][p].format(**dict(project.items())) + except Exception as e: + _LOGGER.debug("Caught exception: {}.\n Could not populate {} " + "path".format(p, str(e))) + return PathExAttMap(res) diff --git a/looper/project.py b/looper/project.py index 3d1926232..de5cc0b7c 100644 --- a/looper/project.py +++ b/looper/project.py @@ -1,56 +1,329 @@ """ Looper version of NGS project model. """ -from collections import namedtuple -from functools import partial import itertools import os -import peppy -from peppy import METADATA_KEY, OUTDIR_KEY -from ubiquerg import is_command_callable +from jsonschema import ValidationError +from pandas.core.common import flatten +from logging import getLogger + +from peppy import SAMPLE_NAME_ATTR, OUTDIR_KEY, CONFIG_KEY, \ + Project as peppyProject +from eido import read_schema, PathAttrNotFoundError +from divvy import ComputingConfiguration +from ubiquerg import is_command_callable, expandpath + +from .processed_project import populate_sample_paths, populate_project_paths from .const import * -from .exceptions import DuplicatePipelineKeyException, \ - PipelineInterfaceRequirementsError -from .pipeline_interface import PROTOMAP_KEY -from .project_piface_group import ProjectPifaceGroup -from .utils import get_logger, partition +from .exceptions import * +from .utils import * +from .pipeline_interface import PipelineInterface + +__all__ = ["Project"] + +_LOGGER = getLogger(__name__) -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" +class ProjectContext(object): + """ Wrap a Project to provide protocol-specific Sample selection. """ + + def __init__(self, prj, selector_attribute=None, + selector_include=None, selector_exclude=None): + """ Project and what to include/exclude defines the context. 
""" + if not isinstance(selector_attribute, str): + raise TypeError( + "Name of attribute for sample selection isn't a string: {} " + "({})".format(selector_attribute, type(selector_attribute))) + self.prj = prj + self.include = selector_include + self.exclude = selector_exclude + self.attribute = selector_attribute + + def __getattr__(self, item): + """ Samples are context-specific; other requests are handled + locally or dispatched to Project. """ + if item == "samples": + return fetch_samples(prj=self.prj, + selector_attribute=self.attribute, + selector_include=self.include, + selector_exclude=self.exclude) + if item in ["prj", "include", "exclude"]: + # Attributes requests that this context/wrapper handles + return self.__dict__[item] + else: + # Dispatch attribute request to Project. + return getattr(self.prj, item) -__all__ = ["Project", "process_pipeline_interfaces"] + def __getitem__(self, item): + """ Provide the Mapping-like item access to the instance's Project. """ + return self.prj[item] + def __enter__(self): + """ References pass through this instance as needed, so the context + provided is the instance itself. """ + return self -_LOGGER = get_logger(__name__) + def __repr__(self): + return self.prj.__repr__() + def __exit__(self, *args): + """ Context teardown. """ + pass -class Project(peppy.Project): + +class Project(peppyProject): """ - Looper-specific NGS Project. + Looper-specific Project. :param str config_file: path to configuration file with data from which Project is to be built - :param str subproject: name indicating subproject to use, optional + :param Iterable[str] amendments: name indicating amendment to use, optional + :param str divcfg_path: path to an environment configuration YAML file + specifying compute settings. + :param bool permissive: Whether a error should be thrown if + a sample input file(s) do not exist or cannot be open. + :param str compute_env_file: Environment configuration YAML file specifying + compute settings. 
""" - def __init__(self, config_file, subproject=None, **kwargs): - super(Project, self).__init__( - config_file, subproject=subproject, - no_environment_exception=RuntimeError, - no_compute_exception=RuntimeError, **kwargs) - self.interfaces = process_pipeline_interfaces( - self[METADATA_KEY][PIPELINE_INTERFACES_KEY]) + def __init__(self, config_file, amendments=None, divcfg_path=None, + runp=False, **kwargs): + super(Project, self).__init__(config_file, amendments=amendments) + setattr(self, EXTRA_KEY, dict()) + for attr_name in CLI_PROJ_ATTRS: + if attr_name in kwargs: + setattr(self[EXTRA_KEY], attr_name, kwargs[attr_name]) + if not runp: + self._samples_by_interface = \ + self._samples_by_piface(self.piface_key) + self._interfaces_by_sample = self._piface_by_samples() + if FILE_CHECKS_KEY in self[EXTRA_KEY]: + setattr(self, "file_checks", not self[EXTRA_KEY][FILE_CHECKS_KEY]) + if DRY_RUN_KEY in self[EXTRA_KEY]: + setattr(self, DRY_RUN_KEY, self[EXTRA_KEY][DRY_RUN_KEY]) + self.dcc = None if divcfg_path is None else \ + ComputingConfiguration(filepath=divcfg_path) + if hasattr(self, DRY_RUN_KEY) and not self[DRY_RUN_KEY]: + _LOGGER.debug("Ensuring project directories exist") + self.make_project_dirs() + + @property + def piface_key(self): + """ + Name of the pipeline interface attribute for this project + + :return str: name of the pipeline interface attribute + """ + return self._extra_cli_or_cfg(PIFACE_KEY_SELECTOR) \ + or PIPELINE_INTERFACES_KEY + + @property + def toggle_key(self): + """ + Name of the toggle attribute for this project + + :return str: name of the toggle attribute + """ + return self._extra_cli_or_cfg(TOGGLE_KEY_SELECTOR) or SAMPLE_TOGGLE_ATTR + + @property + def selected_compute_package(self): + """ + Compute package name specified in object constructor + + :return str: compute package name + """ + return self._extra_cli_or_cfg(COMPUTE_PACKAGE_KEY) + + @property + def cli_pifaces(self): + """ + Collection of pipeline interface sources 
specified in object constructor + + :return list[str]: collection of pipeline interface sources + """ + x = self._extra_cli_or_cfg(self.piface_key) + return list(flatten([x] if not isinstance(x, list) else x)) + + @property + def output_dir(self): + """ + Output directory for the project, specified in object constructor + + :return str: path to the output directory + """ + return self._extra_cli_or_cfg(OUTDIR_KEY, strict=True) + + def _extra_cli_or_cfg(self, attr_name, strict=False): + """ + Get attribute value provided in kwargs in object constructor of from + looper section in the configuration file + + :param str attr_name: name of the attribute to get value for + :param bool strict: whether a non-existent attribute is exceptional + :raise MisconfigurationException: in strict mode, when no attribute + found + """ + try: + result = getattr(self[EXTRA_KEY], attr_name) + except (AttributeError, KeyError): + return + if result is not None: + return result + if CONFIG_KEY in self and LOOPER_KEY in self[CONFIG_KEY] \ + and attr_name in self[CONFIG_KEY][LOOPER_KEY]: + return self[CONFIG_KEY][LOOPER_KEY][attr_name] + else: + if strict: + raise MisconfigurationException( + "'{}' is missing. Provide it in the '{}' section of the " + "project configuration file".format(attr_name, LOOPER_KEY)) + return + + @property + def results_folder(self): + """ + Path to the results folder for the project + + :return str: path to the results folder in the output folder + """ + return self._out_subdir_path(RESULTS_SUBDIR_KEY, + default="results_pipeline") + + @property + def submission_folder(self): + """ + Path to the submission folder for the project + + :return str: path to the submission in the output folder + """ + return self._out_subdir_path(SUBMISSION_SUBDIR_KEY, + default="submission") + + def _out_subdir_path(self, key, default): + """ + Create a system path relative to the project output directory. 
+ The values for the names of the subdirectories are sourced from + kwargs passed to the object constructor. + + :param str key: name of the attribute mapped to the value of interest + :param str default: if key not specified, a default to use + :return str: path to the folder + """ + return os.path.join(getattr(self, OUTDIR_KEY), + getattr(self[EXTRA_KEY], key) or default) + + def make_project_dirs(self): + """ + Create project directory structure if it doesn't exist. + """ + for folder_key in ["results_folder", "submission_folder"]: + folder_path = getattr(self, folder_key) + _LOGGER.debug("Ensuring project dir exists: '{}'". + format(folder_path)) + if not os.path.exists(folder_path): + _LOGGER.debug("Attempting to create project folder: '{}'". + format(folder_path)) + try: + os.makedirs(folder_path) + except OSError as e: + _LOGGER.warning("Could not create project folder: '{}'". + format(str(e))) @property - def project_folders(self): - """ Critical project folder keys """ - return {OUTDIR_KEY: OUTDIR_KEY, RESULTS_SUBDIR_KEY: "results_pipeline", - SUBMISSION_SUBDIR_KEY: "submission"} + def project_pipeline_interface_sources(self): + """ + Get a list of all valid project-level pipeline interface sources + associated with this project. Sources that are file paths are expanded + + :return list[str]: collection of valid pipeline interface sources: + """ + return [expandpath(src) for src in self.cli_pifaces] \ + if self.cli_pifaces is not None else [] @property - def required_metadata(self): - """ Which metadata attributes are required. 
""" - return [OUTDIR_KEY] + def project_pipeline_interfaces(self): + """ + Flat list of all valid project-level interface objects associated + with this Project + + Note that only valid pipeline interfaces will show up in the + result (ones that exist on disk/remotely and validate successfully + against the schema) + + :return list[looper.PipelineInterface]: list of pipeline interfaces + """ + return [PipelineInterface(pi, pipeline_type="project") + for pi in self.project_pipeline_interface_sources] + + @property + def pipeline_interfaces(self): + """ + Flat list of all valid interface objects associated with this Project + + Note that only valid pipeline interfaces will show up in the + result (ones that exist on disk/remotely and validate successfully + against the schema) + + :return list[looper.PipelineInterface]: list of pipeline interfaces + """ + return [i for s in self._interfaces_by_sample.values() for i in s] + + @property + def pipeline_interface_sources(self): + """ + Get a list of all valid pipeline interface sources associated + with this project. Sources that are file paths are expanded + + :return list[str]: collection of valid pipeline interface sources + """ + return self._samples_by_interface.keys() + + # def _overwrite_sample_pifaces_with_cli(self, pifaces): + # """ + # Overwrite sample pipeline interface sources with the provided ones + # + # :param Iterable[str] | str | NoneType pifaces: collection of pipeline + # interface sources + # """ + # _LOGGER.debug("CLI-specified pifaces: {}".format(pifaces)) + # valid_pi = [] + # if not pifaces: + # # No CLI-specified pipeline interface sources + # return + # if isinstance(pifaces, str): + # pifaces = [pifaces] + # for piface in pifaces: + # pi = expandpath(piface) + # try: + # PipelineInterface(pi, pipeline_type="sample") + # except Exception as e: + # _LOGGER.warning("Provided pipeline interface source ({}) is " + # "invalid. Caught exception: {}". 
+ # format(pi, getattr(e, 'message', repr(e)))) + # else: + # valid_pi.append(pi) + # [setattr(s, self.piface_key, valid_pi) for s in self.samples] + # if valid_pi: + # _LOGGER.info("Provided valid pipeline interface sources ({}) " + # "set in all samples".format(", ".join(valid_pi))) + + def get_sample_piface(self, sample_name): + """ + Get a list of pipeline interfaces associated with the specified sample. + + Note that only valid pipeline interfaces will show up in the + result (ones that exist on disk/remotely and validate successfully + against the schema) + + :param str sample_name: name of the sample to retrieve list of + pipeline interfaces for + :return list[looper.PipelineInterface]: collection of valid + pipeline interfaces associated with selected sample + """ + try: + return self._interfaces_by_sample[sample_name] + except KeyError: + return None def build_submission_bundles(self, protocol, priority=True): """ @@ -83,197 +356,88 @@ def build_submission_bundles(self, protocol, priority=True): # the locations indicated in the project configuration file) as a # sort of pool of information about possible ways in which to submit # pipeline(s) for sample(s) of the indicated protocol. - try: - pipeline_interfaces = self.get_interfaces(protocol) - except KeyError: - # Messaging can be done by the caller. - _LOGGER.debug("No interface for protocol: %s", protocol) - return [] + pifaces = self.interfaces.get_pipeline_interface(protocol) + if not pifaces: + raise PipelineInterfaceConfigError( + "No interfaces for protocol: {}".format(protocol)) + + # coonvert to a list, in the future we might allow to match multiple + pifaces = pifaces if isinstance(pifaces, str) else [pifaces] job_submission_bundles = [] - pipeline_keys_used = set() - _LOGGER.debug("Building pipelines for {} interface(s)...". 
- format(len(pipeline_interfaces))) - - bundle_by_strict_pipe_key = {} - - for pipe_iface in pipeline_interfaces: - # "Break"-like mechanism for short-circuiting if we care only - # about the highest-priority match for pipeline submission. - # That is, if the intent is to submit pipeline(s) from a single - # location for each sample of the given protocol, we can stop - # searching the pool of pipeline interface information once we've - # found a match for the protocol. - if priority and len(job_submission_bundles) > 0: - return job_submission_bundles[0] - - this_protocol_pipelines = pipe_iface.fetch_pipelines(protocol) - if not this_protocol_pipelines: - _LOGGER.debug("No pipelines; available: {}".format( - ", ".join(pipe_iface.protocol_mapping.keys()))) - continue + new_jobs = [] - # TODO: update once dependency-encoding logic is in place. - # The proposed dependency-encoding format uses a semicolon - # between pipelines for which the dependency relationship is - # serial. For now, simply treat those as multiple independent - # pipelines by replacing the semicolon with a comma, which is the - # way in which multiple independent pipelines for a single protocol - # are represented in the mapping declaration. - pipeline_keys = \ - this_protocol_pipelines.replace(";", ",") \ - .strip(" ()\n") \ - .split(",") - # These cleaned pipeline keys are what's used to resolve the path - # to the pipeline to run. - pipeline_keys = [pk.strip() for pk in pipeline_keys] - - # Skip over pipelines already mapped by another location. - already_mapped, new_scripts = \ - partition(pipeline_keys, - partial(_is_member, items=pipeline_keys_used)) - pipeline_keys_used |= set(pipeline_keys) - - # Attempt to validate that partition yielded disjoint subsets. 
- try: - disjoint_partition_violation = \ - set(already_mapped) & set(new_scripts) - except TypeError: - _LOGGER.debug("Unable to hash partitions for validation") - else: - assert not disjoint_partition_violation, \ - "Partitioning {} with membership in {} as " \ - "predicate produced intersection: {}".format( - pipeline_keys, pipeline_keys_used, - disjoint_partition_violation) - - if len(already_mapped) > 0: - _LOGGER.debug("Skipping {} already-mapped script name(s): {}". - format(len(already_mapped), already_mapped)) - _LOGGER.debug("{} new scripts for protocol {} from " - "pipeline(s) location '{}': {}". - format(len(new_scripts), protocol, - pipe_iface.source, new_scripts)) - - # For each pipeline script to which this protocol will pertain, - # create the new jobs/submission bundles. - new_jobs = [] - for pipeline_key in new_scripts: - # Determine how to reference the pipeline and where it is. - strict_pipe_key, full_pipe_path, full_pipe_path_with_flags = \ - pipe_iface.finalize_pipeline_key_and_paths(pipeline_key) - - # Skip and warn about nonexistent alleged pipeline path. - if not (os.path.exists(full_pipe_path) or - is_command_callable(full_pipe_path)): - _LOGGER.warning( - "Missing pipeline script: '%s'", full_pipe_path) - continue - - if not pipe_iface.validate(pipeline_key): - unmet = pipe_iface.missing_requirements(pipeline_key) - _LOGGER.warning( - "{n} requirements unsatisfied for pipeline '{p}' " - "(interface from {s}): {data}".format( - n=len(unmet), p=pipeline_key, s=pipe_iface.source, - data=unmet)) - continue - - # Determine which interface and Sample subtype to use. - sample_subtype = \ - pipe_iface.fetch_sample_subtype( - protocol, strict_pipe_key, full_pipe_path) - - # Package the pipeline's interface, subtype, command, and key. - submission_bundle = SubmissionBundle( - pipe_iface, sample_subtype, strict_pipe_key, - full_pipe_path_with_flags) - - # Enforce bundle uniqueness for each strict pipeline key. 
- maybe_new_bundle = (full_pipe_path_with_flags, - sample_subtype, pipe_iface) - old_bundle = bundle_by_strict_pipe_key.setdefault( - strict_pipe_key, maybe_new_bundle) - if old_bundle != maybe_new_bundle: - errmsg = "Strict pipeline key '{}' maps to more than " \ - "one combination of pipeline script + flags, " \ - "sample subtype, and pipeline interface. " \ - "'{}'\n{}".format( - strict_pipe_key, maybe_new_bundle, old_bundle) - raise ValueError(errmsg) - - # Add this bundle to the collection of ones relevant for the - # current PipelineInterface. - new_jobs.append(submission_bundle) + _LOGGER.debug("Building pipelines matched by protocol: {}". + format(protocol)) - job_submission_bundles.append(new_jobs) + for pipe_iface in pifaces: + # Determine how to reference the pipeline and where it is. + path = pipe_iface["path"] + if not (os.path.exists(path) or is_command_callable(path)): + _LOGGER.warning("Missing pipeline script: {}".format(path)) + continue - # Repeat logic check of short-circuit conditional to account for - # edge case in which it's satisfied during the final iteration. - if priority and len(job_submission_bundles) > 1: - return job_submission_bundles[0] - else: - return list(itertools.chain(*job_submission_bundles)) + # Add this bundle to the collection of ones relevant for the + # current PipelineInterface. + new_jobs.append(pipe_iface) + job_submission_bundles.append(new_jobs) + return list(itertools.chain(*job_submission_bundles)) - def get_interfaces(self, protocol): + @staticmethod + def get_schemas(pifaces, schema_key=INPUT_SCHEMA_KEY): """ - Get the pipeline interfaces associated with the given protocol. 
+ Get the list of unique schema paths for a list of pipeline interfaces - :param str protocol: name of the protocol for which to get interfaces - :return Iterable[looper.PipelineInterface]: collection of pipeline - interfaces associated with the given protocol - :raise KeyError: if the given protocol is not (perhaps yet) mapped - to any pipeline interface + :param str | Iterable[str] pifaces: pipeline interfaces to search + schemas for + :param str schema_key: where to look for schemas in the piface + :return Iterable[str]: unique list of schema file paths """ - return self.interfaces[protocol] - - def get_outputs(self, skip_sample_less=True): + if isinstance(pifaces, str): + pifaces = [pifaces] + schema_set = set() + for piface in pifaces: + schema_file = piface.get_pipeline_schemas(schema_key) + if schema_file: + schema_set.update([schema_file]) + return list(schema_set) + + def populate_pipeline_outputs(self, check_exist=False): """ - Map pipeline identifier to collection of output specifications. - - This method leverages knowledge of two collections of different kinds - of entities that meet in the manifestation of a Project. The first - is a collection of samples, which is known even in peppy.Project. The - second is a mapping from protocol/assay/library strategy to a collection - of pipeline interfaces, in which kinds of output may be declared. - - Knowledge of these two items is here harnessed to map the identifier - for each pipeline about which this Project is aware to a collection of - pairs of identifier for a kind of output and the collection of - this Project's samples for which it's applicable (i.e., those samples - with protocol that maps to the corresponding pipeline). + Populate project and sample output attributes based on output schemas + that pipeline interfaces point to. 
Additionally, if requested, check + for the constructed paths existence on disk + """ + for sample in self.samples: + sample_piface = self.get_sample_piface(sample[SAMPLE_NAME_ATTR]) + if sample_piface: + paths = self.get_schemas(sample_piface, OUTPUT_SCHEMA_KEY) + for path in paths: + schema = read_schema(path)[-1] + try: + populate_project_paths(self, schema, check_exist) + populate_sample_paths(sample, schema, check_exist) + except PathAttrNotFoundError: + _LOGGER.error( + "Missing outputs of pipelines matched by protocol: " + "{}".format(sample.protocol) + ) + raise + + def _piface_by_samples(self): + """ + Create a mapping of all defined interfaces in this Project by samples. - :param bool skip_sample_less: whether to omit pipelines that are for - protocols of which the Project has no Sample instances - :return Mapping[str, Mapping[str, namedtuple]]: collection of bindings - between identifier for pipeline and collection of bindings between - name for a kind of output and pair in which first component is a - path template and the second component is a collection of - sample names - :raise TypeError: if argument to sample-less pipeline skipping parameter - is not a Boolean + :return list[str]: a collection of pipeline interfaces keyed by + sample name """ - if not isinstance(skip_sample_less, bool): - raise TypeError( - "Non-Boolean argument to sample-less skip flag: {} ({})". - format(skip_sample_less, type(skip_sample_less))) - prots_data_pairs = _gather_ifaces(self.interfaces) - m = {} - for name, (prots, data) in prots_data_pairs.items(): - try: - outs = data[OUTKEY] - except KeyError: - _LOGGER.debug("No {} declared for pipeline: {}". - format(OUTKEY, name)) - continue - snames = [s.name for s in self.samples if s.protocol in prots] - if not snames and skip_sample_less: - _LOGGER.debug("No samples matching protocol(s): {}". 
- format(", ".join(prots))) - continue - m[name] = {path_key: (path_val, snames) - for path_key, path_val in outs.items()} - return m + pifaces_by_sample = {} + for source, sample_names in self._samples_by_interface.items(): + for sample_name in sample_names: + pifaces_by_sample.setdefault(sample_name, []) + pifaces_by_sample[sample_name].\ + append(PipelineInterface(source, pipeline_type="sample")) + return pifaces_by_sample def _omit_from_repr(self, k, cls): """ @@ -284,84 +448,134 @@ def _omit_from_repr(self, k, cls): """ return super(Project, self)._omit_from_repr(k, cls) or k == "interfaces" + def _samples_by_piface(self, piface_key): + """ + Create a collection of all samples with valid pipeline interfaces -def _gather_ifaces(ifaces): - """ - For each pipeline map identifier to protocols and interface data. - - :param Iterable[looper.PipelineInterface] ifaces: collection of pipeline - interface objects - :return Mapping[str, (set[str], attmap.AttMap)]: collection of bindings - between pipeline identifier and pair in which first component is - collection of associated protocol names, and second component is a - collection of interface data for pipeline identified by the key - :raise looper.DuplicatePipelineKeyException: if the same identifier (key or - name) points to collections of pipeline interface data (for a - particular pipeline) that are not equivalent - """ - specs = {} - for pi in ifaces: - protos_by_name = {} - for p, names in pi[PROTOMAP_KEY].items(): - if isinstance(names, str): - names = [names] - for n in names: - protos_by_name.setdefault(n, set()).add(p) - for k, dat in pi.iterpipes(): - name = dat.get("name") or k - try: - old_prots, old_dat = specs[name] - except KeyError: - old_prots = set() - else: - if dat != old_dat: - raise DuplicatePipelineKeyException(name) - new_prots = protos_by_name.get(name, set()) | \ - protos_by_name.get(k, set()) - specs[name] = (old_prots | new_prots, dat) - return specs - - -def 
process_pipeline_interfaces(pipeline_interface_locations): + :param str piface_key: name of the attribute that holds pipeline + interfaces + :return list[str]: a collection of samples keyed by pipeline interface + source + """ + def _resolve_path(pth): + """ + Expand provided path to the pipeline interface and make it absolute + using project config path + + :param str pth: path, possibly including env vars and/or relative + :return str: absolute path + """ + pth = expandpath(pth) + if not os.path.isabs(pth): + pth = os.path.realpath(os.path.join(os.path.dirname( + self.config_file), pth)) + _LOGGER.debug("Relative path to pipeline interface source made " + "absolute: {}".format(pth)) + return pth + samples_by_piface = {} + msgs = set() + for sample in self.samples: + if piface_key in sample and sample[piface_key]: + piface_srcs = sample[piface_key] + if isinstance(piface_srcs, str): + piface_srcs = [piface_srcs] + for source in piface_srcs: + source = _resolve_path(source) + try: + PipelineInterface(source, pipeline_type="sample") + except (ValidationError, IOError) as e: + msg = "Ignoring invalid pipeline interface source: " \ + "{}. Caught exception: {}".\ + format(source, getattr(e, 'message', repr(e))) + msgs.add(msg) + continue + else: + samples_by_piface.setdefault(source, set()) + samples_by_piface[source].add(sample[SAMPLE_NAME_ATTR]) + for msg in msgs: + _LOGGER.warning(msg) + return samples_by_piface + + +def fetch_samples(prj, selector_attribute=None, selector_include=None, + selector_exclude=None): """ - Create a PipelineInterface for each pipeline location given. - - :param Iterable[str] pipeline_interface_locations: locations, each of - which should be either a directory path or a filepath, that specifies - pipeline interface and protocol mappings information. Each such file - should have a pipelines section and a protocol mappings section. 
- :return Mapping[str, Iterable[PipelineInterface]]: mapping from protocol - name to interface(s) for which that protocol is mapped + Collect samples of particular protocol(s). + + Protocols can't be both positively selected for and negatively + selected against. That is, it makes no sense and is not allowed to + specify both selector_include and selector_exclude protocols. On the + other hand, if + neither is provided, all of the Project's Samples are returned. + If selector_include is specified, Samples without a protocol will be + excluded, + but if selector_exclude is specified, protocol-less Samples will be + included. + + :param Project prj: the Project with Samples to fetch + :param str selector_attribute: name of attribute on which to base the + fetch + :param Iterable[str] | str selector_include: protocol(s) of interest; + if specified, a Sample must + :param Iterable[str] | str selector_exclude: protocol(s) to include + :return list[Sample]: Collection of this Project's samples with + protocol that either matches one of those in selector_include, + or either + lacks a protocol or does not match one of those in selector_exclude + :raise TypeError: if both selector_include and selector_exclude + protocols are + specified; TypeError since it's basically providing two arguments + when only one is accepted, so remain consistent with vanilla + Python2; + also possible if name of attribute for selection isn't a string """ - iface_group = ProjectPifaceGroup() - for loc in pipeline_interface_locations: - if not os.path.exists(loc): - _LOGGER.warning("Ignoring nonexistent pipeline interface location: " - "{}".format(loc)) - continue - fs = [loc] if os.path.isfile(loc) else \ - [os.path.join(loc, f) for f in os.listdir(loc) - if os.path.splitext(f)[1] in [".yaml", ".yml"]] - for f in fs: - _LOGGER.debug("Processing interface definition: {}".format(f)) - try: - iface_group.update(f) - except PipelineInterfaceRequirementsError as e: - _LOGGER.warning("Cannot build 
pipeline interface from {} ({})". - format(f, str(e))) - return iface_group - - -OutputGroup = namedtuple("OutputGroup", field_names=["path", "samples"]) - - -# Collect PipelineInterface, Sample type, pipeline path, and script with flags. -SubmissionBundle = namedtuple( - "SubmissionBundle", - field_names=["interface", "subtype", "pipeline", "pipeline_with_flags"]) -SUBMISSION_BUNDLE_PIPELINE_KEY_INDEX = 2 - - -def _is_member(item, items): - """ Determine whether an item is a member of a collection. """ - return item in items + if selector_attribute is None or \ + (not selector_include and not selector_exclude): + # Simple; keep all samples. In this case, this function simply + # offers a list rather than an iterator. + return list(prj.samples) + + if not isinstance(selector_attribute, str): + raise TypeError( + "Name for attribute on which to base selection isn't string: " + "{} " + "({})".format(selector_attribute, type(selector_attribute))) + + # At least one of the samples has to have the specified attribute + if prj.samples and not any( + [hasattr(s, selector_attribute) for s in prj.samples]): + raise AttributeError( + "The Project samples do not have the attribute '{attr}'". + format(attr=selector_attribute)) + + # Intersection between selector_include and selector_exclude is + # nonsense user error. + if selector_include and selector_exclude: + raise TypeError( + "Specify only selector_include or selector_exclude parameter, " + "not both.") + + # Ensure that we're working with sets. + def make_set(items): + if isinstance(items, str): + items = [items] + return items + + # Use the attr check here rather than exception block in case the + # hypothetical AttributeError would occur; we want such + # an exception to arise, not to catch it as if the Sample lacks + # "protocol" + if not selector_include: + # Loose; keep all samples not in the selector_exclude. 
+ def keep(s): + return not hasattr(s, selector_attribute) \ + or getattr(s, selector_attribute) \ + not in make_set(selector_exclude) + else: + # Strict; keep only samples in the selector_include. + def keep(s): + return hasattr(s, selector_attribute) \ + and getattr(s, selector_attribute) \ + in make_set(selector_include) + + return list(filter(keep, prj.samples)) diff --git a/looper/project_piface_group.py b/looper/project_piface_group.py deleted file mode 100644 index f2792c059..000000000 --- a/looper/project_piface_group.py +++ /dev/null @@ -1,117 +0,0 @@ -""" Group of Project's PipelineInterface instances """ - -import sys -if sys.version_info < (3, 3): - from collections import Mapping -else: - from collections.abc import Mapping -from .pipeline_interface import PipelineInterface, PROTOMAP_KEY -from .utils import get_logger - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - -_LOGGER = get_logger(__name__) - - -class ProjectPifaceGroup(object): - """ Collection of PipelineInterface instances and lookup-by-protocol. """ - - def __init__(self, piface=None): - """ - Create the group, either empty or with initial data. - - :param str | Mapping | looper.PipelineInterface piface: either pipeline - interface file, pipeline interface, or interface-defining mapping - """ - self._interfaces = [] - self._indices_by_protocol = {} - piface and self.update(piface) - - def __eq__(self, other): - """ - Instances are equivalent iff interfaces and protocol mappings are. - - :param looper.project_piface_group.ProjectPifaceGroup other: the group - to compare to this one - :return bool: whether this group is equivalent to the compared one - """ - return isinstance(other, ProjectPifaceGroup) and \ - self._interfaces == other._interfaces and \ - self._indices_by_protocol == other._indices_by_protocol - - def __ne__(self, other): - """ Leverage the overridden equivalence operator. 
""" - return not self == other - - def __getitem__(self, item): - """ - Retrieve interfaces for given protocol name. - - :param str item: name of protocol for which to fetch interfaces. - :return Iterable[looper.PipelineInterface]: - """ - return [self._interfaces[i] for i in self._indices_by_protocol[item]] - - def __iter__(self): - """ - Iteration is over the interfaces. - - :return Iterable[looper.PipelineInterface]: iterator over this group's - PipelineInterface instances - """ - return iter(self._interfaces) - - def __len__(self): - """ - Group size is the number of interfaces. - - :return int: number of interfaces in this group - """ - return sum(1 for _ in iter(self)) - - @property - def protocols(self): - """ - Get the collection of names of protocols mapping into this group. - - :return list[str]: collection of protocol names that map to at least - one pipeline represented by an interface in this group - """ - return [p for p in self._indices_by_protocol] - - def update(self, piface): - """ - Add a pipeline interface to this group. - - :param str | Mapping | looper.PipelineInterface piface: either pipeline - interface file, pipeline interface, or interface-defining mapping - :return looper.project_piface_group.ProjectPifaceGroup: updated instance - :raise TypeError: if the argument to the piface parameter is neither - text (filepath) nor a PipelineInterface or Mapping; additional - exception cases may arise from ensuing attempt to create a - PipelineInterface from the argument if the argument itself is not - already a PipelineInterface. 
- """ - if isinstance(piface, PipelineInterface): - _LOGGER.debug("Interface group argument is already an interface.") - elif isinstance(piface, (str, Mapping)): - piface = PipelineInterface(piface) - elif not isinstance(piface, PipelineInterface): - raise TypeError( - "Update value must be {obj}-defining filepath or {obj} itself; " - "got {argtype}".format( - obj=PipelineInterface.__name__, argtype=type(piface))) - assert isinstance(piface, PipelineInterface) - for curr in self._interfaces: - if curr == piface: - _LOGGER.whisper("Found existing {} match: {}".format( - PipelineInterface.__class__.__name__, piface)) - break - else: - self._interfaces.append(piface) - i = len(self._interfaces) - 1 - for p in piface[PROTOMAP_KEY]: - self._indices_by_protocol.setdefault(p, []).append(i) - return self diff --git a/looper/sample.py b/looper/sample.py deleted file mode 100644 index cfe18cd05..000000000 --- a/looper/sample.py +++ /dev/null @@ -1,291 +0,0 @@ -""" Extension of peppy.Sample to support looper-specific operations. """ - -import os -from operator import itemgetter -from peppy import Sample as PeppySample -from peppy.const import * -from peppy.sample import SAMPLE_YAML_EXT -from peppy.utils import get_logger -from ngstk import get_file_size, parse_ftype, \ - peek_read_lengths_and_paired_counts_from_bam - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - -__all__ = ["Sample"] - -_LOGGER = get_logger(__name__) - - -class Sample(PeppySample): - - def __init__(self, series, prj=None): - super(Sample, self).__init__(series, prj) - - def determine_missing_requirements(self): - """ - Determine which of this Sample's required attributes/files are missing. - - :return (type, str): hypothetical exception type along with message - about what's missing; null and empty if nothing exceptional - is detected - """ - - null_return = (None, "", "") - - # set_pipeline_attributes must be run first. 
- if not hasattr(self, "required_inputs"): - _LOGGER.warning("You must run set_pipeline_attributes before " - "determine_missing_requirements") - return null_return - - if not self.required_inputs: - _LOGGER.debug("No required inputs") - return null_return - - # First, attributes - missing, empty = [], [] - for file_attribute in self.required_inputs_attr: - _LOGGER.whisper("Checking '{}'".format(file_attribute)) - try: - attval = getattr(self, file_attribute) - except AttributeError: - _LOGGER.whisper( - "Missing required input attribute '%s'", file_attribute) - missing.append(file_attribute) - continue - if attval == "": - _LOGGER.whisper( - "Empty required input attribute '%s'", file_attribute) - empty.append(file_attribute) - else: - _LOGGER.whisper( - "'{}' is valid: '{}'".format(file_attribute, attval)) - - if missing: - reason_key = "Missing attribute" - reason_detail = "Missing: {}".format(", ".join(missing)) - return AttributeError, reason_key, reason_detail - - if empty: - reason_key = "Empty attribute" - reason_detail = "Empty: {}".format(",".join(empty)) - return AttributeError, reason_key, reason_detail - - # Second, files - missing_files = [] - for paths in self.required_inputs: - _LOGGER.whisper("Text to split and check paths: '%s'", paths) - # There can be multiple, space-separated values here. - for path in paths.split(" "): - _LOGGER.whisper("Checking path: '{}'".format(path)) - if not os.path.exists(path): - _LOGGER.whisper( - "Missing required input file: '{}'".format(path)) - missing_files.append(path) - - if not missing_files: - return null_return - else: - reason_key = "Missing file(s)" - reason_detail = ", ".join(missing_files) - return IOError, reason_key, reason_detail - - def generate_filename(self, delimiter="_"): - """ - Create a name for file in which to represent this Sample. - - This uses knowledge of the instance's subtype, sandwiching a delimiter - between the name of this Sample and the name of the subtype before the - extension. 
If the instance is a base Sample type, then the filename - is simply the sample name with an extension. - - :param str delimiter: what to place between sample name and name of - subtype; this is only relevant if the instance is of a subclass - :return str: name for file with which to represent this Sample on disk - """ - base = self.name if type(self) is Sample else \ - "{}{}{}".format(self.name, delimiter, type(self).__name__) - return "{}{}".format(base, SAMPLE_YAML_EXT) - - def set_pipeline_attributes( - self, pipeline_interface, pipeline_name, permissive=True): - """ - Set pipeline-specific sample attributes. - - Some sample attributes are relative to a particular pipeline run, - like which files should be considered inputs, what is the total - input file size for the sample, etc. This function sets these - pipeline-specific sample attributes, provided via a PipelineInterface - object and the name of a pipeline to select from that interface. - - :param PipelineInterface pipeline_interface: A PipelineInterface - object that has the settings for this given pipeline. - :param str pipeline_name: Which pipeline to choose. - :param bool permissive: whether to simply log a warning or error - message rather than raising an exception if sample file is not - found or otherwise cannot be read, default True - """ - - # Settings ending in _attr are lists of attribute keys. - # These attributes are then queried to populate values - # for the primary entries. 
- req_attr_names = [("ngs_input_files", "ngs_inputs_attr"), - ("required_input_files", REQUIRED_INPUTS_ATTR_NAME), - ("all_input_files", ALL_INPUTS_ATTR_NAME)] - for name_src_attr, name_dst_attr in req_attr_names: - _LOGGER.whisper("Value of '%s' will be assigned to '%s'", - name_src_attr, name_dst_attr) - value = pipeline_interface.get_attribute( - pipeline_name, name_src_attr) - _LOGGER.whisper("Assigning '{}': {}".format(name_dst_attr, value)) - setattr(self, name_dst_attr, value) - - # Post-processing of input attribute assignments. - # Ensure that there's a valid all_inputs_attr. - if not getattr(self, ALL_INPUTS_ATTR_NAME): - required_inputs = getattr(self, REQUIRED_INPUTS_ATTR_NAME) - setattr(self, ALL_INPUTS_ATTR_NAME, required_inputs) - # Convert attribute keys into values. - if self.ngs_inputs_attr: - _LOGGER.whisper("Handling NGS input attributes: '%s'", self.name) - # NGS data inputs exit, so we can add attributes like - # read_type, read_length, paired. - self.ngs_inputs = self.get_attr_values("ngs_inputs_attr") - - set_rtype_reason = "" - if not hasattr(self, "read_type"): - set_rtype_reason = "read_type not yet set" - elif not self.read_type or self.read_type.lower() \ - not in VALID_READ_TYPES: - set_rtype_reason = "current read_type is invalid: '{}'". \ - format(self.read_type) - if set_rtype_reason: - _LOGGER.debug( - "Setting read_type for %s '%s': %s", - self.__class__.__name__, self.name, set_rtype_reason) - self.set_read_type(permissive=permissive) - else: - _LOGGER.debug("read_type is already valid: '%s'", - self.read_type) - else: - _LOGGER.whisper("No NGS inputs: '%s'", self.name) - - # Assign values for actual inputs attributes. 
- self.required_inputs = self.get_attr_values(REQUIRED_INPUTS_ATTR_NAME) - self.all_inputs = self.get_attr_values(ALL_INPUTS_ATTR_NAME) - _LOGGER.debug("All '{}' inputs: {}".format(self.name, self.all_inputs)) - self.input_file_size = get_file_size(self.all_inputs) - - def set_read_type(self, rlen_sample_size=10, permissive=True): - """ - For a sample with attr `ngs_inputs` set, this sets the - read type (single, paired) and read length of an input file. - - :param int rlen_sample_size: Number of reads to sample to infer read type, - default 10. - :param bool permissive: whether to simply log a warning or error message - rather than raising an exception if sample file is not found or - otherwise cannot be read, default True. - """ - - # TODO: determine how return is being used and standardized (null vs. bool) - - # Initialize the parameters in case there is no input_file, so these - # attributes at least exist - as long as they are not already set! - for attr in ["read_length", "read_type", "paired"]: - if not hasattr(self, attr): - _LOGGER.whisper("Setting null for missing attribute: '%s'", - attr) - setattr(self, attr, None) - - # ngs_inputs must be set - if not self.ngs_inputs: - return False - - ngs_paths = " ".join(self.ngs_inputs) - - # Determine extant/missing filepaths. - existing_files = list() - missing_files = list() - for path in ngs_paths.split(" "): - if not os.path.exists(path): - missing_files.append(path) - else: - existing_files.append(path) - _LOGGER.debug("{} extant file(s): {}". - format(len(existing_files), existing_files)) - _LOGGER.debug("{} missing file(s): {}". - format(len(missing_files), missing_files)) - - # For samples with multiple original BAM files, check all. 
- files = list() - check_by_ftype = {"bam": peek_read_lengths_and_paired_counts_from_bam, - "fastq": _check_fastq} - for input_file in existing_files: - try: - file_type = parse_ftype(input_file) - read_lengths, paired = check_by_ftype[file_type]( - input_file, rlen_sample_size) - except (KeyError, TypeError): - message = "Input file type should be one of: {}".format( - check_by_ftype.keys()) - if not permissive: - raise TypeError(message) - _LOGGER.error(message) - return - except NotImplementedError as e: - if not permissive: - raise - _LOGGER.warning(str(e)) - return - except IOError: - if not permissive: - raise - _LOGGER.error("Input file does not exist or " - "cannot be read: %s", str(input_file)) - for feat_name in self._FEATURE_ATTR_NAMES: - if not hasattr(self, feat_name): - setattr(self, feat_name, None) - return - - # Determine most frequent read length among sample. - rlen, _ = sorted(read_lengths.items(), key=itemgetter(1))[-1] - _LOGGER.log(5, - "Selected {} as most frequent read length from " - "sample read length distribution: {}".format( - rlen, read_lengths)) - - # Decision about paired-end status is majority-rule. 
- if paired > (rlen_sample_size / 2): - read_type = "paired" - paired = True - else: - read_type = "single" - paired = False - - files.append([rlen, read_type, paired]) - - # Check agreement between different files - # if all values are equal, set to that value; - # if not, set to None and warn the user about the inconsistency - for i, feature in enumerate(self._FEATURE_ATTR_NAMES): - feature_values = set(f[i] for f in files) - if 1 == len(feature_values): - feat_val = files[0][i] - else: - _LOGGER.whisper("%d values among %d files for feature '%s'", - len(feature_values), len(files), feature) - feat_val = None - _LOGGER.whisper("Setting '%s' on %s to %s", - feature, self.__class__.__name__, feat_val) - setattr(self, feature, feat_val) - - if getattr(self, feature) is None and len(existing_files) > 0: - _LOGGER.warning( - "Not all input files agree on '%s': '%s'", feature, self.name) - - -def _check_fastq(fastq, o): - raise NotImplementedError( - "Detection of read type/length for fastq input is not yet implemented.") diff --git a/looper/schemas/pipeline_interface_schema_generic.yaml b/looper/schemas/pipeline_interface_schema_generic.yaml new file mode 100644 index 000000000..930051100 --- /dev/null +++ b/looper/schemas/pipeline_interface_schema_generic.yaml @@ -0,0 +1,37 @@ +description: pipeline interface schema + +properties: + pipeline_name: + type: string + pattern: "^\\S*$" + description: "name of the pipeline with no whitespaces" + pipeline_type: + type: string + enum: ["project", "sample"] + description: "type of the pipeline, either 'project' or 'sample'" + command_template: + type: string + description: "Jinja2-like template to construct the command to run" + path: + type: string + description: "path to the pipeline program. Relative to pipeline interface file or absolute." 
+ compute: + type: object + description: "Section that defines compute environment settings" + properties: + dynamic_variables_command_template: + type: string + description: "Jinja2-like template to construct the command that returns a JSON object used to populate compute environment settings" + size_dependent_variables: + type: string + description: "Path to the TSV-formatted file with compute environment settings" + bulker_crate: + type: string + description: "Bulker registry path idendifying the crate to use" + docker_image: + type: string + description: "Docker image identifier" + singularity_image: + type: string + description: "Singularity image identifier" +required: [pipeline_name, pipeline_type, command_template] \ No newline at end of file diff --git a/looper/schemas/pipeline_interface_schema_project.yaml b/looper/schemas/pipeline_interface_schema_project.yaml new file mode 100644 index 000000000..32c995d8e --- /dev/null +++ b/looper/schemas/pipeline_interface_schema_project.yaml @@ -0,0 +1,37 @@ +description: project pipeline interface schema + +properties: + pipeline_name: + type: string + pattern: "^\\S*$" + description: "name of the pipeline with no whitespaces" + pipeline_type: + type: string + const: "project" + description: "type of the pipeline, must be 'project'" + command_template: + type: string + description: "Jinja2-like template to construct the command to run" + path: + type: string + description: "path to the pipeline program. Relative to pipeline interface file or absolute." 
+ compute: + type: object + description: "Section that defines compute environment settings" + properties: + dynamic_variables_command_template: + type: string + description: "Jinja2-like template to construct the command that returns a JSON object used to populate compute environment settings" + size_dependent_variables: + type: string + description: "Path to the TSV-formatted file with compute environment settings" + bulker_crate: + type: string + description: "Bulker registry path idendifying the crate to use" + docker_image: + type: string + description: "Docker image identifier" + singularity_image: + type: string + description: "Singularity image identifier" +required: [pipeline_name, pipeline_type, command_template] \ No newline at end of file diff --git a/looper/schemas/pipeline_interface_schema_sample.yaml b/looper/schemas/pipeline_interface_schema_sample.yaml new file mode 100644 index 000000000..0a814763d --- /dev/null +++ b/looper/schemas/pipeline_interface_schema_sample.yaml @@ -0,0 +1,37 @@ +description: sample pipeline interface schema + +properties: + pipeline_name: + type: string + pattern: "^\\S*$" + description: "name of the pipeline with no whitespaces" + pipeline_type: + type: string + const: "sample" + description: "type of the pipeline, must be 'sample'" + command_template: + type: string + description: "Jinja2-like template to construct the command to run" + path: + type: string + description: "path to the pipeline program. Relative to pipeline interface file or absolute." 
+ compute: + type: object + description: "Section that defines compute environment settings" + properties: + dynamic_variables_command_template: + type: string + description: "Jinja2-like template to construct the command that returns a JSON object used to populate compute environment settings" + size_dependent_variables: + type: string + description: "Path to the TSV-formatted file with compute environment settings" + bulker_crate: + type: string + description: "Bulker registry path idendifying the crate to use" + docker_image: + type: string + description: "Docker image identifier" + singularity_image: + type: string + description: "Singularity image identifier" +required: [pipeline_name, pipeline_type, command_template] \ No newline at end of file diff --git a/looper/utils.py b/looper/utils.py index 42c44153d..eeee58de1 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -1,121 +1,19 @@ """ Helpers without an obvious logical home. """ from collections import defaultdict, Iterable -import copy +from logging import getLogger import glob import os - -from peppy import \ - FLAGS, SAMPLE_INDEPENDENT_PROJECT_SECTIONS, SAMPLE_NAME_COLNAME -from peppy.utils import get_logger from .const import * +from .exceptions import MisconfigurationException +from peppy.const import * +from peppy import Project as peppyProject +import jinja2 +import yaml +import argparse +from ubiquerg import convert_value, expandpath - -DEFAULT_METADATA_FOLDER = "metadata" -DEFAULT_CONFIG_SUFFIX = "_config.yaml" - - -_LOGGER = get_logger(__name__) - - -def create_looper_args_text(pl_key, submission_settings, prj): - """ - - :param str pl_key: Strict/exact pipeline key, the hook into the project's - pipeline configuration data - :param dict submission_settings: Mapping from settings - key to value, used to determine resource request - :param Project prj: Project data, used for metadata and pipeline - configuration information - :return str: text representing the portion of a command generated by - 
looper options and arguments - """ - - # Start with copied settings and empty arguments text - submission_settings = copy.deepcopy(submission_settings) - opt_arg_pairs = [("-O", prj.results_folder)] - - if hasattr(prj, "pipeline_config"): - # Index with 'pl_key' instead of 'pipeline' - # because we don't care about parameters here. - if hasattr(prj.pipeline_config, pl_key): - # First priority: pipeline config in project config - pl_config_file = getattr(prj.pipeline_config, pl_key) - # Make sure it's a file (it could be provided as null.) - if pl_config_file: - if not os.path.isfile(pl_config_file): - _LOGGER.error( - "Pipeline config file specified " - "but not found: %s", pl_config_file) - raise IOError(pl_config_file) - _LOGGER.info("Found config file: %s", pl_config_file) - # Append arg for config file if found - opt_arg_pairs.append(("-C", pl_config_file)) - else: - _LOGGER.debug("No pipeline configuration: %s", pl_key) - else: - _LOGGER.debug("Project lacks pipeline configuration") - - num_cores = int(submission_settings.setdefault("cores")) - if num_cores > 1: - opt_arg_pairs.append(("-P", num_cores)) - - try: - mem_alloc = submission_settings["mem"] - except KeyError: - _LOGGER.warning("Submission settings lack memory specification") - else: - if float(mem_alloc) > 1: - opt_arg_pairs.append(("-M", mem_alloc)) - - looper_argtext = " ".join(["{} {}".format(opt, arg) - for opt, arg in opt_arg_pairs]) - return looper_argtext - - -def determine_config_path( - root, folders=(DEFAULT_METADATA_FOLDER, ), - patterns=("*" + DEFAULT_CONFIG_SUFFIX, )): - """ - Determine path to Project config file, allowing folder-based specification. 
- - :param str root: path to file or main (e.g., project, folder) - :param Iterable[str] folders: collection of names of subfolders to consider - :param Iterable[str] patterns: collection of filename patterns to consider - :return str: unique path to extant Project config file - :raise ValueError: if the given root path doesn't exist, or if multiple - matching files are found - """ - - # Base cases - if not os.path.exists(root): - raise ValueError("Path doesn't exist: {}".format(root)) - if os.path.isfile(root): - return root - - # Deal with single-string argument. - if isinstance(folders, str): - folders = (folders, ) - if isinstance(patterns, str): - patterns = (patterns, ) - - # Search particular folder for any pattern - def search(path): - return [m for p in patterns for m in glob.glob(os.path.join(path, p))] - - # Search results - top_res = search(root) - sub_res = [m for sub in folders for m in search(os.path.join(root, sub))] - all_res = top_res + sub_res - - # Deal with the 3 match count cases. 
- if len(all_res) > 1: - raise ValueError("Multiple ({}) config paths: {}".format( - len(all_res), ", ".join(map(str, all_res)))) - try: - return all_res[0] - except IndexError: - return None +_LOGGER = getLogger(__name__) def fetch_flag_files(prj=None, results_folder="", flags=FLAGS): @@ -166,30 +64,24 @@ def fetch_flag_files(prj=None, results_folder="", flags=FLAGS): return files_by_flag -def fetch_sample_flags(prj, sample, pl_names=None): +def fetch_sample_flags(prj, sample, pl_name): """ Find any flag files present for a sample associated with a project :param looper.Project prj: project of interest - :param peppy.Sample | str sample: sample object or sample name of interest - :param str | Iterable[str] pl_names: name of the pipeline for which flag(s) - should be found + :param peppy.Sample sample: sample object of interest + :param str pl_name: name of the pipeline for which flag(s) should be found :return Iterable[str]: collection of flag file path(s) associated with the given sample for the given project """ - sfolder = os.path.join(prj.results_folder, sample) if isinstance(sample, str) \ - else sample_folder(prj=prj, sample=sample) + sfolder = sample_folder(prj=prj, sample=sample) if not os.path.isdir(sfolder): - _LOGGER.debug("Folder doesn't exist for sample {}: {}".format(str(sample), sfolder)) + _LOGGER.debug("Results folder ({}) doesn't exist for sample {}". 
+ format(sfolder, str(sample))) return [] - if not pl_names: - pl_match = lambda _: True - else: - if isinstance(pl_names, str): - pl_names = [pl_names] - pl_match = lambda n: any(n.startswith(pl) for pl in pl_names) - return [os.path.join(sfolder, f) for f in os.listdir(sfolder) - if os.path.splitext(f)[1] == ".flag" and pl_match(f)] + folder_contents = [os.path.join(sfolder, f) for f in os.listdir(sfolder)] + return [x for x in folder_contents if os.path.splitext(x)[1] == ".flag" + and os.path.basename(x).startswith(pl_name)] def grab_project_data(prj): @@ -206,47 +98,16 @@ def grab_project_data(prj): :param Project prj: Project from which to grab data :return Mapping: Sample-independent data sections from given Project """ - if not prj: return {} - data = {} - for section in SAMPLE_INDEPENDENT_PROJECT_SECTIONS: - try: - data[section] = prj[section] - except KeyError: - _LOGGER.debug("Project lacks section '%s', skipping", section) - + try: + data = prj[CONFIG_KEY] + except KeyError: + _LOGGER.debug("Project lacks section '%s', skipping", CONFIG_KEY) return data -def partition(items, test): - """ - Partition items into a pair of disjoint multisets, - based on the evaluation of each item as input to boolean test function. - There are a couple of evaluation options here. One builds a mapping - (assuming each item is hashable) from item to boolean test result, then - uses that mapping to partition the elements on a second pass. - The other simply is single-pass, evaluating the function on each item. - A time-costly function suggests the two-pass, mapping-based approach while - a large input suggests a single-pass approach to conserve memory. We'll - assume that the argument is not terribly large and that the function is - cheap to compute and use a simpler single-pass approach. 
- - :param Sized[object] items: items to partition - :param function(object) -> bool test: test to apply to each item to - perform the partitioning procedure - :return: list[object], list[object]: partitioned items sequences - """ - passes, fails = [], [] - _LOGGER.whisper("Testing {} items: {}".format(len(items), items)) - for item in items: - _LOGGER.whisper("Testing item {}".format(item)) - group = passes if test(item) else fails - group.append(item) - return passes, fails - - def sample_folder(prj, sample): """ Get the path to this Project's root folder for the given Sample. @@ -257,4 +118,218 @@ def sample_folder(prj, sample): :return str: this Project's root folder for the given Sample """ return os.path.join(prj.results_folder, - sample[SAMPLE_NAME_COLNAME]) + sample[SAMPLE_NAME_ATTR]) + + +def get_file_for_project(prj, appendix): + """ + Create a path to the file for the current project. + Takes the possibility of amendment being activated at the time + + :param looper.Project prj: project object + :param str appendix: the appendix of the file to create the path for, + like 'objs_summary.tsv' for objects summary file + :return str: path to the file + """ + fp = os.path.join(prj.output_dir, prj[NAME_KEY]) + if hasattr(prj, AMENDMENTS_KEY) and getattr(prj, AMENDMENTS_KEY): + fp += '_' + '_'.join(getattr(prj, AMENDMENTS_KEY)) + fp += '_' + appendix + return fp + + +def jinja_render_cmd_strictly(cmd_template, namespaces): + """ + Render a command string in the provided namespaces context. + + Strictly, which means that all the requested attributes must be + available in the namespaces + + :param str cmd_template: command template do be filled in with the + variables in the provided namespaces. For example: + "prog.py --name {project.name} --len {sample.len}" + :param Mapping[Mapping[str] namespaces: context for command rendering. 
+ Possible namespaces are: looper, project, sample, pipeline + :return str: rendered command + """ + def _finfun(x): + """ + A callable that can be used to process the result of a variable + expression before it is output. Joins list elements + """ + return " ".join(x) if isinstance(x, list) else x + + env = jinja2.Environment(undefined=jinja2.StrictUndefined, + variable_start_string="{", + variable_end_string="}", + finalize=_finfun) + template = env.from_string(cmd_template) + try: + rendered = template.render(**namespaces) + except jinja2.exceptions.UndefinedError: + _LOGGER.error("Missing sample, project or pipeline attributes" + " required by command template: '{}'" + .format(cmd_template)) + raise + _LOGGER.debug("rendered arg str: {}".format(rendered)) + return rendered + + +def read_yaml_file(filepath): + """ + Read a YAML file + + :param str filepath: path to the file to read + :return dict: read data + """ + data = None + if os.path.exists(filepath): + with open(filepath, 'r') as f: + data = yaml.safe_load(f) + return data + + +def enrich_args_via_cfg(parser_args, aux_parser): + """ + Read in a looper dotfile and set arguments. 
+ + Priority order: CLI > dotfile/config > parser default + + :param argparse.Namespace parser_args: parsed args by the original parser + :param argparse.Namespace aux_parser: parsed args by the a parser + with defaults suppressed + :return argparse.Namespace: selected argument values + """ + cfg_args_all = \ + _get_subcommand_args(parser_args) \ + if os.path.exists(parser_args.config_file) else dict() + result = argparse.Namespace() + cli_args, _ = aux_parser.parse_known_args() + for dest in vars(parser_args): + if dest not in POSITIONAL or not hasattr(result, dest): + if dest in cli_args: + x = getattr(cli_args, dest) + r = convert_value(x) if isinstance(x, str) else x + elif dest in cfg_args_all: + if isinstance(cfg_args_all[dest], list): + r = [convert_value(i) for i in cfg_args_all[dest]] + else: + r = convert_value(cfg_args_all[dest]) + else: + r = getattr(parser_args, dest) + setattr(result, dest, r) + return result + + +def _get_subcommand_args(parser_args): + """ + Get the union of values for the subcommand arguments from + Project.looper, Project.looper.cli. and Project.looper.cli.all. + If any are duplicated, the above is the selection priority order. + + Additionally, convert the options strings to destinations (replace '-' + with '_'), which strongly relies on argument parser using default + destinations. 
+ + :param argparser.Namespace parser_args: argument namespace + :return dict: mapping of argument destinations to their values + """ + args = dict() + cfg = peppyProject(parser_args.config_file, + defer_samples_creation=True, + amendments=parser_args.amend) + if CONFIG_KEY in cfg and LOOPER_KEY in cfg[CONFIG_KEY] \ + and CLI_KEY in cfg[CONFIG_KEY][LOOPER_KEY]: + try: + cfg_args = cfg[CONFIG_KEY][LOOPER_KEY][CLI_KEY] or dict() + args = cfg_args[ALL_SUBCMD_KEY] or dict() \ + if ALL_SUBCMD_KEY in cfg_args else dict() + args.update(cfg_args[parser_args.command] or dict() + if parser_args.command in cfg_args else dict()) + except (TypeError, KeyError, AttributeError, ValueError) as e: + raise MisconfigurationException( + "Invalid '{}.{}' section in the config. Caught exception: {}". + format(LOOPER_KEY, CLI_KEY, getattr(e, 'message', repr(e)))) + if CONFIG_KEY in cfg and LOOPER_KEY in cfg[CONFIG_KEY]: + try: + if CLI_KEY in cfg[CONFIG_KEY][LOOPER_KEY]: + del cfg[CONFIG_KEY][LOOPER_KEY][CLI_KEY] + args.update(cfg[CONFIG_KEY][LOOPER_KEY]) + except (TypeError, KeyError, AttributeError, ValueError) as e: + raise MisconfigurationException( + "Invalid '{}' section in the config. Caught exception: {}". + format(LOOPER_KEY, getattr(e, 'message', repr(e)))) + args = {k.replace("-", "_"): v for k, v in args.items()} if args else None + return args + + +def init_dotfile(path, cfg_path, force=False): + """ + Initialize looper dotfile + + :param str path: absolute path to the file to initialize + :param str cfg_path: path to the config file. 
Absolute or relative to 'path' + :param bool force: whether the existing file should be overwritten + :return bool: whether the file was initialized + """ + if os.path.exists(path) and not force: + print("Can't initialize, file exists: {}".format(path)) + return False + cfg_path = expandpath(cfg_path) + if not os.path.isabs(cfg_path): + cfg_path = os.path.join(os.path.dirname(path), cfg_path) + assert os.path.exists(cfg_path), \ + OSError("Provided config path is invalid. You must provide path " + "that is either absolute or relative to: {}". + format(os.path.dirname(path))) + relpath = os.path.relpath(cfg_path, os.path.dirname(path)) + with open(path, 'w') as dotfile: + yaml.dump({DOTFILE_CFG_PTH_KEY: relpath}, dotfile) + print("Initialized looper dotfile: {}".format(path)) + return True + + +def read_cfg_from_dotfile(): + """ + Read file path to the config file from the dotfile + + :return str: path to the config file read from the dotfile + :raise MisconfigurationException: if the dotfile does not consist of the + required key pointing to the PEP + """ + dp = dotfile_path(must_exist=True) + with open(dp, 'r') as dotfile: + dp_data = yaml.safe_load(dotfile) + if DOTFILE_CFG_PTH_KEY in dp_data: + return os.path.join(os.path.dirname(dp), + str(os.path.join(dp_data[DOTFILE_CFG_PTH_KEY]))) + else: + raise MisconfigurationException( + "Looper dotfile ({}) is missing '{}' key". 
+ format(dp, DOTFILE_CFG_PTH_KEY)) + + +def dotfile_path(directory=os.getcwd(), must_exist=False): + """ + Get the path to the looper dotfile + + If file existence is forced this function will look for it in + the directory parents + + :param str directory: directory path to start the search in + :param bool must_exist: whether the file must exist + :return str: path to the dotfile + :raise OSError: if the file does not exist + """ + cur_dir = directory + if not must_exist: + return os.path.join(cur_dir, LOOPER_DOTFILE_NAME) + while True: + parent_dir = os.path.dirname(cur_dir) + if LOOPER_DOTFILE_NAME in os.listdir(cur_dir): + return os.path.join(cur_dir, LOOPER_DOTFILE_NAME) + if cur_dir == parent_dir: + # root, file does not exist + raise OSError("Looper dotfile ({}) not found in '{}' and all " + "its parents".format(LOOPER_DOTFILE_NAME, directory)) + cur_dir = parent_dir diff --git a/mkdocs.yml b/mkdocs.yml index 9df34d4f7..d73dc5356 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -10,16 +10,20 @@ nav: - Features at-a-glance: features.md - Hello world: hello-world.md - How-to guides: - - Defining a project: define-your-project.md - - Defining looper config sections: project-config-looper.md - - Linking a project to a pipeline: linking-a-pipeline.md - - Linking to multiple pipelines: linking-multiple-pipelines.md + - Defining a project: defining-a-project.md + - Running a pipeline: running-a-pipeline.md + - Initializing a repository: initialize.md + - Parameterizing pipelines: parameterizing-pipelines.md + - Running on a cluster: running-on-a-cluster.md - Grouping many jobs into one: grouping-jobs.md - - Writing a pipeline interface: pipeline-interface.md - - Running on a cluster: cluster-computing.md - Running jobs in containers: containers.md - Handling multiple input files: how-to-merge-inputs.md + - Writing a pipeline interface: writing-a-pipeline-interface.md - Reference: + - Pipeline interface specification: pipeline-interface-specification.md + - Pipeline 
tiers: pipeline-tiers.md + - Looper variable namespaces: variable-namespaces.md + - Concentric templates: concentric-templates.md - Usage: usage.md - Configuration files: config-files.md - API: autodoc_build/looper.md diff --git a/tests/models/__init__.py b/oldtests/__init__.py similarity index 100% rename from tests/models/__init__.py rename to oldtests/__init__.py diff --git a/tests/conftest.py b/oldtests/conftest.py similarity index 98% rename from tests/conftest.py rename to oldtests/conftest.py index ac8188d97..934d7c008 100644 --- a/tests/conftest.py +++ b/oldtests/conftest.py @@ -1,7 +1,7 @@ """Fixtures for pytest-based units. Constants and helper functions can also be defined here. Doing so seems to -necessitate provision of an __init__.py file in this tests/ directory +necessitate provision of an __init__.py file in this oldtests/ directory such that Python considers it a package, but if that's already in place and test execution is not deleteriously affected, then it should be no problem. @@ -183,7 +183,7 @@ # TODO: split models conftest stuff into its own subdirectory. -# Provide some basic atomic-type data for models tests. +# Provide some basic atomic-type data for models oldtests. _BASE_KEYS = ("epigenomics", "H3K", "ac", "EWS", "FLI1") _BASE_VALUES = \ ("topic", "residue", "acetylation", "RNA binding protein", "FLI1") @@ -212,7 +212,7 @@ def pytest_addoption(parser): """ Facilitate command-line test behavior adjustment. 
""" parser.addoption("--logging-level", default="WARN", - help="Project root logger level to use for tests") + help="Project root logger level to use for oldtests") def pytest_generate_tests(metafunc): @@ -236,7 +236,7 @@ def conf_logs(request): level = request.config.getoption("--logging-level") init_logger(name=_LOGNAME, level=level, devmode=True) logging.getLogger(_LOGNAME).info( - "Configured looper logger at level %s; attaching tests' logger %s", + "Configured looper logger at level %s; attaching oldtests' logger %s", str(level), __name__) global _LOGGER _LOGGER = logging.getLogger("looper.{}".format(__name__)) @@ -474,9 +474,9 @@ def write_project_files(request): def _write_test_data_files(tempdir): """ - Write the temporary data files used by the tests. + Write the temporary data files used by the oldtests. - :param str tempdir: path to tests' primary temporary directory, + :param str tempdir: path to oldtests' primary temporary directory, within which temp data files may be placed directly or within subdirectory/ies. 
""" diff --git a/tests/data/d-bamfile.bam b/oldtests/data/d-bamfile.bam similarity index 100% rename from tests/data/d-bamfile.bam rename to oldtests/data/d-bamfile.bam diff --git a/tests/data/methyl_piface.yaml b/oldtests/data/methyl_piface.yaml similarity index 99% rename from tests/data/methyl_piface.yaml rename to oldtests/data/methyl_piface.yaml index af8623e7f..30f827b3e 100644 --- a/tests/data/methyl_piface.yaml +++ b/oldtests/data/methyl_piface.yaml @@ -1,7 +1,6 @@ protocol_mapping: RRBS: rrbs WGBS: wgbs - BS: rrbs;wgbs EG: wgbs pipelines: diff --git a/tests/data/src/rrbs.py b/oldtests/data/src/rrbs.py similarity index 100% rename from tests/data/src/rrbs.py rename to oldtests/data/src/rrbs.py diff --git a/tests/data/src/wgbs.py b/oldtests/data/src/wgbs.py similarity index 100% rename from tests/data/src/wgbs.py rename to oldtests/data/src/wgbs.py diff --git a/tests/helpers.py b/oldtests/helpers.py similarity index 100% rename from tests/helpers.py rename to oldtests/helpers.py diff --git a/tests/integration/def test_project_iface_sample_interaction.py b/oldtests/integration/def test_project_iface_sample_interaction.py similarity index 98% rename from tests/integration/def test_project_iface_sample_interaction.py rename to oldtests/integration/def test_project_iface_sample_interaction.py index c5ea0ba0b..b86109970 100644 --- a/tests/integration/def test_project_iface_sample_interaction.py +++ b/oldtests/integration/def test_project_iface_sample_interaction.py @@ -5,9 +5,9 @@ import os import pytest import looper -from tests.conftest import \ +from oldtests.conftest import \ NUM_SAMPLES, NGS_SAMPLE_INDICES, PIPELINE_TO_REQD_INFILES_BY_SAMPLE -from tests.helpers import named_param +from oldtests.helpers import named_param __author__ = "Vince Reuter" diff --git a/tests/integration/test_project_get_interfaces.py b/oldtests/integration/test_project_get_interfaces.py similarity index 100% rename from tests/integration/test_project_get_interfaces.py rename to 
oldtests/integration/test_project_get_interfaces.py diff --git a/tests/integration/test_project_pipeline_interface_interaction.py b/oldtests/integration/test_project_pipeline_interface_interaction.py similarity index 97% rename from tests/integration/test_project_pipeline_interface_interaction.py rename to oldtests/integration/test_project_pipeline_interface_interaction.py index 3c9711312..8100943eb 100644 --- a/tests/integration/test_project_pipeline_interface_interaction.py +++ b/oldtests/integration/test_project_pipeline_interface_interaction.py @@ -12,8 +12,8 @@ from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY from looper.pipereqs import KEY_EXEC_REQ, KEY_FILE_REQ, KEY_FOLDER_REQ from looper.project_piface_group import ProjectPifaceGroup -import tests -from tests.helpers import build_pipeline_iface +import oldtests +from oldtests.helpers import build_pipeline_iface from ubiquerg import powerset __author__ = "Vince Reuter" @@ -31,7 +31,7 @@ BAD_REQS_LISTS = list(map(list, powerset([r for r, _ in BAD_PATH_REQS_DATA], nonempty=True))) ANNS_FILE_NAME = "anns.csv" -DATA_FOLDER_PATH = os.path.join(os.path.dirname(tests.__file__), "data") +DATA_FOLDER_PATH = os.path.join(os.path.dirname(oldtests.__file__), "data") INTERFACE_FILEPATH = os.path.join(DATA_FOLDER_PATH, "methyl_piface.yaml") """ diff --git a/tests/interactive.py b/oldtests/interactive.py similarity index 76% rename from tests/interactive.py rename to oldtests/interactive.py index 3331dfb21..9ccf55ce8 100644 --- a/tests/interactive.py +++ b/oldtests/interactive.py @@ -1,7 +1,7 @@ # This is just a little helper script to set up an interactive session # to help writing test cases. 
-# You must be in the looper tests dir: -# cd $CODEBASE/looper/tests +# You must be in the looper oldtests dir: +# cd $CODEBASE/looper/oldtests # ipython import conftest diff --git a/tests/models/pipeline_interface/__init__.py b/oldtests/models/__init__.py similarity index 100% rename from tests/models/pipeline_interface/__init__.py rename to oldtests/models/__init__.py diff --git a/oldtests/models/pipeline_interface/__init__.py b/oldtests/models/pipeline_interface/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/models/pipeline_interface/conftest.py b/oldtests/models/pipeline_interface/conftest.py similarity index 99% rename from tests/models/pipeline_interface/conftest.py rename to oldtests/models/pipeline_interface/conftest.py index 9db6d7ba4..fdc905f30 100644 --- a/tests/models/pipeline_interface/conftest.py +++ b/oldtests/models/pipeline_interface/conftest.py @@ -1,4 +1,4 @@ -""" Configuration for modules with independent tests of models. """ +""" Configuration for modules with independent oldtests of models. 
""" from collections import OrderedDict import copy diff --git a/tests/models/pipeline_interface/test_PipelineInterface.py b/oldtests/models/pipeline_interface/test_PipelineInterface.py similarity index 99% rename from tests/models/pipeline_interface/test_PipelineInterface.py rename to oldtests/models/pipeline_interface/test_PipelineInterface.py index 6e764defc..a37847797 100644 --- a/tests/models/pipeline_interface/test_PipelineInterface.py +++ b/oldtests/models/pipeline_interface/test_PipelineInterface.py @@ -25,7 +25,7 @@ from ubiquerg import powerset from .conftest import ATAC_PROTOCOL_NAME, write_config_data -from tests.helpers import remove_piface_requirements +from oldtests.helpers import remove_piface_requirements __author__ = "Vince Reuter" diff --git a/tests/models/pipeline_interface/test_PipelineInterface_requirements.py b/oldtests/models/pipeline_interface/test_PipelineInterface_requirements.py similarity index 99% rename from tests/models/pipeline_interface/test_PipelineInterface_requirements.py rename to oldtests/models/pipeline_interface/test_PipelineInterface_requirements.py index d0585175e..15167e761 100644 --- a/tests/models/pipeline_interface/test_PipelineInterface_requirements.py +++ b/oldtests/models/pipeline_interface/test_PipelineInterface_requirements.py @@ -8,8 +8,8 @@ from looper.pipereqs import KEY_EXEC_REQ, KEY_FILE_REQ, KEY_FOLDER_REQ import pytest import yaml -from tests.helpers import build_pipeline_iface -from tests.models.pipeline_interface.conftest import \ +from oldtests.helpers import build_pipeline_iface +from oldtests.models.pipeline_interface.conftest import \ ATAC_PIPE_NAME, ATAC_PROTOCOL_NAME from veracitools import ExpectContext diff --git a/tests/models/pipeline_interface/test_PipelineInterface_sample_subtypes.py b/oldtests/models/pipeline_interface/test_PipelineInterface_sample_subtypes.py similarity index 100% rename from tests/models/pipeline_interface/test_PipelineInterface_sample_subtypes.py rename to 
oldtests/models/pipeline_interface/test_PipelineInterface_sample_subtypes.py diff --git a/tests/models/sample/test_sample_to_yaml.py b/oldtests/models/sample/test_sample_to_yaml.py similarity index 98% rename from tests/models/sample/test_sample_to_yaml.py rename to oldtests/models/sample/test_sample_to_yaml.py index a7ffaafe3..b9100eb04 100644 --- a/tests/models/sample/test_sample_to_yaml.py +++ b/oldtests/models/sample/test_sample_to_yaml.py @@ -7,12 +7,12 @@ from looper import Project as LProject, Sample as LSample from looper.const import OUTKEY from looper.pipeline_interface import PL_KEY -from tests.conftest import write_temp, \ +from oldtests.conftest import write_temp, \ PIPELINE_INTERFACES_KEY, PIPELINE_TO_REQD_INFILES_BY_SAMPLE, \ PIPELINE_INTERFACE_CONFIG_LINES, PROJECT_CONFIG_LINES, ANNOTATIONS_FILENAME, \ SAMPLE_ANNOTATION_LINES, SAMPLE_SUBANNOTATIONS_KEY -from tests.helpers import process_protocols +from oldtests.helpers import process_protocols from peppy import Sample as PSample, OUTDIR_KEY, SAMPLE_NAME_COLNAME from peppy.sample import SAMPLE_YAML_EXT, SAMPLE_YAML_FILE_KEY from ubiquerg import powerset diff --git a/tests/pipe_nolooperargs/config/pipeline_interface.yaml b/oldtests/pipe_nolooperargs/config/pipeline_interface.yaml similarity index 100% rename from tests/pipe_nolooperargs/config/pipeline_interface.yaml rename to oldtests/pipe_nolooperargs/config/pipeline_interface.yaml diff --git a/tests/pipe_nolooperargs/config/protocol_mappings.yaml b/oldtests/pipe_nolooperargs/config/protocol_mappings.yaml similarity index 100% rename from tests/pipe_nolooperargs/config/protocol_mappings.yaml rename to oldtests/pipe_nolooperargs/config/protocol_mappings.yaml diff --git a/tests/pipe_nolooperargs/pipelines/testngs.sh b/oldtests/pipe_nolooperargs/pipelines/testngs.sh similarity index 100% rename from tests/pipe_nolooperargs/pipelines/testngs.sh rename to oldtests/pipe_nolooperargs/pipelines/testngs.sh diff --git 
a/tests/pipe_nolooperargs/pipelines/testpipeline.sh b/oldtests/pipe_nolooperargs/pipelines/testpipeline.sh similarity index 100% rename from tests/pipe_nolooperargs/pipelines/testpipeline.sh rename to oldtests/pipe_nolooperargs/pipelines/testpipeline.sh diff --git a/tests/pipelines/config/protocol_mappings.yaml b/oldtests/pipelines/config/protocol_mappings.yaml similarity index 100% rename from tests/pipelines/config/protocol_mappings.yaml rename to oldtests/pipelines/config/protocol_mappings.yaml diff --git a/tests/pipelines/pipelines/testngs.sh b/oldtests/pipelines/pipelines/testngs.sh similarity index 100% rename from tests/pipelines/pipelines/testngs.sh rename to oldtests/pipelines/pipelines/testngs.sh diff --git a/tests/pipelines/pipelines/testpipeline.sh b/oldtests/pipelines/pipelines/testpipeline.sh similarity index 100% rename from tests/pipelines/pipelines/testpipeline.sh rename to oldtests/pipelines/pipelines/testpipeline.sh diff --git a/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py b/oldtests/specific_use_cases/test_cli_prj_pipe_args_collision.py similarity index 99% rename from tests/specific_use_cases/test_cli_prj_pipe_args_collision.py rename to oldtests/specific_use_cases/test_cli_prj_pipe_args_collision.py index b522bf93c..ce499fde3 100644 --- a/tests/specific_use_cases/test_cli_prj_pipe_args_collision.py +++ b/oldtests/specific_use_cases/test_cli_prj_pipe_args_collision.py @@ -10,7 +10,7 @@ from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY from peppy.const import * from peppy.utils import count_repeats -from tests.helpers import randconf +from oldtests.helpers import randconf from ubiquerg import powerset __author__ = "Vince Reuter" diff --git a/tests/test_basic_interface_group.py b/oldtests/test_basic_interface_group.py similarity index 100% rename from tests/test_basic_interface_group.py rename to oldtests/test_basic_interface_group.py diff --git a/tests/test_looper.py b/oldtests/test_looper.py similarity index 96% rename 
from tests/test_looper.py rename to oldtests/test_looper.py index 678ee86e0..100124c23 100644 --- a/tests/test_looper.py +++ b/oldtests/test_looper.py @@ -1,6 +1,6 @@ -"""Initial, broad-scope looper tests. +"""Initial, broad-scope looper oldtests. -Along with tests/tests.py, this is one of the initial unit test modules. +Along with oldtests/oldtests.py, this is one of the initial unit test modules. The primary function under test here is the creation of a project instance. """ @@ -15,8 +15,8 @@ from looper import build_parser from looper.looper import aggregate_exec_skip_reasons -from tests.conftest import LOOPER_ARGS_BY_PIPELINE -from tests.helpers import named_param +from oldtests.conftest import LOOPER_ARGS_BY_PIPELINE +from oldtests.helpers import named_param _LOGGER = logging.getLogger("looper.{}".format(__name__)) diff --git a/tests/test_sample_subtypes.py b/oldtests/test_sample_subtypes.py similarity index 100% rename from tests/test_sample_subtypes.py rename to oldtests/test_sample_subtypes.py diff --git a/tests/test_submission_scripts.py b/oldtests/test_submission_scripts.py similarity index 99% rename from tests/test_submission_scripts.py rename to oldtests/test_submission_scripts.py index 1ab2a488b..a85a99ee8 100644 --- a/tests/test_submission_scripts.py +++ b/oldtests/test_submission_scripts.py @@ -17,7 +17,7 @@ from looper.looper import Project from looper.pipeline_interface import PROTOMAP_KEY, RESOURCES_KEY from looper.utils import fetch_sample_flags, sample_folder -from tests.helpers import process_protocols +from oldtests.helpers import process_protocols __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" diff --git a/tests/test_utils.py b/oldtests/test_utils.py similarity index 98% rename from tests/test_utils.py rename to oldtests/test_utils.py index ffc26cb1f..ae6d253bf 100644 --- a/tests/test_utils.py +++ b/oldtests/test_utils.py @@ -6,7 +6,7 @@ import pytest from looper.utils import determine_config_path, DEFAULT_CONFIG_SUFFIX, \ 
DEFAULT_METADATA_FOLDER -from tests.helpers import randstr, LETTERS_AND_DIGITS +from oldtests.helpers import randstr, LETTERS_AND_DIGITS __author__ = "Vince Reuter" diff --git a/tests/test_with_microtest_as_smoketest.py b/oldtests/test_with_microtest_as_smoketest.py similarity index 100% rename from tests/test_with_microtest_as_smoketest.py rename to oldtests/test_with_microtest_as_smoketest.py diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt index 0a563d6d6..b612bd675 100644 --- a/requirements/requirements-all.txt +++ b/requirements/requirements-all.txt @@ -3,8 +3,9 @@ colorama>=0.3.9 logmuse>=0.2.0 pandas>=0.20.2 pyyaml>=3.12 -divvy>=0.3.1 -peppy>=0.22.2 -ubiquerg>=0.4.5 +divvy>=0.5.0 +peppy>=0.30.1 +ubiquerg>=0.5.2 ngstk>=0.0.1rc1 jinja2 +eido>=0.1.0 diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt index 7f4458baa..1371faf1a 100644 --- a/requirements/requirements-test.txt +++ b/requirements/requirements-test.txt @@ -1,3 +1,3 @@ -coveralls==1.1 -pytest-cov==2.4.0 +coveralls>=1.1 +pytest-cov==2.6.1 pytest-remotedata diff --git a/setup.py b/setup.py index 3e1d14a74..6a3b0fb00 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,7 @@ def get_static(name, condition=None): long_description = open('README.md').read() setup( - name="loopercli", + name="looper", packages=["looper"], version=version, description="A pipeline submission engine that parses sample inputs and submits pipelines for each sample.", diff --git a/tests/data/annotation_sheet.csv b/tests/data/annotation_sheet.csv new file mode 100644 index 000000000..fdccccfb0 --- /dev/null +++ b/tests/data/annotation_sheet.csv @@ -0,0 +1,4 @@ +sample_name,protocol,data_source,SRR,Sample_geo_accession,read1,read2 +sample1,PROTO1,SRA,SRR5210416,GSM2471255,SRA_1,SRA_2 +sample2,PROTO1,SRA,SRR5210450,GSM2471300,SRA_1,SRA_2 +sample3,PROTO2,SRA,SRR5210398,GSM2471249,SRA_1,SRA_2 \ No newline at end of file diff --git a/tests/data/output_schema.yaml 
b/tests/data/output_schema.yaml new file mode 100644 index 000000000..8f3bde07b --- /dev/null +++ b/tests/data/output_schema.yaml @@ -0,0 +1,27 @@ +description: Sample objects produced by test pipeline. +properties: + samples: + type: array + items: + type: object + properties: + test_property: + type: string + description: "Test sample property" + path: "~/sample/{sample_name}_file.txt" + test_property1: + type: string + description: "Test sample property" + path: "~/sample/{sample_name}_file1.txt" + test_property: + type: image + title: "Test title" + description: "Test project property" + thumbnail_path: "~/test_{name}.png" + path: "~/test_{name}.pdf" + test_property1: + type: image + title: "Test title1" + description: "Test project property1" + thumbnail_path: "~/test_{name}.png" + path: "~/test_{name}1.pdf" diff --git a/tests/data/pipeline_interface1_project.yaml b/tests/data/pipeline_interface1_project.yaml new file mode 100644 index 000000000..49abb26a9 --- /dev/null +++ b/tests/data/pipeline_interface1_project.yaml @@ -0,0 +1,10 @@ +pipeline_name: PIPELINE1 +pipeline_type: project +path: pipelines/col_pipeline1.py +output_schema: output_schema.yaml +command_template: > + {pipeline.path} --project-name {project.name} + +bioconductor: + readFunName: readData + readFunPath: readData.R \ No newline at end of file diff --git a/tests/data/pipeline_interface1_sample.yaml b/tests/data/pipeline_interface1_sample.yaml new file mode 100644 index 000000000..e0d766dc4 --- /dev/null +++ b/tests/data/pipeline_interface1_sample.yaml @@ -0,0 +1,11 @@ +pipeline_name: PIPELINE1 +pipeline_type: sample +path: pipelines/pipeline1.py +input_schema: https://schema.databio.org/pep/2.0.0.yaml +output_schema: output_schema.yaml +command_template: > + {pipeline.path} --sample-name {sample.sample_name} --req-attr {sample.attr} + +bioconductor: + readFunName: readData + readFunPath: readData.R diff --git a/tests/data/pipeline_interface2_project.yaml 
b/tests/data/pipeline_interface2_project.yaml new file mode 100644 index 000000000..b77a621da --- /dev/null +++ b/tests/data/pipeline_interface2_project.yaml @@ -0,0 +1,12 @@ +pipeline_name: OTHER_PIPELINE2 +pipeline_type: project +path: pipelines/col_pipeline2.py +output_schema: output_schema.yaml +command_template: > + {pipeline.path} --project-name {project.name} +compute: + size_dependent_variables: resources-project.tsv + +bioconductor: + readFunName: readData + readFunPath: readData.R diff --git a/tests/data/pipeline_interface2_sample.yaml b/tests/data/pipeline_interface2_sample.yaml new file mode 100644 index 000000000..93186f6d2 --- /dev/null +++ b/tests/data/pipeline_interface2_sample.yaml @@ -0,0 +1,12 @@ +pipeline_name: OTHER_PIPELINE2 +pipeline_type: sample +path: pipelines/other_pipeline2.py +output_schema: output_schema.yaml +command_template: > + {pipeline.path} --sample-name {sample.sample_name} --req-attr {sample.attr} +compute: + size_dependent_variables: resources-sample.tsv + +bioconductor: + readFunName: readData + readFunPath: readData.R diff --git a/tests/data/project_config.yaml b/tests/data/project_config.yaml new file mode 100644 index 000000000..3ed167ac4 --- /dev/null +++ b/tests/data/project_config.yaml @@ -0,0 +1,16 @@ +pep_version: "2.0.0" +name: test + +sample_table: annotation_sheet.csv +looper: + all: + output_dir: ../output + +sample_modifiers: + append: + attr: "val" + derive: + attributes: [read1, read2] + sources: + SRA_1: "{SRR}_1.fastq.gz" + SRA_2: "{SRR}_2.fastq.gz" diff --git a/tests/data/readData.R b/tests/data/readData.R new file mode 100644 index 000000000..89557a11b --- /dev/null +++ b/tests/data/readData.R @@ -0,0 +1,10 @@ +readData = function(project, sampleName="sample1") { + lapply(getOutputsBySample(project, sampleName), function(x) { + lapply(x, function(x1){ + message("Reading: ", basename(x1)) + df = read.table(x1, stringsAsFactors=F) + colnames(df)[1:3] = c('chr', 'start', 'end') + GenomicRanges::GRanges(df) + 
}) + }) +} diff --git a/tests/data/resources-project.tsv b/tests/data/resources-project.tsv new file mode 100644 index 000000000..4efd0f19c --- /dev/null +++ b/tests/data/resources-project.tsv @@ -0,0 +1,6 @@ +max_file_size cores mem time +0.05 1 12000 00-01:00:00 +0.5 1 16000 00-01:00:00 +1 1 16000 00-01:00:00 +10 1 16000 00-01:00:00 +NaN 1 32000 00-02:00:00 diff --git a/tests/data/resources-sample.tsv b/tests/data/resources-sample.tsv new file mode 100644 index 000000000..20ec284b6 --- /dev/null +++ b/tests/data/resources-sample.tsv @@ -0,0 +1,7 @@ +max_file_size cores mem time +0.001 1 8000 00-04:00:00 +0.05 2 12000 00-08:00:00 +0.5 4 16000 00-12:00:00 +1 8 16000 00-24:00:00 +10 16 32000 02-00:00:00 +NaN 32 32000 04-00:00:00 diff --git a/tests/integration/test_project_get_outputs.py b/tests/integration/test_project_get_outputs.py deleted file mode 100644 index 2d18721cc..000000000 --- a/tests/integration/test_project_get_outputs.py +++ /dev/null @@ -1,736 +0,0 @@ -""" Tests for interaction between Project and PipelineInterface """ - -from copy import deepcopy -import itertools -import os -import random -import string -import pytest -import yaml -from looper import Project as LP -from looper.const import * -from looper.exceptions import DuplicatePipelineKeyException -from looper.pipeline_interface import PL_KEY, PROTOMAP_KEY, RESOURCES_KEY -from attmap import AttMap -from divvy import DEFAULT_COMPUTE_RESOURCES_NAME as DEF_RES -from peppy.const import * -from peppy.utils import count_repeats -from tests.helpers import LETTERS_AND_DIGITS, randstr, randconf - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - -MAIN_META_KEY = "main_meta" -SUBS_META_KEY = "subs_meta" -SECTION_BY_FIXTURE = { - MAIN_META_KEY: METADATA_KEY, SUBS_META_KEY: SUBPROJECTS_SECTION} -BASE_META = {OUTDIR_KEY: "arbitrary"} -DECLARED_OUTPUTS = {"smooth_bw": "a_{sample.name}/b_{sample.protocol}.txt", - "unalign": "u_{sample.name}_{sample.protocol}.txt"} -WGBS_NAME = "WGBS" 
-RRBS_NAME = "RRBS" -WGBS_KEY = "wgbs" -RRBS_KEY = "rrbs" - -PROTO_NAMES = {WGBS_KEY: WGBS_NAME, RRBS_KEY: RRBS_NAME} - -WGBS_IFACE_LINES = """name: {n} -path: src/wgbs.py -required_input_files: [data_source] -ngs_input_files: [data_source] -arguments: - "--sample-name": sample_name - "--genome": genome - "--input": data_source - "--single-or-paired": read_type -{r}: - {d}: - file_size: "0" - cores: "4" - mem: "4000" - time: "0-02:00:00" -""".format(n=WGBS_NAME, r=RESOURCES_KEY, d=DEF_RES).splitlines(True) - -RRBS_IFACE_LINES = """name: {n} -path: src/rrbs.py -required_input_files: [data_source] -all_input_files: [data_source, read1, read2] -ngs_input_files: [data_source, read1, read2] -arguments: - "--sample-name": sample_name - "--genome": genome - "--input": data_source - "--single-or-paired": read_type -{r}: - {d}: - file_size: "0" - cores: "4" - mem: "4000" - time: "0-02:00:00" -""".format(n=RRBS_NAME, r=RESOURCES_KEY, d=DEF_RES).splitlines(True) - - -PROTOMAP = {RRBS_NAME: RRBS_KEY, WGBS_NAME: WGBS_KEY, "EG": WGBS_KEY} -IFACE_LINES = {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES} - -RNASEQ = "RNA-seq" -KALLISTO_ABUNDANCES_KEY = "abundances" -KALLISTO_ABUNDANCES_TEMPLATE = "{sample.name}_isoforms.txt" - - -def pytest_generate_tests(metafunc): - """ Test case generation and parameterization for this module. """ - skip_empty_flag = "skip_sample_less" - if skip_empty_flag in metafunc.fixturenames: - metafunc.parametrize(skip_empty_flag, [False, True]) - - -def augmented_metadata(metadata, extra=None): - """ Augment base metadata with additional data. """ - assert METADATA_KEY not in metadata, \ - "Found {k} in metadata argument itself; pass just the data/values to " \ - "use as {k}, not the whole mapping".format(k=METADATA_KEY) - m = AttMap({METADATA_KEY: BASE_META}) - m[METADATA_KEY] = m[METADATA_KEY].add_entries(metadata) - return m.add_entries(extra or {}).to_map() - - -def get_conf_data(req): - """ - Get Project config data for a test case. 
- - :param pytest.FixtureRequest req: test case requesting Project config data - :return dict: Project config data - """ - m = {key: req.getfixturevalue(fix) for fix, key - in SECTION_BY_FIXTURE.items() if fix in req.fixturenames} - return m - - -@pytest.fixture(scope="function") -def prj(request, tmpdir): - """ Provide a test case with a Project instance. """ - conf_file = tmpdir.join(randconf()).strpath - return _write_and_build_prj(conf_file, conf_data=get_conf_data(request)) - - -@pytest.mark.parametrize(MAIN_META_KEY, [BASE_META]) -def test_no_pifaces(prj, main_meta): - """ No pipeline interfaces --> the outputs data mapping is empty.""" - assert {} == prj.get_outputs() - - -@pytest.mark.parametrize("name_cfg_file", [randconf()]) -@pytest.mark.parametrize("ifaces", [ - [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], - [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) -def test_no_outputs(tmpdir, name_cfg_file, ifaces, skip_sample_less): - """ Pipeline interfaces without outputs --> no Project outputs """ - cfg = tmpdir.join(name_cfg_file).strpath - iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - rep_paths = count_repeats(iface_paths) - assert [] == rep_paths, "Repeated temp filepath(s): {}".format(rep_paths) - for data, path in zip(ifaces, iface_paths): - with open(path, 'w') as f: - yaml.dump(data, f) - md = deepcopy(BASE_META) - md[PIPELINE_INTERFACES_KEY] = iface_paths - - # DEBUG - print("Metadata: {}".format(md)) - - for path, data in zip(iface_paths, ifaces): - _write_iface_file(path, data) - prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) - assert {} == prj.get_outputs(skip_sample_less) - - -@pytest.mark.parametrize("name_cfg_file", [randconf()]) -@pytest.mark.parametrize(["ifaces", "prot_pool"], [ - ([{WGBS_KEY: WGBS_IFACE_LINES}], [WGBS_NAME]), - ([{RRBS_KEY: RRBS_IFACE_LINES}], [RRBS_NAME]), - ([{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}], - [WGBS_NAME, RRBS_NAME])]) 
-@pytest.mark.parametrize("declared_outputs", [None, ["out1", "out2"]]) -def test_malformed_outputs( - tmpdir, name_cfg_file, ifaces, prot_pool, - declared_outputs, skip_sample_less): - """ Invalid outputs declaration format is exceptional. """ - - cfg = tmpdir.join(name_cfg_file).strpath - - iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - rep_paths = count_repeats(iface_paths) - assert [] == rep_paths, "Repeated temp filepath(s): {}".format(rep_paths) - - for data, path in zip(ifaces, iface_paths): - with open(path, 'w') as f: - yaml.dump(data, f) - md = deepcopy(BASE_META) - md[PIPELINE_INTERFACES_KEY] = iface_paths - - anns_file = tmpdir.join("anns.csv").strpath - assert not os.path.exists(anns_file) - sample_protos = [random.choice(prot_pool) for _ in range(10)] - sample_names = [randstr(string.ascii_letters, 20) for _ in sample_protos] - repeated_sample_names = count_repeats(sample_names) - assert [] == repeated_sample_names, \ - "Repeated sample names: {}".format(repeated_sample_names) - anns_data = [(SAMPLE_NAME_COLNAME, ASSAY_KEY)] + \ - list(zip(sample_names, sample_protos)) - with open(anns_file, 'w') as f: - f.write("\n".join("{0},{1}".format(*pair) for pair in anns_data)) - md[SAMPLE_ANNOTATIONS_KEY] = anns_file - - # DEBUG - print("Metadata: {}".format(md)) - - keyed_outputs = {pk: declared_outputs for pk in - [k for pi in ifaces for k in pi.keys()]} - for path, data in zip(iface_paths, ifaces): - _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) - prj = _write_and_build_prj(cfg, {METADATA_KEY: md}) - print("TABLE below:\n{}".format(prj.sample_table)) - with pytest.raises(AttributeError): - # Should fail on .items() call during outputs determination. 
- print("Outputs: {}".format(prj.get_outputs(skip_sample_less))) - - -@pytest.mark.parametrize("ifaces", [ - [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], - [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) -@pytest.mark.parametrize("declared_outputs", - [{n: DECLARED_OUTPUTS for n in [RRBS_NAME, WGBS_NAME]}]) -def test_only_subproject_has_outputs(tmpdir, ifaces, declared_outputs): - """ Activation state affects status of Project's outputs. """ - - cfg = tmpdir.join(randconf()).strpath - - iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == count_repeats(iface_paths), \ - "Repeated temp filepath(s): {}".format(count_repeats(iface_paths)) - - for data, path in zip(ifaces, iface_paths): - with open(path, 'w') as f: - yaml.dump(data, f) - md = deepcopy(BASE_META) - md[PIPELINE_INTERFACES_KEY] = iface_paths - - sp_ifaces_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == count_repeats(sp_ifaces_paths), \ - "Repeated temp filepath(s): {}".format(count_repeats(sp_ifaces_paths)) - iface_path_intersect = set(sp_ifaces_paths) & set(iface_paths) - assert set() == iface_path_intersect, \ - "Nonempty main/subs iface path intersection: {}".\ - format(", ".join(iface_path_intersect)) - - # DEBUG - print("Metadata: {}".format(md)) - - used_iface_keys = set(itertools.chain(*[pi.keys() for pi in ifaces])) - keyed_outputs = {pk: declared_outputs[PROTO_NAMES[pk]] - for pk in used_iface_keys} - for path, data in zip(iface_paths, ifaces): - _write_iface_file(path, data) - for path, data in zip(sp_ifaces_paths, ifaces): - _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) - - sp_name = "testing_subproj" - prj = _write_and_build_prj(cfg, { - METADATA_KEY: md, - SUBPROJECTS_SECTION: { - sp_name: { - METADATA_KEY: { - PIPELINE_INTERFACES_KEY: sp_ifaces_paths - } - } - } - }) - - # DEBUG - print("TABLE below:\n{}".format(prj.sample_table)) - - assert len(prj.get_outputs(False)) == 0 - assert {} == 
prj.get_outputs(False) - p = prj.activate_subproject(sp_name) - assert len(p.get_outputs(False)) > 0 - exp = {pipe_name: {k: (v, []) for k, v in outs.items()} - for pipe_name, outs in declared_outputs.items() - if pipe_name in {PROTO_NAMES[k] for k in used_iface_keys}} - assert exp == p.get_outputs(False) - - -@pytest.mark.parametrize("ifaces", [ - [{WGBS_KEY: WGBS_IFACE_LINES}], [{RRBS_KEY: RRBS_IFACE_LINES}], - [{WGBS_KEY: WGBS_IFACE_LINES}, {RRBS_KEY: RRBS_IFACE_LINES}]]) -@pytest.mark.parametrize("declared_outputs", - [{n: DECLARED_OUTPUTS for n in [RRBS_NAME, WGBS_NAME]}]) -def test_only_main_project_has_outputs(tmpdir, ifaces, declared_outputs): - """ Activation state affects status of Project's outputs. """ - - cfg = tmpdir.join(randconf()).strpath - - iface_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == count_repeats(iface_paths), \ - "Repeated temp filepath(s): {}".format(count_repeats(iface_paths)) - - for data, path in zip(ifaces, iface_paths): - with open(path, 'w') as f: - yaml.dump(data, f) - md = deepcopy(BASE_META) - md[PIPELINE_INTERFACES_KEY] = iface_paths - - sp_ifaces_paths = [tmpdir.join(randconf()).strpath for _ in ifaces] - assert [] == count_repeats(sp_ifaces_paths), \ - "Repeated temp filepath(s): {}".format(count_repeats(sp_ifaces_paths)) - iface_path_intersect = set(sp_ifaces_paths) & set(iface_paths) - assert set() == iface_path_intersect, \ - "Nonempty main/subs iface path intersection: {}". 
\ - format(", ".join(iface_path_intersect)) - - # DEBUG - print("Metadata: {}".format(md)) - - used_iface_keys = set(itertools.chain(*[pi.keys() for pi in ifaces])) - keyed_outputs = {pk: declared_outputs[PROTO_NAMES[pk]] - for pk in used_iface_keys} - for path, data in zip(iface_paths, ifaces): - _write_iface_file(path, data, outputs_by_pipe_key=keyed_outputs) - for path, data in zip(sp_ifaces_paths, ifaces): - _write_iface_file(path, data) - - sp_name = "testing_subproj" - prj = _write_and_build_prj(cfg, { - METADATA_KEY: md, - SUBPROJECTS_SECTION: { - sp_name: { - METADATA_KEY: { - PIPELINE_INTERFACES_KEY: sp_ifaces_paths - } - } - } - }) - - # DEBUG - print("TABLE below:\n{}".format(prj.sample_table)) - - assert len(prj.get_outputs(False)) > 0 - exp = {pipe_name: {k: (v, []) for k, v in outs.items()} - for pipe_name, outs in declared_outputs.items() - if pipe_name in {PROTO_NAMES[k] for k in used_iface_keys}} - assert exp == prj.get_outputs(False) - p = prj.activate_subproject(sp_name) - assert len(p.get_outputs(False)) == 0 - assert {} == p.get_outputs(False) - - -def test_multiple_project_units_have_declare_interfaces_with_outputs(tmpdir): - """ Activation state affects status of Project's outputs. """ - - # Generate config filepaths. - iface_paths = set() - while len(iface_paths) < 3: - iface_paths.add(tmpdir.join(randconf()).strpath) - iface_paths = list(iface_paths) - - # Collect the Project config data. - main_iface_file, sp_iface_files = iface_paths[0], iface_paths[1:] - sp_files = dict(zip(["sp1", "sp2"], sp_iface_files)) - prj_dat = { - METADATA_KEY: { - OUTDIR_KEY: tmpdir.strpath, - PIPELINE_INTERFACES_KEY: main_iface_file - }, - SUBPROJECTS_SECTION: {n: {METADATA_KEY: {PIPELINE_INTERFACES_KEY: f}} - for n, f in sp_files.items()} - } - - # Generate Project config filepath and create Project. 
- conf_file = make_temp_file_path(folder=tmpdir.strpath, known=iface_paths) - for f, (lines_spec, outs_spec) in zip( - iface_paths, - [({WGBS_KEY: WGBS_IFACE_LINES}, {WGBS_KEY: DECLARED_OUTPUTS}), - ({RRBS_KEY: RRBS_IFACE_LINES}, {RRBS_KEY: DECLARED_OUTPUTS}), - ({WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES}, - {WGBS_KEY: DECLARED_OUTPUTS, RRBS_KEY: DECLARED_OUTPUTS})]): - _write_iface_file(f, lines_group_by_pipe_key=lines_spec, - outputs_by_pipe_key=outs_spec) - - prj = _write_and_build_prj(conf_file, prj_dat) - - # DEBUG - print("TMPDIR contents:\n{}".format("\n".join( - os.path.join(tmpdir.strpath, f) for f in os.listdir(tmpdir.strpath)))) - - def observe(p): - return p.get_outputs(False) - - def extract_just_path_template(out_res): - return {pipe_name: {k: v for k, (v, _) in outs.items()} - for pipe_name, outs in out_res.items()} - - assert {WGBS_NAME: DECLARED_OUTPUTS} == extract_just_path_template(observe(prj)) - p1 = prj.activate_subproject("sp1") - assert {RRBS_NAME: DECLARED_OUTPUTS} == extract_just_path_template(observe(p1)) - p2 = p1.activate_subproject("sp2") - assert {pn: DECLARED_OUTPUTS for pn in [WGBS_NAME, RRBS_NAME]} == \ - extract_just_path_template(observe(p2)) - - -@pytest.mark.parametrize("noskip", [False, True]) -@pytest.mark.parametrize("protocols", - [[], [random.choice(["INVALID", "NULL"]) for _ in range(10)]]) -@pytest.mark.parametrize("declared_outputs", - [{n: DECLARED_OUTPUTS for n in [RRBS_NAME, WGBS_NAME]}]) -def test_no_samples_match_protocols_with_outputs( - tmpdir, noskip, protocols, declared_outputs): - """ get_outputs behavior is sensitive to protocol match and skip flag. 
""" - temproot = tmpdir.strpath - path_iface_file = tmpdir.join(randconf()).strpath - prj_cfg = make_temp_file_path(folder=temproot, known=[path_iface_file]) - prj_dat = { - METADATA_KEY: { - OUTDIR_KEY: temproot, - PIPELINE_INTERFACES_KEY: path_iface_file - } - } - if protocols: - anns_file = make_temp_file_path( - folder=temproot, known=[path_iface_file, prj_cfg]) - anns_data = [("sample{}".format(i), p) for i, p in enumerate(protocols)] - with open(anns_file, 'w') as f: - for n, p in [(SAMPLE_NAME_COLNAME, ASSAY_KEY)] + anns_data: - f.write("{},{}\n".format(n, p)) - prj_dat[METADATA_KEY][SAMPLE_ANNOTATIONS_KEY] = anns_file - _write_iface_file( - path_iface_file, {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES}, - outputs_by_pipe_key={PROTOMAP[n]: DECLARED_OUTPUTS for n in declared_outputs.keys()}) - prj = _write_and_build_prj(prj_cfg, prj_dat) - exp = { - pipe_name: { - path_key: (path_temp, []) - for path_key, path_temp in decl_outs.items()} - for pipe_name, decl_outs in declared_outputs.items() - } if noskip else {} - assert exp == prj.get_outputs(not noskip) - - -@pytest.mark.parametrize("protomap", [None, PROTOMAP]) -@pytest.mark.parametrize("include_outputs", [False, True]) -def test_pipeline_identifier_collision_same_data(tmpdir, protomap, include_outputs): - """ Interface data that differs from another with same identifier is unexceptional. 
""" - - temproot = tmpdir.strpath - lines_groups = {WGBS_KEY: WGBS_IFACE_LINES, RRBS_KEY: RRBS_IFACE_LINES} - outputs = {k: DECLARED_OUTPUTS for k in lines_groups.keys()} \ - if include_outputs else None - - def write_iface(f, pm): - _write_iface_file(f, lines_groups, outputs, pm) - - iface_file_1 = os.path.join(temproot, "piface1.yaml") - write_iface(iface_file_1, protomap) - iface_file_2 = os.path.join(temproot, "piface2.yaml") - write_iface(iface_file_2, protomap) - - prj_dat = { - METADATA_KEY: { - OUTDIR_KEY: tmpdir.strpath, - PIPELINE_INTERFACES_KEY: [iface_file_1, iface_file_2] - } - } - prj = _write_and_build_prj(os.path.join(temproot, "pc.yaml"), prj_dat) - exp = {n: {k: (v, []) for k, v in DECLARED_OUTPUTS.items()} - for n in [WGBS_NAME, RRBS_NAME]} if include_outputs else {} - assert exp == prj.get_outputs(skip_sample_less=False) - - -@pytest.mark.parametrize("protomap", [None, PROTOMAP]) -@pytest.mark.parametrize("include_outputs", [False, True]) -@pytest.mark.parametrize("rep_key", [WGBS_KEY, RRBS_KEY]) -def test_pipeline_identifier_collision_different_data( - tmpdir, include_outputs, protomap, skip_sample_less, rep_key): - """ Interface data that differs from another with same identifier is exceptional. 
""" - temproot = tmpdir.strpath - - def write_iface(f, lines_group): - out_by_key = {k: DECLARED_OUTPUTS for k in lines_group} \ - if include_outputs else None - _write_iface_file(f, lines_group, out_by_key, pm=protomap) - - iface_file_1 = os.path.join(temproot, "piface1.yaml") - write_iface(iface_file_1, {rep_key: WGBS_IFACE_LINES}) - iface_file_2 = os.path.join(temproot, "piface2.yaml") - write_iface(iface_file_2, {rep_key: RRBS_IFACE_LINES}) - - def observe(): - prj_cfg = os.path.join(temproot, "pc.yaml") - prj_dat = { - METADATA_KEY: { - OUTDIR_KEY: tmpdir.strpath, - PIPELINE_INTERFACES_KEY: [iface_file_1, iface_file_2] - } - } - return _write_and_build_prj(prj_cfg, prj_dat).get_outputs(skip_sample_less) - - try: - observe() - except Exception as e: - pytest.fail("Unexpected exception: {}".format(e)) - - write_iface(iface_file_1, {rep_key: WGBS_IFACE_LINES[1:]}) - write_iface(iface_file_2, {rep_key: RRBS_IFACE_LINES[1:]}) - - # DEBUG - def print_iface(fp): - with open(fp, 'r') as f: - return yaml.load(f, yaml.SafeLoader) - - # DEBUG - print("First interface contents (below):\n{}\n".format(print_iface(iface_file_1))) - print("Second interface contents (below):\n{}".format(print_iface(iface_file_2))) - - with pytest.raises(DuplicatePipelineKeyException): - observe() - - -def test_sample_collection_accuracy(tmpdir, skip_sample_less, rna_pi_lines): - """ Names of samples collected for each pipeline are as expected. 
""" - temproot = tmpdir.strpath - samples = [("sampleA", WGBS_NAME), ("sample2", "HiChIP"), - ("sampleC", RNASEQ), ("sample4", "ATAC"), - ("sampleE", WGBS_NAME), ("sample6", "HiChIP"), - ("sampleG", RNASEQ), ("sample8", "ATAC")] - iface_files = list(get_temp_paths(2, temproot)) - anns_file = make_temp_file_path( - temproot, iface_files, - generate=lambda: "".join(randstr(LETTERS_AND_DIGITS, 20)) + ".csv") - with open(anns_file, 'w') as f: - f.write("\n".join("{},{}".format(*pair) for pair in - [(SAMPLE_NAME_COLNAME, ASSAY_KEY)] + samples)) - _write_iface_file( - iface_files[0], - lines_group_by_pipe_key={WGBS_KEY: WGBS_IFACE_LINES}, - outputs_by_pipe_key={WGBS_KEY: DECLARED_OUTPUTS}, pm=PROTOMAP) - with open(iface_files[1], 'w') as f: - for l in rna_pi_lines: - f.write(l) - prj_dat = { - METADATA_KEY: { - SAMPLE_ANNOTATIONS_KEY: anns_file, - OUTDIR_KEY: tmpdir.strpath, - PIPELINE_INTERFACES_KEY: iface_files - } - } - prj_cfg = make_temp_file_path(temproot, iface_files + [anns_file]) - prj = _write_and_build_prj(prj_cfg, prj_dat) - exp = { - WGBS_NAME: { - k: (v, [sn for sn, pn in samples if pn == WGBS_NAME]) - for k, v in DECLARED_OUTPUTS.items() - }, - RNA_PIPES["kallisto"].name: { - KALLISTO_ABUNDANCES_KEY: ( - KALLISTO_ABUNDANCES_TEMPLATE, - [sn for sn, prot in samples if prot == RNASEQ] - ) - } - } - assert exp == prj.get_outputs(skip_sample_less) - - -def get_temp_paths(n, folder, known=None, generate=randconf): - """ - Generate unique tempfile paths pointing to within a particular folder. - - :param int n: number of paths to generate - :param str folder: path to folder into which randomly generated filepaths - should point - :param Iterable[str] known: collection of filepaths to prohibit a - match to for a newly generated path - :param function() -> str generate: how to randomly generate a filename - :return Iterable[str]: collection of unique tempfile paths pointing to - within a particular folder. 
- """ - paths = set() - known = set(known or []) - gen = lambda pool: make_temp_file_path(folder, pool, generate) - while len(paths) < n: - p = gen(known) - known.add(p) - paths.add(p) - return paths - - -def make_temp_file_path(folder, known, generate=randconf): - """ - Generate a new tempfile path. - - :param str folder: path to folder that represents parent of path to - generate, i.e. the path to the folder to which a randomized filename - is to be joined - :param Iterable[str] known: collection of current filePATHs - :param function() -> str generate: how to generate fileNAME - :return str: randomly generated filepath that doesn't match a known value - """ - while True: - fp = os.path.join(folder, generate()) - if fp not in known: - return fp - - -def _write_and_build_prj(conf_file, conf_data): - """ - Write Project config data and create the instance. - - :param str conf_file: path to file to write - :param Mapping conf_data: Project config data - :return looper.Project: new Project instance - """ - with open(conf_file, 'w') as f: - yaml.dump(conf_data, f) - return LP(conf_file) - - -def _write_iface_file( - path_iface_file, lines_group_by_pipe_key, - outputs_by_pipe_key=None, pm=None): - """ - Write a pipeline interface file. 
- - :param str path_iface_file: path to the file to write - :param Mapping[str, Iterable[str]] lines_group_by_pipe_key: binding between - pipeline key and collection of lines that encode its specific - configuration data - :param Mapping[str, Mapping[str, str]] outputs_by_pipe_key: binding between - pipeline key and mapping from output type/kind name to path template - :param Mapping[str, str] pm: protocol mapping - :return str: path to the file written - """ - - folder = os.path.dirname(path_iface_file) - temps = [os.path.join(folder, randconf()) for _ in lines_group_by_pipe_key] - - def read_iface_data(fp, lines): - with open(fp, 'w') as f: - for l in lines: - f.write(l) - try: - with open(fp, 'r') as f: - return yaml.load(f, yaml.SafeLoader) - except yaml.scanner.ScannerError: - with open(fp, 'r') as f: - for l in f.readlines(): - print(l) - raise - - outputs_by_pipe_key = outputs_by_pipe_key or dict() - - dat_by_key = { - k: read_iface_data(tf, lines_group) for tf, (k, lines_group) - in zip(temps, lines_group_by_pipe_key.items())} - for k, outs in outputs_by_pipe_key.items(): - if k not in dat_by_key: - continue - dat_by_key[k][OUTKEY] = outs - - data = {PROTOMAP_KEY: pm or PROTOMAP, PL_KEY: dat_by_key} - with open(path_iface_file, 'w') as f: - yaml.dump(data, f) - - return path_iface_file - - -class PipeSpec(object): - """ Pipeline key and name """ - def __init__(self, key, name=None): - assert "" != os.path.splitext(key)[1] - self.key = key - self.name = name or key.rstrip(".py") - - -RNA_PIPES = {"kallisto": PipeSpec("rnaKallisto.py"), - "tophat": PipeSpec("rnaTopHat.py"), - "bitseq": PipeSpec("rnaBitSeq.py")} - - -@pytest.fixture(scope="function") -def rna_pi_lines(): - return """{pm_key}: - {rnaseq_proto_name}: [{bs_name}, {kall_name}, {th_name}] - SMART: [{bs_name}, {th_name}] - -pipelines: - {bs_key}: - name: {bs_name} - path: src/rnaBitSeq.py - arguments: - "--sample-name": sample_name - "--genome": transcriptome - "--input": data_source - 
"--single-or-paired": read_type - required_input_files: [data_source] - ngs_input_files: [data_source] - {res}: - {dr}: - file_size: "0" - cores: "6" - mem: "36000" - time: "2-00:00:00" - large: - file_size: "4" - cores: "6" - mem: "44000" - time: "2-00:00:00" - - {th_key}: - name: {th_name} - path: src/rnaTopHat.py - required_input_files: [data_source] - ngs_input_files: [data_source] - arguments: - "--sample-name": sample_name - "--genome": genome - "--input": data_source - "--single-or-paired": read_type - {res}: - {dr}: - file_size: "0" - cores: "2" - mem: "60000" - time: "7-00:00:00" - - {kall_key}: - name: {kall_name} - path: src/rnaKallisto.py - required_input_files: [data_source] - ngs_input_files: [data_source] - arguments: - "--sample-yaml": yaml_file - "--sample-name": sample_name - "--input": data_source - "--single-or-paired": read_type - optional_arguments: - "--input2": read2 - "--fragment-length": fragment_length - "--fragment-length-sdev": fragment_length_sdev - outputs: - {abundances_key}: \"{abundances_val}\" - {res}: - {dr}: - cores: "2" - mem: "4000" - time: "0-6:00:00" - normal: - min_file_size: "3" - cores: "2" - mem: "8000" - time: "0-12:00:00" -""".format( - pm_key=PROTOMAP_KEY, res=RESOURCES_KEY, dr=DEF_RES, rnaseq_proto_name=RNASEQ, - bs_key=RNA_PIPES["bitseq"].key, bs_name=RNA_PIPES["bitseq"].name, - th_key=RNA_PIPES["tophat"].key, th_name=RNA_PIPES["tophat"].name, - kall_key=RNA_PIPES["kallisto"].key, kall_name=RNA_PIPES["kallisto"].name, - abundances_key=KALLISTO_ABUNDANCES_KEY, - abundances_val=KALLISTO_ABUNDANCES_TEMPLATE).splitlines(True) diff --git a/tests/output/submission/TEST2_sample1.sub b/tests/output/submission/TEST2_sample1.sub new file mode 100644 index 000000000..a9e41ce4d --- /dev/null +++ b/tests/output/submission/TEST2_sample1.sub @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name='TEST2_sample1' +#SBATCH --output='../output/submission/TEST2_sample1.log' +#SBATCH --mem='{MEM}' +#SBATCH --cpus-per-task='{CORES}' +#SBATCH 
--time='{TIME}' +#SBATCH --partition='standard' +#SBATCH -m block +#SBATCH --ntasks=1 +#SBATCH --open-mode=append + +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +cmd="pipelines/other_pipeline2.py --sample-name sample1 " + +y=`echo "$cmd" | sed -e 's/^/srun /'` +eval "$y" + diff --git a/tests/output/submission/TEST2_sample2.sub b/tests/output/submission/TEST2_sample2.sub new file mode 100644 index 000000000..daea9db38 --- /dev/null +++ b/tests/output/submission/TEST2_sample2.sub @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name='TEST2_sample2' +#SBATCH --output='../output/submission/TEST2_sample2.log' +#SBATCH --mem='{MEM}' +#SBATCH --cpus-per-task='{CORES}' +#SBATCH --time='{TIME}' +#SBATCH --partition='standard' +#SBATCH -m block +#SBATCH --ntasks=1 +#SBATCH --open-mode=append + +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +cmd="pipelines/other_pipeline2.py --sample-name sample2 " + +y=`echo "$cmd" | sed -e 's/^/srun /'` +eval "$y" + diff --git a/tests/output/submission/TEST2_sample3.sub b/tests/output/submission/TEST2_sample3.sub new file mode 100644 index 000000000..2ffa25b94 --- /dev/null +++ b/tests/output/submission/TEST2_sample3.sub @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name='TEST2_sample3' +#SBATCH --output='../output/submission/TEST2_sample3.log' +#SBATCH --mem='{MEM}' +#SBATCH --cpus-per-task='{CORES}' +#SBATCH --time='{TIME}' +#SBATCH --partition='standard' +#SBATCH -m block +#SBATCH --ntasks=1 +#SBATCH --open-mode=append + +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +cmd="pipelines/other_pipeline2.py --sample-name sample3 " + +y=`echo "$cmd" | sed -e 's/^/srun /'` +eval "$y" + diff --git a/tests/output/submission/TEST_sample1.sub b/tests/output/submission/TEST_sample1.sub new file mode 100644 index 000000000..190ced0bb --- /dev/null +++ b/tests/output/submission/TEST_sample1.sub @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name='TEST_sample1' +#SBATCH 
--output='../output/submission/TEST_sample1.log' +#SBATCH --mem='{MEM}' +#SBATCH --cpus-per-task='{CORES}' +#SBATCH --time='{TIME}' +#SBATCH --partition='standard' +#SBATCH -m block +#SBATCH --ntasks=1 +#SBATCH --open-mode=append + +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +cmd="pipelines/pipeline1.py --sample-name sample1 " + +y=`echo "$cmd" | sed -e 's/^/srun /'` +eval "$y" + diff --git a/tests/output/submission/TEST_sample2.sub b/tests/output/submission/TEST_sample2.sub new file mode 100644 index 000000000..5f83a636a --- /dev/null +++ b/tests/output/submission/TEST_sample2.sub @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name='TEST_sample2' +#SBATCH --output='../output/submission/TEST_sample2.log' +#SBATCH --mem='{MEM}' +#SBATCH --cpus-per-task='{CORES}' +#SBATCH --time='{TIME}' +#SBATCH --partition='standard' +#SBATCH -m block +#SBATCH --ntasks=1 +#SBATCH --open-mode=append + +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +cmd="pipelines/pipeline1.py --sample-name sample2 " + +y=`echo "$cmd" | sed -e 's/^/srun /'` +eval "$y" + diff --git a/tests/output/submission/TEST_sample3.sub b/tests/output/submission/TEST_sample3.sub new file mode 100644 index 000000000..63ed924ed --- /dev/null +++ b/tests/output/submission/TEST_sample3.sub @@ -0,0 +1,19 @@ +#!/bin/bash +#SBATCH --job-name='TEST_sample3' +#SBATCH --output='../output/submission/TEST_sample3.log' +#SBATCH --mem='{MEM}' +#SBATCH --cpus-per-task='{CORES}' +#SBATCH --time='{TIME}' +#SBATCH --partition='standard' +#SBATCH -m block +#SBATCH --ntasks=1 +#SBATCH --open-mode=append + +echo 'Compute node:' `hostname` +echo 'Start time:' `date +'%Y-%m-%d %T'` + +cmd="pipelines/pipeline1.py --sample-name sample3 " + +y=`echo "$cmd" | sed -e 's/^/srun /'` +eval "$y" + diff --git a/tests/output/submission/sample1.yaml b/tests/output/submission/sample1.yaml new file mode 100644 index 000000000..c5991ee0b --- /dev/null +++ 
b/tests/output/submission/sample1.yaml @@ -0,0 +1,28 @@ +sample_name: sample1 +protocol: PROTO1 +data_source: SRA +SRR: SRR5210416 +Sample_geo_accession: GSM2471255 +read1: SRR5210416_1.fastq.gz +read2: SRR5210416_2.fastq.gz +pipeline_interfaces: pipeline_interface2.yaml +prj: + pep_version: 2.0.0 + name: test + sample_table: /var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/annotation_sheet.csv + looper: + output_dir: ../output + sample_modifiers: + append: + pipeline_interfaces: pipeline_interface2.yaml + derive: + attributes: + - read1 + - read2 + sources: + SRA_1: '{SRR}_1.fastq.gz' + SRA_2: '{SRR}_2.fastq.gz' +input_file_size: 0 +test_property: ~/sample/sample1_file.txt +test_property1: ~/sample/sample1_file1.txt +yaml_file: ../output/submission/sample1.yaml diff --git a/tests/output/submission/sample2.yaml b/tests/output/submission/sample2.yaml new file mode 100644 index 000000000..35c0490bc --- /dev/null +++ b/tests/output/submission/sample2.yaml @@ -0,0 +1,28 @@ +sample_name: sample2 +protocol: PROTO1 +data_source: SRA +SRR: SRR5210450 +Sample_geo_accession: GSM2471300 +read1: SRR5210450_1.fastq.gz +read2: SRR5210450_2.fastq.gz +pipeline_interfaces: pipeline_interface2.yaml +prj: + pep_version: 2.0.0 + name: test + sample_table: /var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/annotation_sheet.csv + looper: + output_dir: ../output + sample_modifiers: + append: + pipeline_interfaces: pipeline_interface2.yaml + derive: + attributes: + - read1 + - read2 + sources: + SRA_1: '{SRR}_1.fastq.gz' + SRA_2: '{SRR}_2.fastq.gz' +input_file_size: 0 +test_property: ~/sample/sample2_file.txt +test_property1: ~/sample/sample2_file1.txt +yaml_file: ../output/submission/sample2.yaml diff --git a/tests/output/submission/sample3.yaml b/tests/output/submission/sample3.yaml new file mode 100644 index 000000000..c8b86f72b --- /dev/null +++ b/tests/output/submission/sample3.yaml @@ -0,0 +1,28 @@ +sample_name: sample3 +protocol: PROTO2 +data_source: SRA +SRR: SRR5210398 
+Sample_geo_accession: GSM2471249 +read1: SRR5210398_1.fastq.gz +read2: SRR5210398_2.fastq.gz +pipeline_interfaces: pipeline_interface2.yaml +prj: + pep_version: 2.0.0 + name: test + sample_table: /var/folders/3f/0wj7rs2144l9zsgxd3jn5nxc0000gn/T/annotation_sheet.csv + looper: + output_dir: ../output + sample_modifiers: + append: + pipeline_interfaces: pipeline_interface2.yaml + derive: + attributes: + - read1 + - read2 + sources: + SRA_1: '{SRR}_1.fastq.gz' + SRA_2: '{SRR}_2.fastq.gz' +input_file_size: 0 +test_property: ~/sample/sample3_file.txt +test_property1: ~/sample/sample3_file1.txt +yaml_file: ../output/submission/sample3.yaml diff --git a/tests/smoketests/__init__.py b/tests/smoketests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/smoketests/conftest.py b/tests/smoketests/conftest.py new file mode 100644 index 000000000..4c84b7bbe --- /dev/null +++ b/tests/smoketests/conftest.py @@ -0,0 +1,124 @@ +import pytest +import os +import tempfile +from shutil import copyfile as cpf, rmtree +from looper.const import * +from peppy.const import * +from yaml import safe_load +import subprocess + +CFG = "project_config.yaml" +ST = "annotation_sheet.csv" +PIP = "pipeline_interface{}_project.yaml" +PIS = "pipeline_interface{}_sample.yaml" +OS = "output_schema.yaml" +RES = "resources-{}.tsv" + + +def get_outdir(pth): + """ + Get output directory from a config file + + :param str pth: + :return str: output directory + """ + with open(pth, 'r') as conf_file: + config_data = safe_load(conf_file) + return config_data[LOOPER_KEY][OUTDIR_KEY] + + +def is_in_file(fs, s, reverse=False): + """ + Verify if string is in files content + + :param str | Iterable[str] fs: list of files + :param str s: string to look for + :param bool reverse: whether the reverse should be checked + """ + if isinstance(fs, str): + fs = [fs] + for f in fs: + with open(f, 'r') as fh: + if reverse: + assert s not in fh.read() + else: + assert s in fh.read() + + +def 
subp_exec(pth=None, cmd=None, appendix=list(), dry=True): + """ + + :param str pth: config path + :param str cmd: looper subcommand + :param Iterable[str] appendix: other args to pass to the cmd + :return: + """ + x = ["looper", cmd, "-d" if dry else ""] + if pth: + x.append(pth) + x.extend(appendix) + proc = subprocess.Popen(x, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + stdout, stderr = proc.communicate() + return str(stdout), str(stderr), proc.returncode + + +@pytest.fixture +def example_pep_piface_path(): + return os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data") + + +@pytest.fixture +def example_pep_piface_path_cfg(example_pep_piface_path): + return os.path.join(example_pep_piface_path, CFG) + + +@pytest.fixture +def prep_temp_pep(example_pep_piface_path): + # temp dir + td = tempfile.mkdtemp() + out_td = os.path.join(td, "output") + # ori paths + cfg_path = os.path.join(example_pep_piface_path, CFG) + sample_table_path = os.path.join(example_pep_piface_path, ST) + piface1p_path = os.path.join(example_pep_piface_path, PIP.format("1")) + piface2p_path = os.path.join(example_pep_piface_path, PIP.format("2")) + piface1s_path = os.path.join(example_pep_piface_path, PIS.format("1")) + piface2s_path = os.path.join(example_pep_piface_path, PIS.format("2")) + output_schema_path = os.path.join(example_pep_piface_path, OS) + res_proj_path = os.path.join(example_pep_piface_path, RES.format("project")) + res_samp_path = os.path.join(example_pep_piface_path, RES.format("sample")) + # temp copies + temp_path_cfg = os.path.join(td, CFG) + temp_path_sample_table = os.path.join(td, ST) + temp_path_piface1s = os.path.join(td, PIS.format("1")) + temp_path_piface2s = os.path.join(td, PIS.format("2")) + temp_path_piface1p = os.path.join(td, PIP.format("1")) + temp_path_piface2p = os.path.join(td, PIP.format("2")) + temp_path_output_schema = os.path.join(td, OS) + temp_path_res_proj = os.path.join(td, RES.format("project")) + 
temp_path_res_samp = os.path.join(td, RES.format("sample")) + # copying + cpf(cfg_path, temp_path_cfg) + cpf(sample_table_path, temp_path_sample_table) + cpf(piface1s_path, temp_path_piface1s) + cpf(piface2s_path, temp_path_piface2s) + cpf(piface1p_path, temp_path_piface1p) + cpf(piface2p_path, temp_path_piface2p) + cpf(output_schema_path, temp_path_output_schema) + cpf(res_proj_path, temp_path_res_proj) + cpf(res_samp_path, temp_path_res_samp) + # modififactions + from yaml import safe_load, dump + with open(temp_path_cfg, 'r') as f: + piface_data = safe_load(f) + piface_data[LOOPER_KEY][OUTDIR_KEY] = out_td + piface_data[LOOPER_KEY][CLI_KEY] = {} + piface_data[LOOPER_KEY][CLI_KEY]["runp"] = {} + piface_data[LOOPER_KEY][CLI_KEY]["runp"][PIPELINE_INTERFACES_KEY] = \ + [temp_path_piface1p, temp_path_piface2p] + piface_data[SAMPLE_MODS_KEY][CONSTANT_KEY][PIPELINE_INTERFACES_KEY] = \ + [temp_path_piface1s, temp_path_piface2s] + with open(temp_path_cfg, 'w') as f: + dump(piface_data, f) + return temp_path_cfg diff --git a/tests/smoketests/test_other.py b/tests/smoketests/test_other.py new file mode 100644 index 000000000..b44ecac41 --- /dev/null +++ b/tests/smoketests/test_other.py @@ -0,0 +1,51 @@ +import pytest +from tests.smoketests.conftest import * +from looper.const import FLAGS +from peppy import Project + + +def _make_flags(cfg, type, count): + p = Project(cfg) + out_dir = p[CONFIG_KEY][LOOPER_KEY][OUTDIR_KEY] + for s in p.samples[:count]: + sf = os.path.join(out_dir, "results_pipeline", s[SAMPLE_NAME_ATTR]) + if not os.path.exists(sf): + os.makedirs(sf) + open(os.path.join(sf, type + ".flag"), 'a').close() + + +class LooperCheckTests: + @pytest.mark.parametrize("flag_id", FLAGS) + @pytest.mark.parametrize("count", list(range(2))) + def test_check_works(self, prep_temp_pep, flag_id, count): + """ Verify that checking works """ + tp = prep_temp_pep + _make_flags(tp, flag_id, count) + stdout, stderr, rc = subp_exec(tp, "check") + assert rc == 0 + print(stderr) + 
assert "{}: {}".format(flag_id.upper(), str(count)) in stderr + + @pytest.mark.parametrize("flag_id", FLAGS) + @pytest.mark.parametrize("count", list(range(2))) + def test_check_multi(self, prep_temp_pep, flag_id, count): + """ Verify that checking works when multiple flags are created """ + tp = prep_temp_pep + _make_flags(tp, flag_id, count) + _make_flags(tp, FLAGS[1], count) + stdout, stderr, rc = subp_exec(tp, "check") + assert rc == 0 + print(stderr) + if flag_id != FLAGS[1]: + assert "{}: {}".format(flag_id.upper(), str(count)) in stderr + + @pytest.mark.parametrize("flag_id", ["3333", "tonieflag", "bogus", "ms"]) + def test_check_bogus(self, prep_temp_pep, flag_id): + """ Verify that checking works when bogus flags are created """ + tp = prep_temp_pep + _make_flags(tp, flag_id, 1) + stdout, stderr, rc = subp_exec(tp, "check") + assert rc == 0 + print(stderr) + for f in FLAGS: + assert "{}: {}".format(f.upper(), "0") in stderr diff --git a/tests/smoketests/test_run.py b/tests/smoketests/test_run.py new file mode 100644 index 000000000..fe17ff9a9 --- /dev/null +++ b/tests/smoketests/test_run.py @@ -0,0 +1,475 @@ +import pytest +from tests.smoketests.conftest import * +from peppy.const import * +from looper.const import * +from yaml import dump + +CMD_STRS = ["string", " --string", " --sjhsjd 212", "7867#$@#$cc@@"] + + +class LooperBothRunsTests: + @pytest.mark.parametrize("cmd", ["run", "runp"]) + def test_looper_cfg_invalid(self, cmd): + """ Verify looper does not accept invalid cfg paths """ + stdout, stderr, rc = subp_exec("jdfskfds/dsjfklds/dsjklsf.yaml", cmd) + print(stderr) + assert rc != 0 + + @pytest.mark.parametrize("cmd", ["run", "runp"]) + def test_looper_cfg_required(self, cmd): + """ Verify looper does not accept invalid cfg paths """ + stdout, stderr, rc = subp_exec(pth="", cmd=cmd) + print(stderr) + assert rc != 0 + + @pytest.mark.parametrize("cmd", ["run", "runp"]) + @pytest.mark.parametrize("arg", [["--command-extra", CMD_STRS[0]], + 
["--command-extra", CMD_STRS[1]], + ["--command-extra", CMD_STRS[2]], + ["--command-extra", CMD_STRS[3]]]) + def test_cmd_extra_cli(self, prep_temp_pep, cmd, arg): + """ + Argument passing functionality works only for the above + configurations. Notably, it does not work for --command-extra '--arg'. + + See https://github.com/pepkit/looper/issues/245#issuecomment-621815222 + """ + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, cmd, arg) + sd = os.path.join(get_outdir(tp), "submission") + print(stderr) + assert rc == 0 + subs_list = \ + [os.path.join(sd, f) for f in os.listdir(sd) if f.endswith(".sub")] + is_in_file(subs_list, arg[1]) + + @pytest.mark.parametrize("cmd", ["run", "runp"]) + def test_unrecognized_args_not_passing(self, prep_temp_pep, cmd): + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, cmd, ["--unknown-arg", "4"]) + sd = os.path.join(get_outdir(tp), "submission") + print(stderr) + assert rc == 0 + subs_list = \ + [os.path.join(sd, f) for f in os.listdir(sd) if f.endswith(".sub")] + is_in_file(subs_list, "--unknown-arg", reverse=True) + + @pytest.mark.parametrize("cmd", ["run", "runp"]) + def test_run_after_init(self, prep_temp_pep, cmd): + tp = prep_temp_pep + dotfile_path = os.path.join(os.getcwd(), LOOPER_DOTFILE_NAME) + stdout, stderr, rc = subp_exec(tp, "init") + print(stderr) + print(stdout) + assert rc == 0 + is_in_file(dotfile_path, tp) + stdout, stderr, rc = subp_exec(cmd=cmd) + print(stderr) + print(stdout) + assert rc == 0 + os.remove(dotfile_path) + + +class LooperRunBehaviorTests: + def test_looper_run_basic(self, prep_temp_pep): + """ Verify looper runs in a basic case and return code is 0 """ + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc == 0 + + def test_looper_multi_pipeline(self, prep_temp_pep): + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert "Commands submitted: 6 of 6" in stderr + + def test_looper_single_pipeline(self, 
prep_temp_pep): + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + pifaces = \ + config_data[SAMPLE_MODS_KEY][CONSTANT_KEY][PIPELINE_INTERFACES_KEY] + config_data[SAMPLE_MODS_KEY][CONSTANT_KEY][PIPELINE_INTERFACES_KEY] = \ + pifaces[1] + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc == 0 + assert "Commands submitted: 6 of 6" not in stderr + + def test_looper_cli_pipeline(self, prep_temp_pep): + """ CLI-specified pipelines overwrite ones from config """ + tp = prep_temp_pep + pi_pth = os.path.join(os.path.dirname(tp), PIS.format("1")) + stdout, stderr, rc = subp_exec(tp, "run", + ["--pipeline-interfaces", pi_pth]) + print(stderr) + assert rc == 0 + assert "Commands submitted: 3 of 3" not in stdout + + def test_looper_no_pipeline(self, prep_temp_pep): + """ + No jobs are submitted and proper log is produced when there are no + valid pifaces defined + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + del config_data[SAMPLE_MODS_KEY][CONSTANT_KEY][PIPELINE_INTERFACES_KEY] + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc == 0 + assert "Jobs submitted: 0" in stderr + assert "No pipeline interfaces defined" + + def test_looper_pipeline_not_found(self, prep_temp_pep): + """ + Piface is ignored when when it does not exist + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + config_data[SAMPLE_MODS_KEY][CONSTANT_KEY][PIPELINE_INTERFACES_KEY] = \ + ["bogus"] + print("\nconfig_data: 
\n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc == 0 + assert "Jobs submitted: 0" in stderr + assert "Ignoring invalid pipeline interface source" + + def test_looper_pipeline_invalid(self, prep_temp_pep): + """ + Pipeline is ignored when does not validate successfully + agianst a schema + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + pifaces = config_data[SAMPLE_MODS_KEY][CONSTANT_KEY][ + PIPELINE_INTERFACES_KEY] + config_data[SAMPLE_MODS_KEY][CONSTANT_KEY][PIPELINE_INTERFACES_KEY] = \ + pifaces[1] + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + piface_path = os.path.join(os.path.dirname(tp), pifaces[1]) + with open(piface_path, 'r') as piface_file: + piface_data = safe_load(piface_file) + del piface_data["pipeline_name"] + with open(piface_path, 'w') as piface_file: + dump(piface_data, piface_file) + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc == 0 + assert "Jobs submitted: 0" in stderr + assert "Ignoring invalid pipeline interface source" + assert "'pipeline_name' is a required property" + + def test_looper_sample_attr_missing(self, prep_temp_pep): + """ + Piface is ignored when when it does not exist + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + del config_data[SAMPLE_MODS_KEY][CONSTANT_KEY]["attr"] + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc == 0 + assert "Jobs submitted: 0" in stderr + + def test_looper_sample_name_whitespace(self, prep_temp_pep): + """ + Piface is ignored when when it does 
not exist + """ + tp = prep_temp_pep + imply_whitespace = \ + [{IMPLIED_IF_KEY: {'sample_name': 'sample1'}, + IMPLIED_THEN_KEY: {'sample_name': 'sample whitespace'}}] + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + config_data[SAMPLE_MODS_KEY][IMPLIED_KEY] = imply_whitespace + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc != 0 + + def test_looper_toogle(self, prep_temp_pep): + """ + If all samples have tooggle attr set to 0, no jobs are submitted + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + config_data[SAMPLE_MODS_KEY][CONSTANT_KEY][SAMPLE_TOGGLE_ATTR] = 0 + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "run") + print(stderr) + assert rc == 0 + assert "Jobs submitted: 0" in stderr + + @pytest.mark.parametrize("arg", CMD_STRS) + def test_cmd_extra_sample(self, prep_temp_pep, arg): + """ + string set by sample_modifiers in Sample.command_extra shuld be + appended to the pipelinecommand + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + config_data[SAMPLE_MODS_KEY][CONSTANT_KEY]["command_extra"] = arg + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "run") + sd = os.path.join(get_outdir(tp), "submission") + print(stderr) + assert rc == 0 + subs_list = \ + [os.path.join(sd, f) for f in os.listdir(sd) if f.endswith(".sub")] + is_in_file(subs_list, arg) + + @pytest.mark.parametrize("arg", CMD_STRS) + def 
test_cmd_extra_override_sample(self, prep_temp_pep, arg): + """ + --command-extra-override should override the Sample.command_extra + and Project.looper.command_extra attributes appeneded to the + pipeline command + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + config_data[SAMPLE_MODS_KEY][CONSTANT_KEY]["command_extra"] = arg + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = \ + subp_exec(tp, "run", ["--command-extra-override='different'"]) + sd = os.path.join(get_outdir(tp), "submission") + print(stderr) + assert rc == 0 + subs_list = \ + [os.path.join(sd, f) for f in os.listdir(sd) if f.endswith(".sub")] + is_in_file(subs_list, arg, reverse=True) + + +class LooperRunpBehaviorTests: + def test_looper_runp_basic(self, prep_temp_pep): + """ Verify looper runps in a basic case and return code is 0 """ + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, "runp") + print(stderr) + assert rc == 0 + + def test_looper_multi_pipeline(self, prep_temp_pep): + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, "runp") + assert "Jobs submitted: 2" in stderr + + def test_looper_single_pipeline(self, prep_temp_pep): + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + piface_path = os.path.join(os.path.dirname(tp), PIP.format("1")) + config_data[LOOPER_KEY][CLI_KEY]["runp"][PIPELINE_INTERFACES_KEY] = \ + piface_path + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "runp") + print(stderr) + assert rc == 0 + assert "Jobs submitted: 2" not in stderr + assert "Jobs submitted: 1" in stderr + + @pytest.mark.parametrize("arg", CMD_STRS) + def test_cmd_extra_project(self, 
prep_temp_pep, arg): + """ + """ + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + config_data[LOOPER_KEY]["command_extra"] = arg + print("\nconfig_data: \n{}\n".format(config_data)) + with open(tp, 'w') as conf_file: + dump(config_data, conf_file) + stdout, stderr, rc = subp_exec(tp, "runp") + sd = os.path.join(get_outdir(tp), "submission") + print(stderr) + assert rc == 0 + subs_list = \ + [os.path.join(sd, f) for f in os.listdir(sd) if f.endswith(".sub")] + is_in_file(subs_list, arg) + + +class LooperRunSubmissionScriptTests: + def test_looper_run_produces_submission_scripts(self, prep_temp_pep): + tp = prep_temp_pep + with open(tp, 'r') as conf_file: + config_data = safe_load(conf_file) + print("\nconfig_data: \n{}\n".format(config_data)) + outdir = config_data[LOOPER_KEY][OUTDIR_KEY] + stdout, stderr, rc = subp_exec(tp, "run") + sd = os.path.join(outdir, "submission") + subm_err = \ + IOError("Not found in submission directory ({}): 6 " + "submission scripts (3 per pipeline) and 3 sample" + " YAML representations".format(sd)) + print(stderr) + assert rc == 0 + assert os.path.isdir(sd) + assert len(os.listdir(sd)) == 9, subm_err + assert sum([f.endswith(".sub") for f in os.listdir(sd)]) == 6, subm_err + assert sum([f.endswith(".yaml") for f in os.listdir(sd)]) == 3, subm_err + + def test_looper_lumping(self, prep_temp_pep): + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, "run", ["--lumpn", "2"]) + sd = os.path.join(get_outdir(tp), "submission") + subm_err = \ + IOError("Not found in submission directory ({}): 4 " + "submission scripts (2 per pipeline) and 3 sample" + " YAML representations. Listdir: \n{}". 
+ format(sd, os.listdir(sd))) + print(stderr) + assert rc == 0 + assert os.path.isdir(sd) + assert len(os.listdir(sd)) == 7, subm_err + assert sum([f.endswith(".sub") for f in os.listdir(sd)]) == 4, subm_err + assert sum([f.endswith(".yaml") for f in os.listdir(sd)]) == 3, subm_err + + def test_looper_lumping(self, prep_temp_pep): + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, "run", ["--lumpn", "2"]) + sd = os.path.join(get_outdir(tp), "submission") + subm_err = \ + IOError("Not found in submission directory ({}): 4 " + "submission scripts (2 per pipeline) and 3 sample" + " YAML representations. Listdir: \n{}". + format(sd, os.listdir(sd))) + print(stderr) + assert rc == 0 + assert os.path.isdir(sd) + assert sum([f.endswith(".sub") for f in os.listdir(sd)]) == 4, subm_err + assert sum([f.endswith(".yaml") for f in os.listdir(sd)]) == 3, subm_err + + def test_looper_limiting(self, prep_temp_pep): + tp = prep_temp_pep + stdout, stderr, rc = subp_exec(tp, "run", ["--limit", "2"]) + sd = os.path.join(get_outdir(tp), "submission") + subm_err = \ + IOError("Not found in submission directory ({}): 4 " + "submission scripts (2 per pipeline) and 2 sample " + "YAML representations. Listdir: \n{}". 
class LooperComputeTests:
    """ Smoke tests for compute package / settings resolution. """

    @pytest.mark.parametrize("cmd", ["run", "runp"])
    def test_looper_respects_pkg_selection(self, prep_temp_pep, cmd):
        """ --package local must not produce SLURM submission scripts. """
        cfg = prep_temp_pep
        stdout, stderr, rc = subp_exec(cfg, cmd, ["--package", "local"])
        subm_dir = os.path.join(get_outdir(cfg), "submission")
        print(stderr)
        assert rc == 0
        scripts = [os.path.join(subm_dir, e)
                   for e in os.listdir(subm_dir) if e.endswith(".sub")]
        is_in_file(scripts, "#SBATCH", reverse=True)

    @pytest.mark.parametrize("cmd", ["run", "runp"])
    def test_looper_uses_cli_compute_options_spec(self, prep_temp_pep, cmd):
        """ --compute key=value pairs end up in the submission scripts. """
        cfg = prep_temp_pep
        stdout, stderr, rc = subp_exec(
            cfg, cmd, ["--compute", "mem=12345", "--package", "slurm"])
        subm_dir = os.path.join(get_outdir(cfg), "submission")
        print(stderr)
        assert rc == 0
        scripts = [os.path.join(subm_dir, e)
                   for e in os.listdir(subm_dir) if e.endswith(".sub")]
        is_in_file(scripts, "#SBATCH --mem='12345'")

    @pytest.mark.parametrize("cmd", ["run", "runp"])
    def test_cli_yaml_settings_general(self, prep_temp_pep, cmd):
        """ A --settings YAML file is accepted without error. """
        cfg = prep_temp_pep
        settings_path = os.path.join(tempfile.mkdtemp(), "settings.yaml")
        with open(settings_path, 'w') as settings_fh:
            dump({"mem": "testin_mem"}, settings_fh)
        stdout, stderr, rc = \
            subp_exec(cfg, cmd, ["--settings", settings_path])
        print(stderr)
        assert rc == 0

    @pytest.mark.parametrize("cmd", ["run", "runp"])
    def test_nonexistent_yaml_settings_disregarded(self, prep_temp_pep, cmd):
        """ A missing --settings file is ignored rather than fatal. """
        cfg = prep_temp_pep
        stdout, stderr, rc = \
            subp_exec(cfg, cmd, ["--settings", "niema.yaml"])
        print(stderr)
        assert rc == 0
+ settings_file_path = os.path.join(td, "settings.yaml") + with open(settings_file_path, 'w') as sf: + dump({"mem": "testin_mem"}, sf) + stdout, stderr, rc = \ + subp_exec(tp, cmd, ["--settings", settings_file_path, "-p", "slurm"]) + print(stderr) + assert rc == 0 + sd = os.path.join(get_outdir(tp), "submission") + subs_list = [os.path.join(sd, f) + for f in os.listdir(sd) if f.endswith(".sub")] + is_in_file(subs_list, "testin_mem") + + @pytest.mark.parametrize("cmd", ["run", "runp"]) + def test_cli_compute_overwrites_yaml_settings_spec(self, prep_temp_pep, cmd): + tp = prep_temp_pep + td = tempfile.mkdtemp() + settings_file_path = os.path.join(td, "settings.yaml") + with open(settings_file_path, 'w') as sf: + dump({"mem": "testin_mem"}, sf) + stdout, stderr, rc = \ + subp_exec(tp, cmd, ["--settings", settings_file_path, + "--compute", "mem=10", + "-p", "slurm"]) + print(stderr) + assert rc == 0 + sd = os.path.join(get_outdir(tp), "submission") + subs_list = [os.path.join(sd, f) + for f in os.listdir(sd) if f.endswith(".sub")] + is_in_file(subs_list, "testin_mem", reverse=True) \ No newline at end of file diff --git a/update-usage-docs.sh b/update-usage-docs.sh index 496780d3e..ae2390f19 100755 --- a/update-usage-docs.sh +++ b/update-usage-docs.sh @@ -2,18 +2,21 @@ cp docs/usage.template usage.template #looper --help > USAGE.temp 2>&1 -for cmd in "--help" "run --help" "summarize --help" "destroy --help" "check --help" "clean --help" "rerun --help"; do +for cmd in "--help" "run --help" "runp --help" "rerun --help" "report --help" "table --help" "inspect --help" "init --help" "destroy --help" "check --help" "clean --help"; do echo $cmd echo -e "## \`looper $cmd\`" > USAGE_header.temp looper $cmd --help > USAGE.temp 2>&1 # sed -i 's/^/\t/' USAGE.temp - sed -i '1s/^/\n\`\`\`console\n/' USAGE.temp + sed -i.bak '1s;^;\`\`\`console\ +;' USAGE.temp +# sed -i '1s/^/\n\`\`\`console\n/' USAGE.temp echo -e "\`\`\`\n" >> USAGE.temp #sed -i -e "/\`looper $cmd\`/r USAGE.temp" -e 
'$G' usage.template # for -in place inserts cat USAGE_header.temp USAGE.temp >> usage.template # to append to the end done rm USAGE.temp rm USAGE_header.temp +rm USAGE.temp.bak mv usage.template docs/usage.md cat docs/usage.md #rm USAGE.temp