diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
new file mode 100644
index 00000000..f58e4c63
--- /dev/null
+++ b/.github/workflows/black.yml
@@ -0,0 +1,11 @@
+name: Lint
+
+on: [push, pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - uses: actions/setup-python@v2
+    - uses: psf/black@stable
diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
index 00522850..0da960fc 100644
--- a/.github/workflows/run-pytest.yml
+++ b/.github/workflows/run-pytest.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: [3.6, 3.7, 3.8]
+        python-version: [3.6, 3.7, 3.8, 3.9]
        os: [ubuntu-latest, macos-latest]
     steps:
diff --git a/.gitignore b/.gitignore
index b0197d9c..5aa97d38 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,7 +26,6 @@ tests/test/*
 # git has its own built in compression methods
 *.7z
 *.dmg
-*.gz
 *.iso
 *.jar
 *.rar
diff --git a/MANIFEST.in b/MANIFEST.in
index 29209598..d7febc6d 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,3 +1,4 @@
+include refgenconf/schemas/*
 include requirements/*
 include README.md
 include LICENSE.txt
diff --git a/codecov.yml b/codecov.yml
new file mode 100644
index 00000000..0645ae19
--- /dev/null
+++ b/codecov.yml
@@ -0,0 +1,2 @@
+ignore:
+  - "refgenconf/refgenconf_v03.py"
\ No newline at end of file
diff --git a/docs/changelog.md b/docs/changelog.md
index 213bf0ac..116912c6 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -1,6 +1,29 @@
 # Changelog
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.
 
+## [0.10.0] - 2021-03-11
+**After updating to this version your configuration file and genome assets will not be compatible with the software. Please refer to the [upgrade tutorial](config_upgrade_03_to_04.md) for instructions on how to migrate the config between versions.**
+
+### Changed
+
+- instead of using human-readable names as genome identifiers, refgenie now uses sequence-derived digests in the config
+- asset data moved to the `data` directory
+- asset files are now named after genome digests
+- refgenieserver APIv3 is now used for remote asset retrieval
+- `RefGenConf.genomes` becomes an `AliasedYacAttMap` object
+
+### Removed
+- `as_string` and `order` options from the `listr` method
+
+### Added
+- `upgrade_config` function for migrating the genome configuration file between versions
+- `RefGenConf.compare` method for determining the genome compatibility level
+- `as_digests` option in the `RefGenConf.listr` method
+- genome config validation at `RefGenConf` object instantiation and on every write
+- new progress bar in `RefGenConf.pull`
+- `RefGenConf.get_ta`
+- numerous `RefGenConf` object properties and methods related to genome alias handling: `genome_aliases`, `genome_aliases_table`, `alias_dir`, `data_dir`, `get_genome_alias`, `get_genome_alias_digest`, `remove_genome_aliases`, `set_genome_alias`, `initialize_genome`. Refer to the [API documentation](http://refgenie.databio.org/en/latest/autodoc_build/refgenconf/) for more specific information.
+- `get_asset_table` method, which displays a concise assets table ## [0.9.3] - 2020-09-02 diff --git a/refgenconf/__init__.py b/refgenconf/__init__.py index 03e5679d..c8de2e4e 100644 --- a/refgenconf/__init__.py +++ b/refgenconf/__init__.py @@ -1,12 +1,5 @@ from ._version import __version__ - +from .refgenconf import RefGenConf, upgrade_config +from .helpers import select_genome_config, get_dir_digest from .const import * from .exceptions import * -from .helpers import * -from .refgenconf import * - -__all__ = ["RefGenConf", "select_genome_config", "get_dir_digest", - "GenomeConfigFormatError", "MissingAssetError", - "MissingConfigDataError", "MissingGenomeError", "RefgenconfError", - "UnboundEnvironmentVariablesError"] + ["DEFAULT_SERVER"] + \ - CFG_KEY_NAMES diff --git a/refgenconf/_version.py b/refgenconf/_version.py index faa5dea2..61fb31ca 100644 --- a/refgenconf/_version.py +++ b/refgenconf/_version.py @@ -1 +1 @@ -__version__ = "0.9.3" \ No newline at end of file +__version__ = "0.10.0" diff --git a/refgenconf/const.py b/refgenconf/const.py index 075a8d9a..431af2df 100644 --- a/refgenconf/const.py +++ b/refgenconf/const.py @@ -5,12 +5,18 @@ but they're also integral to both refgenie and to refgenieserver. """ +import os + CFG_NAME = "genome configuration" CFG_ENV_VARS = ["REFGENIE"] CFG_CONST = ["CFG_NAME", "CFG_ENV_VARS"] DEFAULT_SERVER = "http://refgenomes.databio.org" -API_VERSION = "v2" +API_VERSION = "v3" +API_VERSION_2 = "v2" DEFAULT_TAG = "default" +DEFAULT_CONFIG_SCHEMA = os.path.join( + os.path.dirname(__file__), "schemas", "genome_config_schema.yaml" +) # file or dir names TEMPLATE_RECIPE_JSON = "build_recipe_{}__{}.json" @@ -18,38 +24,76 @@ TEMPLATE_LOG = "build_log_{}__{}.md" ORI_LOG_NAME = "refgenie_log.md" BUILD_STATS_DIR = "_refgenie_build" +ALIAS_DIR = "alias" +DATA_DIR = "data" -FILE_DIR_NAMES = ["TEMPLATE_RECIPE_JSON", "TEMPLATE_TARGET", "TEMPLATE_LOG", "ORI_LOG_NAME", "BUILD_STATS_DIR"] +FILE_DIR_NAMES = [ + "TEMPLATE_RECIPE_JSON", + "TEMPLATE_TARGET", + "TEMPLATE_LOG", + "ORI_LOG_NAME", + "BUILD_STATS_DIR", + "ALIAS_DIR", + "DATA_DIR", +] -# project-wide definition of the endpoint IDs. They are used to establish the way of communication between the server -# and the client so that changes of endpoint function names OR endpoints themselves do not influence the connection +# project-wide definition of the endpoint IDs. They are used to establish the +# way of communication between the server and the client so that changes of +# endpoint function names OR endpoints themselves do not influence the connection CUSTOM_PFX = "custom_Id" +API_ID_ALIAS_ALIAS = CUSTOM_PFX + "_alias_alias" +API_ID_ALIAS_DIGEST = CUSTOM_PFX + "_alias_digest" +API_ID_ALIASES_DICT = CUSTOM_PFX + "_aliases_dict" API_ID_ASSETS = CUSTOM_PFX + "_assets" API_ID_ARCHIVE = CUSTOM_PFX + "_archive" API_ID_DEFAULT_TAG = CUSTOM_PFX + "_default_tag" API_ID_ASSET_ATTRS = CUSTOM_PFX + "_asset_attrs" -API_ID_GENOME_ATTRS = "download_genome_attributes_v2_genome__genome__get" # temporary, default operation ID assigned by FastAPI. 
Switch to custom once refgenieserver supports it -# API_ID_GENOME_ATTRS = CUSTOM_PFX + "_genome_attrs" +API_ID_GENOME_ATTRS = CUSTOM_PFX + "_genome_attrs" API_ID_DIGEST = CUSTOM_PFX + "_asset_digest" API_ID_RECIPE = CUSTOM_PFX + "_asset_recipe" API_ID_LOG = CUSTOM_PFX + "_asset_log" API_ID_ARCHIVE_DIGEST = CUSTOM_PFX + "_asset_archive_digest" API_ID_SPLASH = CUSTOM_PFX + "_asset_splash" +API_ID_GENOMES_DICT = CUSTOM_PFX + "_genomes_dict" + +PRIVATE_API = "_private_api" -# this dictionary groups the operationIds so that they can be accessed as modules -# for systematic links generation in the splash pages +# this dictionary groups the operationIds so that they can be accessed as +# modules for systematic links generation in the splash pages OPERATION_IDS = { "asset": { - API_ID_ARCHIVE: "archive", API_ID_ASSET_ATTRS: "attributes", - API_ID_DIGEST: "asset digest", API_ID_ARCHIVE_DIGEST: "archive digest", - API_ID_RECIPE: "build recipe", API_ID_LOG: "build log" - } + API_ID_ARCHIVE: "archive", + API_ID_ASSET_ATTRS: "attributes", + API_ID_DIGEST: "asset digest", + API_ID_ARCHIVE_DIGEST: "archive digest", + API_ID_RECIPE: "build recipe", + API_ID_LOG: "build log", + }, + "v3_asset": { + API_VERSION + API_ID_ARCHIVE: "archive", + API_VERSION + API_ID_ASSET_ATTRS: "attributes", + API_VERSION + API_ID_DIGEST: "asset digest", + API_VERSION + API_ID_ARCHIVE_DIGEST: "archive digest", + API_VERSION + API_ID_RECIPE: "build recipe", + API_VERSION + API_ID_LOG: "build log", + }, } API_IDS = [ - "API_ID_ASSETS", "API_ID_ARCHIVE", "API_ID_DEFAULT_TAG", "API_ID_LOG", - "API_ID_DIGEST", "API_ID_RECIPE", "API_ID_ASSET_ATTRS", "API_ID_SPLASH", - "API_ID_ARCHIVE_DIGEST", "API_ID_GENOME_ATTRS" + "API_ID_ASSETS", + "API_ID_ARCHIVE", + "API_ID_DEFAULT_TAG", + "API_ID_LOG", + "API_ID_DIGEST", + "API_ID_RECIPE", + "API_ID_ASSET_ATTRS", + "API_ID_SPLASH", + "API_ID_ALIASES_DICT", + "API_ID_ARCHIVE_DIGEST", + "API_ID_ALIAS_ALIAS", + "API_ID_ALIAS_DIGEST", + "API_ID_GENOME_ATTRS", + "API_ID_GENOMES_DICT", ] CFG_FOLDER_KEY = "genome_folder" @@ -61,11 +105,13 @@ CFG_REMOTE_URL_BASE_KEY = "remote_url_base" CFG_VERSION_KEY = "config_version" CFG_GENOMES_KEY = "genomes" +CFG_ALIASES_KEY = "aliases" CFG_CHECKSUM_KEY = "genome_digest" CFG_GENOME_DESC_KEY = "genome_description" CFG_ASSETS_KEY = "assets" +CFG_GENOME_MASK_KEY = "genome_mask" CFG_ASSET_PATH_KEY = "asset_path" CFG_ASSET_SIZE_KEY = "asset_size" CFG_ASSET_DESC_KEY = "asset_description" @@ -82,22 +128,57 @@ CFG_ASSET_RELATIVES_KEYS = [CFG_ASSET_CHILDREN_KEY, CFG_ASSET_PARENTS_KEY] CFG_TOP_LEVEL_KEYS = [ - CFG_FOLDER_KEY, CFG_SERVER_KEY, CFG_SERVERS_KEY, CFG_ARCHIVE_KEY, CFG_GENOMES_KEY, - CFG_VERSION_KEY, CFG_ARCHIVE_CONFIG_KEY, CFG_ARCHIVE_KEY_OLD, CFG_REMOTE_URL_BASE_KEY] -CFG_GENOME_KEYS = [ - CFG_GENOME_DESC_KEY, CFG_ASSETS_KEY, CFG_CHECKSUM_KEY] + CFG_FOLDER_KEY, + CFG_SERVER_KEY, + CFG_SERVERS_KEY, + CFG_ARCHIVE_KEY, + CFG_GENOMES_KEY, + CFG_ALIASES_KEY, + CFG_VERSION_KEY, + CFG_ARCHIVE_CONFIG_KEY, + CFG_ARCHIVE_KEY_OLD, + CFG_REMOTE_URL_BASE_KEY, +] +CFG_GENOME_KEYS = [CFG_GENOME_DESC_KEY, CFG_ASSETS_KEY, CFG_CHECKSUM_KEY] CFG_GENOME_ATTRS_KEYS = [CFG_GENOME_DESC_KEY, CFG_CHECKSUM_KEY] -CFG_SINGLE_ASSET_SECTION_KEYS = [CFG_ASSET_PATH_KEY, CFG_ASSET_DESC_KEY, CFG_ASSET_SIZE_KEY, CFG_ARCHIVE_SIZE_KEY, - CFG_ARCHIVE_CHECKSUM_KEY, CFG_SEEK_KEYS_KEY] +CFG_SINGLE_ASSET_SECTION_KEYS = [ + CFG_ASSET_PATH_KEY, + CFG_ASSET_DESC_KEY, + CFG_ASSET_SIZE_KEY, + CFG_ARCHIVE_SIZE_KEY, + CFG_ARCHIVE_CHECKSUM_KEY, + CFG_SEEK_KEYS_KEY, + CFG_GENOME_MASK_KEY, +] RGC_REQ_KEYS = 
[CFG_SERVERS_KEY, CFG_FOLDER_KEY, CFG_GENOMES_KEY, CFG_VERSION_KEY] CFG_KEY_NAMES = [ - "CFG_FOLDER_KEY", "CFG_SERVER_KEY", "CFG_SERVERS_KEY", "CFG_GENOMES_KEY", - "CFG_ASSET_PATH_KEY", "CFG_ASSET_DESC_KEY", "CFG_ARCHIVE_KEY", "CFG_ARCHIVE_SIZE_KEY", "CFG_SEEK_KEYS_KEY", - "CFG_ASSET_SIZE_KEY", "CFG_CHECKSUM_KEY", "CFG_ARCHIVE_CHECKSUM_KEY", "CFG_VERSION_KEY", "CFG_ASSET_PARENTS_KEY", - "CFG_ASSET_CHILDREN_KEY", "CFG_TAG_DESC_KEY", "CFG_ASSET_CHECKSUM_KEY", "CFG_ASSET_TAGS_KEY", - "CFG_ASSET_RELATIVES_KEYS", "CFG_ARCHIVE_CONFIG_KEY", "CFG_ARCHIVE_KEY_OLD", "CFG_REMOTE_URL_BASE_KEY"] + "CFG_FOLDER_KEY", + "CFG_SERVER_KEY", + "CFG_SERVERS_KEY", + "CFG_GENOMES_KEY", + "CFG_GENOME_MASK_KEY", + "CFG_ALIASES_KEY", + "CFG_ASSET_PATH_KEY", + "CFG_ASSET_DESC_KEY", + "CFG_ARCHIVE_KEY", + "CFG_ARCHIVE_SIZE_KEY", + "CFG_SEEK_KEYS_KEY", + "CFG_ASSET_SIZE_KEY", + "CFG_CHECKSUM_KEY", + "CFG_ARCHIVE_CHECKSUM_KEY", + "CFG_VERSION_KEY", + "CFG_ASSET_PARENTS_KEY", + "CFG_ASSET_CHILDREN_KEY", + "CFG_TAG_DESC_KEY", + "CFG_ASSET_CHECKSUM_KEY", + "CFG_ASSET_TAGS_KEY", + "CFG_ASSET_RELATIVES_KEYS", + "CFG_ARCHIVE_CONFIG_KEY", + "CFG_ARCHIVE_KEY_OLD", + "CFG_REMOTE_URL_BASE_KEY", +] # hook identifiers PRE_UPDATE_HOOK = "pre_update" @@ -110,22 +191,63 @@ POST_LIST_HOOK = "post_list" # HOOKS is a list of all available plugin entry points HOOK_NAMES = [ - "PRE_LIST_HOOK", "PRE_PULL_HOOK", "PRE_TAG_HOOK", "PRE_UPDATE_HOOK", - "POST_TAG_HOOK", "POST_LIST_HOOK", "POST_PULL_HOOK", "POST_UPDATE_HOOK"] + "PRE_LIST_HOOK", + "PRE_PULL_HOOK", + "PRE_TAG_HOOK", + "PRE_UPDATE_HOOK", + "POST_TAG_HOOK", + "POST_LIST_HOOK", + "POST_PULL_HOOK", + "POST_UPDATE_HOOK", +] HOOKS = [eval(x) for x in HOOK_NAMES] # other consts -REQ_CFG_VERSION = 0.3 -REFGENIE_BY_CFG = {"0.3": "0.7.0", "0.2": "0.6.0"} -ATTRS_COPY_PULL = [CFG_ASSET_DESC_KEY, CFG_SEEK_KEYS_KEY, CFG_ASSET_PARENTS_KEY, CFG_ASSET_PATH_KEY, - CFG_ASSET_CHECKSUM_KEY, CFG_TAG_DESC_KEY] +REQ_CFG_VERSION = 0.4 +REFGENIE_BY_CFG = {"0.4": "0.10.0", "0.3": "0.7.0", "0.2": "0.6.0"} +CFG_UPGRADE = {"0.3": ["0.4"]} +ATTRS_COPY_PULL = [ + CFG_ASSET_DESC_KEY, + CFG_SEEK_KEYS_KEY, + CFG_ASSET_PARENTS_KEY, + CFG_ASSET_PATH_KEY, + CFG_ASSET_CHECKSUM_KEY, + CFG_TAG_DESC_KEY, +] REQ_TAG_ATTRS = [CFG_ASSET_PATH_KEY, CFG_SEEK_KEYS_KEY] -CUSTOM_BAR_FMT = "{desc}{percentage:3.0f}%|{bar}| {n_fmt} [{elapsed}<{remaining}{rate_fmt}{postfix}]" +CUSTOM_BAR_FMT = "{desc}{percentage:3.0f}%|{bar}| {n_fmt} [{elapsed}<{remaining} {rate_fmt}{postfix}]" -__all__ = ["DEFAULT_SERVER", "CFG_ASSET_DEFAULT_TAG_KEY", "CFG_KEY_NAMES", "CFG_GENOME_DESC_KEY", "REQ_CFG_VERSION", - "CFG_ASSETS_KEY", "CFG_GENOME_ATTRS_KEYS", "REFGENIE_BY_CFG", "DEFAULT_TAG", "ATTRS_COPY_PULL", - "RGC_REQ_KEYS", "REQ_TAG_ATTRS", "CUSTOM_BAR_FMT", "API_VERSION", "CONF_STRUCTURE", "OPERATION_IDS", - "CUSTOM_PFX", "HOOKS"] + FILE_DIR_NAMES + CFG_CONST + CFG_KEY_NAMES + API_IDS + HOOK_NAMES +__all__ = ( + [ + "DEFAULT_SERVER", + "CFG_ASSET_DEFAULT_TAG_KEY", + "CFG_KEY_NAMES", + "CFG_GENOME_DESC_KEY", + "REQ_CFG_VERSION", + "CFG_ASSETS_KEY", + "CFG_GENOME_ATTRS_KEYS", + "REFGENIE_BY_CFG", + "CFG_UPGRADE", + "DEFAULT_TAG", + "ATTRS_COPY_PULL", + "RGC_REQ_KEYS", + "REQ_TAG_ATTRS", + "CUSTOM_BAR_FMT", + "API_VERSION", + "API_VERSION_2", + "CONF_STRUCTURE", + "OPERATION_IDS", + "CUSTOM_PFX", + "PRIVATE_API", + "HOOKS", + "DEFAULT_CONFIG_SCHEMA", + ] + + FILE_DIR_NAMES + + CFG_CONST + + CFG_KEY_NAMES + + API_IDS + + HOOK_NAMES +) CONF_STRUCTURE = """ # example genome configuration structure @@ -135,33 +257,47 @@ {archive}: /path/to/archives 
 {genomes}:
-  hg38:
+  fcdd62cb90e86d03e45dcd05efa70d8bdc9577d5c6259cf5:
+    {aliases}: ['hg38']
     {desc_genome}: Reference assembly GRCh38, released in Dec 2013
-    {digest}: 1110349234n20349280345df5035
     {assets}:
-      bowtie2_index:
+      fasta:
         {default}: tag_name
-        {desc_asset}: Genome index for bowtie2, produced with bowtie2-build
+        {desc_asset}: DNA sequences in the FASTA format, indexed FASTA (produced with samtools index) and chromosome sizes file
         {tags}:
           tag_name:
-            {asset_path}: bowtie2_index
-            {tag_description}: produced with this settings/version of the bowtie2 software
-            {archive_digest}: 2220349234n20349280345mv2035
-            {asset_digest}: 4420349234n20349jkn5jk4nj34n
-            {asset_size}: 32G
-            {archive_size}: 7G
-            {asset_parents}:
-            {asset_children}: ["fasta:default"]
+            {asset_path}: fasta
+            {archive_digest}: 35ae9a42c36c126f9d8ef6d938a122d0
+            {asset_digest}: 3aff393d290884336945534ea709d30e
+            {asset_size}: 3.0GB
+            {archive_size}: 938.3MB
+            {asset_parents}: []
+            {asset_children}: []
             {seek_keys}:
-              fasta: hg38.fa.gz
-              fai: hg38.fa.fai
-              chrom_sizes: sizes.txt
-""".format(folder=CFG_FOLDER_KEY, server=CFG_SERVERS_KEY, version=CFG_VERSION_KEY, assets=CFG_ASSETS_KEY,
-           archive=CFG_ARCHIVE_KEY, digest=CFG_CHECKSUM_KEY, genomes=CFG_GENOMES_KEY,
-           desc_genome=CFG_GENOME_DESC_KEY, asset_path=CFG_ASSET_PATH_KEY, desc_asset=CFG_ASSET_DESC_KEY,
-           archive_digest=CFG_ARCHIVE_CHECKSUM_KEY, asset_size=CFG_ASSET_SIZE_KEY, archive_size=CFG_ARCHIVE_SIZE_KEY,
-           seek_keys=CFG_SEEK_KEYS_KEY, asset_parents=CFG_ASSET_PARENTS_KEY, asset_children=CFG_ASSET_CHILDREN_KEY,
-           default=CFG_ASSET_DEFAULT_TAG_KEY, tags=CFG_ASSET_TAGS_KEY, asset_digest=CFG_ASSET_CHECKSUM_KEY,
-           tag_description=CFG_TAG_DESC_KEY, v=REQ_CFG_VERSION)
-
-
+              fasta: fcdd62cb90e86d03e45dcd05efa70d8bdc9577d5c6259cf5.fa.gz
+              fai: fcdd62cb90e86d03e45dcd05efa70d8bdc9577d5c6259cf5.fa.fai
+              chrom_sizes: fcdd62cb90e86d03e45dcd05efa70d8bdc9577d5c6259cf5.chrom.sizes
+""".format(
+    folder=CFG_FOLDER_KEY,
+    server=CFG_SERVERS_KEY,
+    version=CFG_VERSION_KEY,
+    assets=CFG_ASSETS_KEY,
+    archive=CFG_ARCHIVE_KEY,
+    digest=CFG_CHECKSUM_KEY,
+    genomes=CFG_GENOMES_KEY,
+    aliases=CFG_ALIASES_KEY,
+    desc_genome=CFG_GENOME_DESC_KEY,
+    asset_path=CFG_ASSET_PATH_KEY,
+    desc_asset=CFG_ASSET_DESC_KEY,
+    archive_digest=CFG_ARCHIVE_CHECKSUM_KEY,
+    asset_size=CFG_ASSET_SIZE_KEY,
+    archive_size=CFG_ARCHIVE_SIZE_KEY,
+    seek_keys=CFG_SEEK_KEYS_KEY,
+    asset_parents=CFG_ASSET_PARENTS_KEY,
+    asset_children=CFG_ASSET_CHILDREN_KEY,
+    default=CFG_ASSET_DEFAULT_TAG_KEY,
+    tags=CFG_ASSET_TAGS_KEY,
+    asset_digest=CFG_ASSET_CHECKSUM_KEY,
+    tag_description=CFG_TAG_DESC_KEY,
+    v=REQ_CFG_VERSION,
+)
diff --git a/refgenconf/exceptions.py b/refgenconf/exceptions.py
index a8e88042..b841e1f4 100644
--- a/refgenconf/exceptions.py
+++ b/refgenconf/exceptions.py
@@ -2,30 +2,43 @@
 import abc
 
-__all__ = ["DownloadJsonError", "GenomeConfigFormatError", "MissingAssetError", "MissingRecipeError",
-           "MissingConfigDataError", "MissingGenomeError", "MissingSeekKeyError", "MissingTagError",
-           "RefgenconfError", "UnboundEnvironmentVariablesError", "ConfigNotCompliantError",
-           "RemoteDigestMismatchError", "UndefinedAliasError"]
+__all__ = [
+    "DownloadJsonError",
+    "GenomeConfigFormatError",
+    "MissingAssetError",
+    "MissingRecipeError",
+    "MissingConfigDataError",
+    "MissingGenomeError",
+    "MissingSeekKeyError",
+    "MissingTagError",
+    "RefgenconfError",
+    "UnboundEnvironmentVariablesError",
+    "ConfigNotCompliantError",
+    "RemoteDigestMismatchError",
+]
 
 DOC_URL = "http://refgenie.databio.org/en/latest/genome_config/"
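+# All concrete errors below derive from RefgenconfError, so client code can
+# catch the base type to handle any refgenconf failure. A minimal handling
+# sketch (hypothetical usage; `rgc`, `genome`, and `asset` are assumed to
+# exist, and the `seek`/`pull` call signatures are simplified):
+#
+#     try:
+#         path = rgc.seek(genome, asset)
+#     except MissingAssetError:
+#         rgc.pull(genome, asset)
+#     except RefgenconfError as e:
+#         print(f"refgenconf error: {e}")
+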
class RefgenconfError(Exception): """ Base exception type for this package """ + __metaclass__ = abc.ABCMeta class DownloadJsonError(RefgenconfError): """ Non-OK response from a JSON download attempt """ + def __init__(self, resp): super(DownloadJsonError, self).__init__( - "No response provided" if resp is None else - "JSON: {}".format(resp.json())) + "No response provided" if resp is None else "JSON: {}".format(resp.json()) + ) self.response = resp class GenomeConfigFormatError(RefgenconfError): """ Exception for invalid genome config file format. """ + def __init__(self, msg): spacing = " " if msg[-1] in ["?", ".", "\n"] else "; " suggest = "For config format documentation please see " + DOC_URL @@ -34,54 +47,61 @@ def __init__(self, msg): class MissingAssetError(RefgenconfError): """ Error type for request of an unavailable genome asset. """ + pass class MissingTagError(RefgenconfError): """ Error type for request of an unavailable asset tag. """ + pass class MissingSeekKeyError(RefgenconfError): """ Error type for request of an unavailable asset seek key. """ + pass class MissingRecipeError(RefgenconfError): """ Error type for request of an unavailable recipe. """ + pass class MissingConfigDataError(RefgenconfError): """ Missing required configuration instance items """ + pass class ConfigNotCompliantError(GenomeConfigFormatError): """ The format of the config file does not match required version/standards """ + pass class MissingGenomeError(RefgenconfError): """ Error type for request of unknown genome/assembly. """ + pass class UnboundEnvironmentVariablesError(RefgenconfError): """ Use of environment variable that isn't bound to a value. """ + pass class RemoteDigestMismatchError(RefgenconfError): """ Remote digest of the parent asset does not match its local counterpart """ + def __init__(self, asset, local_digest, remote_digest): - msg = "This asset is built from parent asset '{}', but for this parent, the remote does not " \ - "match your local asset (local: {}; remote: {}). Refgenie will not pull this asset " \ - "because the remote version was not built from the same parent asset you have locally." \ - .format(asset, local_digest, remote_digest) + msg = ( + "This asset is built from parent asset '{}', but for this parent, the remote does not " + "match your local asset (local: {}; remote: {}). Refgenie will not pull this asset " + "because the remote version was not built from the same parent asset you have locally.".format( + asset, local_digest, remote_digest + ) + ) super(RemoteDigestMismatchError, self).__init__(msg) - - -class UndefinedAliasError(RefgenconfError): - """ Alias is is not defined. 
""" - pass diff --git a/refgenconf/helpers.py b/refgenconf/helpers.py index 470819f2..ae344141 100644 --- a/refgenconf/helpers.py +++ b/refgenconf/helpers.py @@ -2,10 +2,22 @@ import os from yacman import select_config -from .const import CFG_ENV_VARS, BUILD_STATS_DIR +from .const import * +from .exceptions import DownloadJsonError, MissingAssetError +from .seqcol import SeqColClient + +import json from re import sub +import requests +from requests import get from ubiquerg import is_command_callable +import logging +import shutil +from copy import copy +from functools import partial +from requests import ConnectionError +_LOGGER = logging.getLogger(__name__) __all__ = ["select_genome_config", "get_dir_digest"] @@ -39,6 +51,7 @@ def unbound_env_vars(path): def asciify_json_dict(json_dict): from ubiquerg.collection import asciify_dict + return asciify_dict(json_dict) @@ -53,19 +66,263 @@ def get_dir_digest(path, pm=None): :return str: a digest, e.g. a3c46f201a3ce7831d85cf4a125aa334 """ if not is_command_callable("md5sum"): - raise OSError("md5sum command line tool is required for asset digest " - "calculation. \n" - "Install and try again, e.g on macOS: 'brew install " - "md5sha1sum'") - cmd = "cd {}; find . -type f -not -path './" + BUILD_STATS_DIR + \ - "*' -exec md5sum {{}} \; | sort -k 2 | awk '{{print $1}}' | md5sum" + raise OSError( + "md5sum command line tool is required for asset digest " + "calculation. \n" + "Install and try again, e.g on macOS: 'brew install " + "md5sha1sum'" + ) + cmd = ( + "cd {}; find . -type f -not -path './" + + BUILD_STATS_DIR + + "*' -exec md5sum {{}} \; | sort -k 2 | awk '{{print $1}}' | md5sum" + ) try: x = pm.checkprint(cmd.format(path)) except AttributeError: try: from subprocess import check_output + x = check_output(cmd.format(path), shell=True).decode("utf-8") except Exception as e: - + _LOGGER.warning( + "{}: could not calculate digest for '{}'".format( + e.__class__.__name__, path + ) + ) return - return str(sub(r'\W+', '', x)) # strips non-alphanumeric + return str(sub(r"\W+", "", x)) # strips non-alphanumeric + + +def format_config_03_04(rgc, get_json_url): + """ + upgrade the v0.3 config file format to v0.4 format: + get genome digests from the server or local fasta assets, + use the genome digests as primary key, + add 'aliases' section to the config, + remove 'genome_digests' section from the config + replace all aliases in keys/asset names with genome digests + + :param obj rgc: RefGenConfV03 obj + :param function(str, str) -> str get_json_url: how to build URL from + genome server URL base, genome, and asset + """ + + _LOGGER.info("Upgrading v0.3 config file format to v0.4.") + + for genome, genome_v in rgc[CFG_GENOMES_KEY].items(): + digest = "" + try: + _LOGGER.info( + f"Generating the digest from a local fasta file, " + f"and createing the ASDs for {genome}." + ) + tag = rgc.get_default_tag(genome, "fasta") + asset_path = rgc.seek(genome, "fasta", tag, "fasta") + ssc = SeqColClient({}) + digest, asdl = ssc.load_fasta(asset_path) + _LOGGER.info(f"Generated {genome} digest from local fasta file: {digest}") + # retrieve annotated sequence digests list to save in a JSON file + pth = os.path.join(rgc[CFG_FOLDER_KEY], genome, genome + "__ASDs.json") + os.makedirs(os.path.dirname(pth), exist_ok=True) + with open(pth, "w") as jfp: + json.dump(asdl, jfp) + _LOGGER.info(f"Saved ASDs to JSON: {pth}") + except (MissingAssetError, FileNotFoundError): + _LOGGER.info( + f"No local fasta asset found for {genome}. Retrieving digest from the server." 
+            )
+            # get genome digest from the server
+            cnt = 0
+            servers = rgc[CFG_SERVERS_KEY]
+            for server in servers:
+                cnt += 1
+                if not digest:
+                    try:
+                        url_alias = get_json_url(
+                            s=server, i=API_VERSION + API_ID_ALIAS_DIGEST
+                        ).format(alias=genome)
+                        digest = download_json(url_alias)
+                        _LOGGER.info(
+                            f"Retrieved {genome} digest from the server: {digest}"
+                        )
+                    except (KeyError, ConnectionError, DownloadJsonError) as e:
+                        if cnt == len(servers):
+                            _LOGGER.info(
+                                f"Failed to retrieve the digest for {genome}."
+                            )
+                            continue
+                        continue
+
+        if digest:
+            # convert seek keys, children/parent asset keys from aliases to
+            # genome digests
+            rgc[CFG_GENOMES_KEY][genome] = replace_str_in_obj(genome_v, genome, digest)
+            # use the genome digest as primary keys
+            rgc[CFG_GENOMES_KEY][digest] = rgc[CFG_GENOMES_KEY].pop(genome)
+            # create "aliases" section
+            rgc[CFG_GENOMES_KEY][digest][CFG_ALIASES_KEY] = [genome]
+            # remove old "genome_digest" section
+            del rgc[CFG_GENOMES_KEY][digest][CFG_CHECKSUM_KEY]
+        else:
+            del rgc[CFG_GENOMES_KEY][genome]
+
+
+def alter_file_tree_03_04(rgc, link_fun):
+    """
+    Update the file structure inside genome_folder:
+    drop genomes for which the genome digest is not available
+    on any of the servers and which do not have a local fasta asset;
+    the contents of genome_folder will be replaced by 'alias' and 'data'
+    directories
+
+    :param obj rgc: RefGenConfV03 obj
+    :param callable link_fun: function to use to link files, e.g os.symlink
+        or os.link
+    """
+    my_genome = {}
+    for k, v in rgc[CFG_GENOMES_KEY].items():
+        my_genome.update([(v[CFG_ALIASES_KEY][0], k)])
+
+    _LOGGER.info(
+        f"Creating '{DATA_DIR}' and '{ALIAS_DIR}' directories in "
+        f"'{rgc[CFG_FOLDER_KEY]}'."
+    )
+    os.mkdir(os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], DATA_DIR)))
+    os.mkdir(os.path.abspath(os.path.join(rgc[CFG_FOLDER_KEY], ALIAS_DIR)))
+
+    _LOGGER.info(
+        f"Copying assets to '{DATA_DIR}' and creating alias symlinks in "
+        f"'{ALIAS_DIR}'. Genomes for which the digest could not be determined "
+        f"will be ignored."
+    )
+    for root, dirs, files in os.walk(rgc[CFG_FOLDER_KEY]):
+        for dir in dirs:
+            if dir in my_genome:
+                shutil.copytree(
+                    os.path.join(rgc[CFG_FOLDER_KEY], dir),
+                    os.path.join(rgc[CFG_FOLDER_KEY], DATA_DIR, dir),
+                    symlinks=True,
+                )
+        del dirs[:]
+
+    for root, dirs, files in os.walk(os.path.join(rgc[CFG_FOLDER_KEY], DATA_DIR)):
+        for dir in dirs:
+            swap_names_in_tree(os.path.join(root, dir), my_genome[dir], dir)
+            os.mkdir(os.path.join(rgc[CFG_FOLDER_KEY], ALIAS_DIR, dir))
+            # create symlink for alias folder
+            for genome, assets, files in os.walk(os.path.join(root, my_genome[dir])):
+                for asset in assets:
+                    old_path = os.path.join(genome, asset)
+                    new_path = old_path.replace(my_genome[dir], dir).replace(
+                        DATA_DIR, ALIAS_DIR
+                    )
+                    os.mkdir(new_path)
+
+                for file in files:
+                    des_path = os.path.join(genome, file)  # current file
+                    src_path = (
+                        os.path.realpath(des_path)
+                        .replace(
+                            os.path.realpath(rgc[CFG_FOLDER_KEY]),
+                            os.path.join(rgc[CFG_FOLDER_KEY], DATA_DIR),
+                        )
+                        .replace(dir, my_genome[dir])
+                    )  # replace /genome_folder with /genome_folder/data
+                    # replace alias in the file name with genome digest
+
+                    if os.path.islink(des_path):  # if the current file is a link
+                        os.remove(
+                            des_path
+                        )  # remove the link that would not work after deleting the old genome asset
+                        link_fun(src_path, des_path)  # create the link with correct src
+
+                    old_path = os.path.join(
+                        genome, file
+                    )  # path of the file in data dir
+                    new_path = old_path.replace(
+                        my_genome[dir], dir
+                    ).replace(  # path of the file in alias
+                        DATA_DIR, ALIAS_DIR
+                    )
+
+                    rel_old_path = os.path.join(
+                        os.path.relpath(
+                            os.path.dirname(old_path), os.path.dirname(new_path)
+                        ),
+                        os.path.basename(old_path),
+                    )
+                    link_fun(rel_old_path, new_path)
+        del dirs[:]
+
+    _LOGGER.info(
+        f"Removing genome assets that have been copied " f"to the '{DATA_DIR}' directory."
+    )
+    for genome, genome_v in rgc[CFG_GENOMES_KEY].items():
+        d = os.path.join(rgc[CFG_FOLDER_KEY], genome_v[CFG_ALIASES_KEY][0])
+        shutil.rmtree(d)
+
+
+def swap_names_in_tree(top, new_name, old_name):
+    """
+    Rename all files and directories within a directory tree and the
+    directory itself
+
+    :param str top: path to the top of the tree to be renamed
+    :param str new_name: new name
+    :param str old_name: old name
+    :return bool: whether the renaming has been carried out
+    """
+
+    def _rename(x, rt):
+        os.rename(os.path.join(rt, x), os.path.join(rt, x.replace(old_name, new_name)))
+
+    if not os.path.isdir(top):
+        return False
+    for root, dirs, files in os.walk(top):
+        for dir in dirs:
+            _rename(dir, root)
+        for file in files:
+            _rename(file, root)
+    if os.path.split(top)[1] == old_name:
+        # rename the top of the tree only if it is named as old_name
+        os.rename(top, os.path.join(os.path.join(top, os.pardir), new_name))
+    return True
+
+
+def download_json(url, params=None):
+    """
+    Safely connect to the provided API endpoint and download JSON data.
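+
+    A minimal usage sketch (the URL below is illustrative, not a pinned
+    endpoint):
+
+        genomes = download_json("http://refgenomes.databio.org/v3/genomes")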
+ + :param str url: server API endpoint + :param dict params: query parameters + :return dict: served data + """ + + _LOGGER.debug(f"Downloading JSON data; querying URL: {url}") + resp = get(url, params=params) + if resp.ok: + return resp.json() + elif resp.status_code == 404: + resp = None + raise DownloadJsonError(resp) + + +def replace_str_in_obj(object, x, y): + """ + Replace strings in an object + + :param any object: object to replace strings in + :param str x: string to replace + :param str y: replacement + :return any: object with strings replaced + """ + _replace = partial(replace_str_in_obj, x=x, y=y) + obj = copy(object) + if isinstance(obj, dict): + for k, v in obj.items(): + obj[k] = _replace(v) + if isinstance(obj, list): + obj = [_replace(i) for i in obj] + if isinstance(object, str): + obj = object.replace(x, y) + return obj diff --git a/refgenconf/henge.py b/refgenconf/henge.py new file mode 100644 index 00000000..a9fec682 --- /dev/null +++ b/refgenconf/henge.py @@ -0,0 +1,402 @@ +""" An interface to a database back-end for DRUIDs """ + +import copy +import hashlib +import jsonschema +import logging +import os +import yacman +import yaml + + +# module constants +# seqcol separators definition, e.g. chr1>2342>k34m6vlksb35nb,chr2>234>8h4m6vlaaab31ng +DELIM_ATTR = ">" # separating attributes in an item (internal separator) +DELIM_ITEM = "," # separating items in a collection (external separator) +ITEM_TYPE = "_item_type" + + +_LOGGER = logging.getLogger(__name__) + + +class NotFoundException(Exception): + """Raised when a digest is not found""" + + def __init__(self, m): + self.message = "{} not found in database".format(m) + + def __str__(self): + return self.message + + +def md5(seq): + return hashlib.md5(seq.encode()).hexdigest() + + +class Henge(object): + def __init__(self, database, schemas, henges=None, checksum_function=md5): + """ + A user interface to insert and retrieve decomposable recursive unique + identifiers (DRUIDs). + + :param dict database: Dict-like lookup database for sequences and + hashes. + :param list schemas: One or more jsonschema schemas describing the + data types stored by this Henge + :param dict henges: One or more henge objects indexed by object name for + remote storing of items. + :param function(str) -> str checksum_function: Default function to + handle the digest of the serialized items stored in this henge. + """ + self.database = database + self.checksum_function = checksum_function + self.digest_version = checksum_function.__name__ + + if isinstance(schemas, dict): + _LOGGER.debug("Using old dict schemas") + populated_schemas = {} + for schema_key, schema_value in schemas.items(): + if isinstance(schema_value, str): + populated_schemas[schema_key] = yacman.load_yaml(schema_value) + self.schemas = populated_schemas + else: + populated_schemas = [] + for schema_value in schemas: + if isinstance(schema_value, str): + if os.path.isfile(schema_value): + populated_schemas.append(yacman.load_yaml(schema_value)) + else: + populated_schemas.append(yaml.safe_load(schema_value)) + split_schemas = {} + for s in populated_schemas: + split_schemas.update(split_schema(s)) + + self.schemas = split_schemas + + # Identify which henge to use for each item type. 
Default to self: + self.henges = {} + for item_type in self.item_types: + self.henges[item_type] = self + + # Next add in any remote henges for item types not stored in self: + if henges: + for item_type, henge in henges.items(): + if item_type not in self.item_types: + self.schemas[item_type] = henge.schemas[item_type] + self.henges[item_type] = henge + + def retrieve(self, druid, reclimit=None, raw=False): + """ + Retrieve an item given a digest + + :param str druid: The Decomposable recursive unique identifier (DRUID), or + digest that uniquely identifies that item to retrieve. + :param int reclimit: Recursion limit. Set to None for no limit (default). + :param bool raw: Return the value as a raw, henge-delimited string, instead + of processing into a mapping. Default: False. + """ + + def reconstruct_item(string, schema, reclimit): + if "type" in schema and schema["type"] == "array": + return [ + reconstruct_item(substr, schema["items"], reclimit) + for substr in string.split(DELIM_ITEM) + ] + elif schema["type"] == "object": + # else: # assume it's an object + attr_array = string.split(DELIM_ATTR) + item_reconstituted = dict(zip(schema["properties"].keys(), attr_array)) + _LOGGER.debug(schema) + if "recursive" in schema: + if isinstance(reclimit, int) and reclimit == 0: + return item_reconstituted + else: + if isinstance(reclimit, int): + reclimit = reclimit - 1 + for recursive_attr in schema["recursive"]: + if ( + item_reconstituted[recursive_attr] + and item_reconstituted[recursive_attr] != "" + ): + item_reconstituted[recursive_attr] = self.retrieve( + item_reconstituted[recursive_attr], reclimit, raw + ) + return item_reconstituted + else: # it must be a primitive + if "recursive" in schema: + if isinstance(reclimit, int) and reclimit == 0: + return string + else: + if isinstance(reclimit, int): + reclimit = reclimit - 1 + return self.retrieve(string, reclimit, raw) + else: + print("not recursive") + print(schema) + return string + + if not druid + ITEM_TYPE in self.database: + raise NotFoundException(druid) + + item_type = self.database[druid + ITEM_TYPE] + _LOGGER.debug("item_type: {}".format(item_type)) + henge_to_query = self.henges[item_type] + _LOGGER.debug("henge_to_query: {}".format(henge_to_query)) + try: + string = henge_to_query.database[druid] + except KeyError: + raise NotFoundException(druid) + + schema = self.schemas[item_type] + return reconstruct_item(string, schema, reclimit) + + @property + def item_types(self): + """ + A list of item types handled by this Henge instance + """ + return list(self.schemas.keys()) + + def select_item_type(self, item): + """ + Returns a list of all item types handled by this instance that validate + with the given item. + + :param dict item: The item you wish to validate type of. + """ + valid_schemas = [] + for name, schema in self.schemas.items(): + _LOGGER.debug("Testing schema: {}".format(name)) + try: + jsonschema.validate(item, schema) + valid_schemas.append(name) + except jsonschema.ValidationError: + continue + return valid_schemas + + def insert(self, item, item_type): + """ + Add structured items of a specified type to the database. + + :param list item: List of items to add. + :param str item_type: A string specifying the type of item. Must match + something from Henge.list_item_types. You can use + Henge.select_item_type to automatically choose this, if only one + fits. + """ + + if item_type not in self.schemas.keys(): + _LOGGER.error( + "I don't know about items of type '{}'. 
" + "I know of: '{}'".format(item_type, list(self.schemas.keys())) + ) + return False + + schema = self.schemas[item_type] + + if not schema: + return self.insert(item, item_type) + + if schema["type"] == "object": + flat_item = {} + for prop in item: + if prop in schema["properties"]: + if "recursive" in schema and prop in schema["recursive"]: + hclass = schema["properties"][prop]["henge_class"] + digest = self.insert(item[prop], hclass) + flat_item[prop] = digest + else: + flat_item[prop] = item[prop] + else: + pass # Ignore non-schema defined properties + elif schema["type"] == "array": + flat_item = [] + if schema["recursive"]: + digest = [] + hclass = schema["items"]["henge_class"] + for element in item: + digest.append(self.insert(element, hclass)) + flat_item = digest + else: + flat_item = item + + return self._insert_flat(flat_item, item_type) + + def _insert_flat(self, item, item_type=None): + """ + Add flattened items (of a specified type) to the database. + + Flattened items have removed all levels, so it's only attributes and + strict values; no nesting allowed. Use the upstream insert function + to insert full structured objects, which calls this function. + + :param list item: List of items to add. + :param str item_type: A string specifying the type of item. Must match + something from Henge.list_item_types. You can use + Henge.select_item_type to automatically choose this, if only one + fits. + """ + if item_type not in self.schemas.keys(): + _LOGGER.error( + "I don't know about items of type '{}'. " + "I know of: '{}'".format(item_type, list(self.schemas.keys())) + ) + return False + + # digest_version should be automatically appended to the item by the + # henge. if we can put a 'default' into the schema, then the henge + # should also populate any missing attributes with default values. can + # jsonschema do this automatically? + # also item_type ? + + def safestr(item, x): + try: + return str(item[x]) + except (ValueError, TypeError, KeyError): + return "" + + def build_attr_string(item, schema): + if "type" in schema and schema["type"] == "array": + return DELIM_ITEM.join( + [build_attr_string(x, schema["items"]) for x in item] + ) + elif schema["type"] == "object" and "properties" in schema: + # else: # assume it's an object + return DELIM_ATTR.join( + [safestr(item, x) for x in list(schema["properties"].keys())] + ) + else: # assume it's a primitive + return item + + valid_schema = self.schemas[item_type] + # Add defaults here ? + try: + jsonschema.validate(item, valid_schema) + except jsonschema.ValidationError as e: + _LOGGER.error("Not valid data") + _LOGGER.error("Attempting to insert item: {}".format(item)) + _LOGGER.error("Item type: {}".format(item_type)) + print(e) + return False + + attr_string = build_attr_string(item, valid_schema) + druid = self.checksum_function(attr_string) + self._henge_insert(druid, attr_string, item_type) + _LOGGER.debug("Loaded {}".format(druid)) + return druid + + def _henge_insert(self, druid, string, item_type, digest_version=None): + """ + Inserts an item into the database, with henge-metadata slots for item + type and digest version. + """ + if not digest_version: + digest_version = self.digest_version + + # Here we could do a few things; should we put this metadata into the + # interface henge or the henge where the storage actually occurs? it + # MUST be in the interface henge; should it also be in the storage + # henge? 
+        henge_to_query = self.henges[item_type]
+        _LOGGER.debug("henge_to_query: {}".format(henge_to_query))
+        try:
+            henge_to_query.database[druid] = string
+            henge_to_query.database[druid + ITEM_TYPE] = item_type
+            henge_to_query.database[druid + "_digest_version"] = digest_version
+
+            if henge_to_query != self:
+                self.database[druid + ITEM_TYPE] = item_type
+                self.database[druid + "_digest_version"] = digest_version
+        except Exception as e:
+            raise e
+
+    def clean(self):
+        """
+        Remove all items from this database.
+        """
+        # iterate over a snapshot of the keys so entries can be deleted safely
+        for k in list(self.database.keys()):
+            try:
+                del self.database[k]
+                del self.database[k + ITEM_TYPE]
+                del self.database[k + "_digest_version"]
+            except (KeyError, AttributeError):
+                pass
+
+    def show(self):
+        """
+        Show all items in the database.
+        """
+        for k, v in self.database.items():
+            print(k, v)
+
+    def __repr__(self):
+        repr = (
+            "Henge object\n"
+            + "Item types: "
+            + ",".join(self.item_types)
+            + "\n"
+            + "Schemas: "
+            + str(self.schemas)
+        )
+        return repr
+
+
+def split_schema(schema, name=None):
+    """
+    Splits a hierarchical schema into flat components suitable for a Henge
+    """
+    slist = {}
+    # base case
+    if schema["type"] not in ["object", "array"]:
+        _LOGGER.debug(schema)
+        if name:
+            slist[name] = schema
+        elif "henge_class" in schema:
+            slist[schema["henge_class"]] = schema
+        _LOGGER.debug("Returning slist: {}".format(str(slist)))
+        return slist
+    elif schema["type"] == "object":
+        if "henge_class" in schema:
+            schema_copy = copy.deepcopy(schema)
+            _LOGGER.debug("adding " + str(schema_copy["henge_class"]))
+            henge_class = schema_copy["henge_class"]
+            # del schema_copy['henge_class']
+            for p in schema_copy["properties"]:
+                hclass = None
+                if "henge_class" in schema_copy["properties"][p]:
+                    hclass = schema_copy["properties"][p]["henge_class"]
+                if schema_copy["properties"][p]["type"] in ["object", "array"]:
+                    schema_copy["properties"][p] = {"type": "string"}
+                if hclass:
+                    schema_copy["properties"][p][
+                        "henge_class"
+                    ] = hclass  # schema_copy['properties'][p]['type'] = "string"
+            # del schema_copy['properties']
+            slist[henge_class] = schema_copy
+
+        for p in schema["properties"]:
+            schema_sub = schema["properties"][p]
+            _LOGGER.debug("checking property:" + p)
+            slist.update(split_schema(schema["properties"][p]))
+    elif schema["type"] == "array":
+        _LOGGER.debug("found array")
+        if "henge_class" in schema:
+            schema_copy = copy.deepcopy(schema)
+            _LOGGER.debug("adding " + str(schema_copy["henge_class"]))
+            henge_class = schema_copy["henge_class"]
+            # del schema_copy['henge_class']
+            schema_copy["items"] = {"type": "string"}
+            if "recursive" in schema_copy:
+                schema_copy["items"]["recursive"] = True
+            if "henge_class" in schema["items"]:
+                schema_copy["items"]["henge_class"] = schema["items"]["henge_class"]
+                # schema_copy['items']['type'] = "string"
+                # if 'properties' in schema_copy['items']:
+                #     del schema_copy['items']['properties']
+            slist[henge_class] = schema_copy
+
+        schema_sub = schema["items"]
+        _LOGGER.debug("Checking item")
+        slist.update(split_schema(schema_sub))
+    return slist
diff --git a/refgenconf/progress_bar.py b/refgenconf/progress_bar.py
new file mode 100644
index 00000000..a1640273
--- /dev/null
+++ b/refgenconf/progress_bar.py
@@ -0,0 +1,53 @@
+from rich.progress import ProgressColumn, filesize
+from rich.text import Text
+from datetime import timedelta
+
+
+class _DownloadColumn(ProgressColumn):
+    """Renders file size downloaded and total, e.g.
'0.5/2.3 GB'."""
+
+    @staticmethod
+    def render(task):
+        """Calculate common unit for completed and total."""
+        completed = int(task.completed)
+        total = int(task.total)
+        unit, suffix = filesize.pick_unit_and_suffix(
+            total, ["bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"], 1024
+        )
+        completed_ratio = completed / unit
+        total_ratio = total / unit
+        precision = 0 if unit == 1 else 1
+        completed_str = f"{completed_ratio:,.{precision}f}"
+        total_str = f"{total_ratio:,.{precision}f}"
+        download_status = f"{completed_str}/{total_str} {suffix}"
+        download_text = Text(download_status, style="[bright_white]")
+        return download_text
+
+
+class _TransferSpeedColumn(ProgressColumn):
+    """Renders human readable transfer speed."""
+
+    @staticmethod
+    def render(task):
+        """Show data transfer speed."""
+        speed = task.speed
+        if speed is None:
+            return Text("?", style="[bright_white]")
+        data_speed = filesize.decimal(int(speed))
+        return Text(f"{data_speed}/s", style="[bright_white]")
+
+
+class _TimeRemainingColumn(ProgressColumn):
+    """Renders estimated time remaining."""
+
+    # Only refresh once a second to prevent jitter
+    max_refresh = 1
+
+    @staticmethod
+    def render(task):
+        """Show time remaining."""
+        remaining = task.time_remaining
+        if remaining is None:
+            return Text("-:--:--", style="[bright_white]")
+        remaining_delta = timedelta(seconds=int(remaining))
+        return Text(str(remaining_delta), style="[bright_white]")
diff --git a/refgenconf/refgenconf.py b/refgenconf/refgenconf.py
index b15ff68a..0e6563cd 100755
--- a/refgenconf/refgenconf.py
+++ b/refgenconf/refgenconf.py
@@ -1,7 +1,6 @@
 #!/usr/bin/env python
 
 import sys
-import urllib.request
 import itertools
 import logging
 import os
@@ -10,27 +9,47 @@ import shutil
 import json
 
-from collections import Iterable, Mapping, OrderedDict
+import yacman
+
+from urllib.request import urlopen, urlretrieve
+from urllib.error import HTTPError, ContentTooShortError
+from urllib.parse import urlencode
+from collections import OrderedDict
+from collections.abc import Iterable, Mapping
 from functools import partial
 from inspect import getfullargspec as finspect
-from urllib.error import HTTPError, ContentTooShortError
-from tqdm import tqdm
 from pkg_resources import iter_entry_points
-from tempfile import TemporaryDirectory
+from rich.table import Table
+from rich.progress import Progress, TextColumn, BarColumn
+from requests import ConnectionError
+from requests.exceptions import MissingSchema
+from jsonschema.exceptions import ValidationError
 
-from yacman import YacAttMap
+from .progress_bar import _DownloadColumn, _TimeRemainingColumn, _TransferSpeedColumn
+
+from .seqcol import SeqColClient
 from attmap import PathExAttMap as PXAM
-from ubiquerg import checksum, is_url, query_yes_no, untar, is_writable, \
-    parse_registry_path as prp
+from ubiquerg import (
+    checksum,
+    is_url,
+    query_yes_no,
+    untar,
+    is_writable,
+    parse_registry_path as prp,
+)
 
 from .const import *
-from .helpers import unbound_env_vars, asciify_json_dict, select_genome_config,\
-    get_dir_digest
+from .helpers import (
+    asciify_json_dict,
+    select_genome_config,
+    get_dir_digest,
+    download_json,
+)
 from .exceptions import *
 
 _LOGGER = logging.getLogger(__name__)
 
-__all__ = ["RefGenConf"]
+__all__ = ["RefGenConf", "upgrade_config"]
 
 
 def _handle_sigint(filepath):
@@ -43,14 +62,23 @@ def handle(sig, frame):
         else:
             _LOGGER.info("Incomplete file '{}' was removed".format(filepath))
         sys.exit(0)
+
     return handle
 
 
-class RefGenConf(YacAttMap):
+class RefGenConf(yacman.YacAttMap):
     """
    A sort of oracle of available reference genome assembly assets
    """

-    def __init__(self, filepath=None, entries=None, writable=False, wait_max=60,
-                 skip_read_lock=False):
+    def __init__(
+        self,
+        filepath=None,
+        entries=None,
+        writable=False,
+        wait_max=60,
+        skip_read_lock=False,
+        genome_exact=False,
+        schema_source=None,
+    ):
         """
         Create the config instance with a filepath or key-value pairs.
@@ -70,18 +98,16 @@ def __init__(self, filepath=None, entries=None, writable=False, wait_max=60,
         def _missing_key_msg(key, value):
             _LOGGER.debug("Config lacks '{}' key. Setting to: {}".format(key, value))
 
-        super(RefGenConf, self).__init__(filepath=filepath, entries=entries,
-                                         writable=writable, wait_max=wait_max,
-                                         skip_read_lock=skip_read_lock)
-        genomes = self.setdefault(CFG_GENOMES_KEY, PXAM())
-        if not isinstance(genomes, PXAM):
-            if genomes:
-                _LOGGER.warning("'{k}' value is a {t_old}, not a {t_new}; setting to empty {t_new}".
-                                format(k=CFG_GENOMES_KEY, t_old=type(genomes).__name__, t_new=PXAM.__name__))
-            self[CFG_GENOMES_KEY] = PXAM()
-        if CFG_FOLDER_KEY not in self:
-            self[CFG_FOLDER_KEY] = os.path.dirname(entries) if isinstance(entries, str) else os.getcwd()
-            _missing_key_msg(CFG_FOLDER_KEY, self[CFG_FOLDER_KEY])
+        super(RefGenConf, self).__init__(
+            filepath=filepath,
+            entries=entries,
+            writable=writable,
+            wait_max=wait_max,
+            skip_read_lock=skip_read_lock,
+            schema_source=schema_source or DEFAULT_CONFIG_SCHEMA,
+            write_validate=True,
+        )
+        # assert correct config version
         try:
             version = self[CFG_VERSION_KEY]
         except KeyError:
@@ -91,23 +117,47 @@ def _missing_key_msg(key, value):
         try:
             version = float(version)
         except ValueError:
-            _LOGGER.warning("Cannot parse config version as numeric: {}".format(version))
+            _LOGGER.warning(
+                "Cannot parse config version as numeric: {}".format(version)
+            )
         else:
             if version < REQ_CFG_VERSION:
-                msg = "This genome config (v{}) is not compliant with v{} standards. To use it, please downgrade " \
-                      "refgenie: 'pip install refgenie=={}'.".format(self[CFG_VERSION_KEY], str(REQ_CFG_VERSION),
-                                                                     REFGENIE_BY_CFG[str(version)])
+                msg = (
+                    "This genome config (v{}) is not compliant with v{} standards. \n"
+                    "To use current refgenconf, please use the upgrade_config function to upgrade, or "
\n" + "If refgenie is installed, you can use 'refgenie upgrade --target-version {}'".format( + self[CFG_VERSION_KEY], + str(REQ_CFG_VERSION), + REFGENIE_BY_CFG[str(version)], + REFGENIE_BY_CFG[str(REQ_CFG_VERSION)], + str(REQ_CFG_VERSION), + ) + ) raise ConfigNotCompliantError(msg) + else: _LOGGER.debug("Config version is compliant: {}".format(version)) + + # initialize "genomes_folder" + if CFG_FOLDER_KEY not in self: + self[CFG_FOLDER_KEY] = ( + os.path.dirname(filepath) if filepath else os.getcwd() + ) + _missing_key_msg(CFG_FOLDER_KEY, self[CFG_FOLDER_KEY]) + # initialize "genome_servers" if CFG_SERVERS_KEY not in self and CFG_SERVER_KEY in self: # backwards compatibility after server config key change self[CFG_SERVERS_KEY] = self[CFG_SERVER_KEY] del self[CFG_SERVER_KEY] - _LOGGER.debug("Moved servers list from '{}' to '{}'".format(CFG_SERVER_KEY, CFG_SERVERS_KEY)) + _LOGGER.debug( + f"Moved servers list from '{CFG_SERVER_KEY}' to '{CFG_SERVERS_KEY}'" + ) try: if isinstance(self[CFG_SERVERS_KEY], list): - tmp_list = [server_url.rstrip("/") for server_url in self[CFG_SERVERS_KEY]] + tmp_list = [ + server_url.rstrip("/") for server_url in self[CFG_SERVERS_KEY] + ] self[CFG_SERVERS_KEY] = tmp_list else: # Logic in pull_asset expects a list, even for a single server self[CFG_SERVERS_KEY] = self[CFG_SERVERS_KEY].rstrip("/") @@ -116,6 +166,28 @@ def _missing_key_msg(key, value): _missing_key_msg(CFG_SERVERS_KEY, str([DEFAULT_SERVER])) self[CFG_SERVERS_KEY] = [DEFAULT_SERVER] + # initialize "genomes" mapping + if CFG_GENOMES_KEY in self: + if not isinstance(self[CFG_GENOMES_KEY], PXAM): + if self[CFG_GENOMES_KEY]: + _LOGGER.warning( + "'{k}' value is a {t_old}, not a {t_new}; setting to empty {t_new}".format( + k=CFG_GENOMES_KEY, + t_old=type(self[CFG_GENOMES_KEY]).__name__, + t_new=PXAM.__name__, + ) + ) + self[CFG_GENOMES_KEY] = PXAM() + else: + self[CFG_GENOMES_KEY] = PXAM() + + self[CFG_GENOMES_KEY] = yacman.AliasedYacAttMap( + entries=self[CFG_GENOMES_KEY], + aliases=lambda x: {k: v.__getitem__(CFG_ALIASES_KEY) for k, v in x.items()}, + aliases_strict=True, + exact=genome_exact, + ) + def __bool__(self): minkeys = set(self.keys()) == set(RGC_REQ_KEYS) return not minkeys or bool(self[CFG_GENOMES_KEY]) @@ -131,7 +203,71 @@ def plugins(self): are names of all possible hooks and values are dicts mapping registered functions names to their values """ - return {h: {ep.name: ep.load() for ep in iter_entry_points('refgenie.hooks.' + h)} for h in HOOKS} + return { + h: {ep.name: ep.load() for ep in iter_entry_points("refgenie.hooks." 
 + h)}
+            for h in HOOKS
+        }
+
+    @property
+    def genome_aliases(self):
+        """
+        Mapping of human-readable genome identifiers to genome identifiers
+
+        :return dict: mapping of human-readable genome identifiers to genome
+            identifiers
+        """
+        return self.genomes[yacman.IK][yacman.ALIASES_KEY]
+
+    @property
+    def genome_aliases_table(self):
+        """
+        Table of genome identifiers and their human-readable aliases
+
+        :return rich.table.Table: table mapping genome identifiers to their
+            aliases
+        """
+        table = Table(title="Genome aliases")
+        table.add_column("genome")
+        table.add_column("alias")
+        if CFG_GENOMES_KEY not in self or not self[CFG_GENOMES_KEY]:
+            return table
+        for genome, genome_dict in self[CFG_GENOMES_KEY].items():
+            if (
+                CFG_ALIASES_KEY not in self[CFG_GENOMES_KEY][genome]
+                or not self[CFG_GENOMES_KEY][genome][CFG_ALIASES_KEY]
+            ):
+                aliases = ""
+            else:
+                aliases = ", ".join(self[CFG_GENOMES_KEY][genome][CFG_ALIASES_KEY])
+            table.add_row(genome, aliases)
+        return table
+
+    @property
+    def data_dir(self):
+        """
+        Path to the genome data directory
+
+        :return str: path to the directory where the assets are stored
+        """
+        return os.path.abspath(os.path.join(self[CFG_FOLDER_KEY], DATA_DIR))
+
+    @property
+    def alias_dir(self):
+        """
+        Path to the genome alias directory
+
+        :return str: path to the directory where the asset alias symlinks
+            are stored
+        """
+        return os.path.abspath(os.path.join(self[CFG_FOLDER_KEY], ALIAS_DIR))
+
+    @property
+    def file_path(self):
+        """
+        Path to the genome configuration file
+
+        :return str: path to the genome configuration file
+        """
+        return self[yacman.IK][yacman.FILEPATH_KEY]
 
     def initialize_config_file(self, filepath=None):
         """
@@ -142,13 +278,16 @@ def initialize_config_file(self, filepath=None):
         :raise OSError: in case the file could not be initialized due to insufficient permissions or pre-existence
         :raise TypeError: if no valid filepath can be determined
         """
+
         def _write_fail_err(reason):
             raise OSError("Can't initialize, {}: {} ".format(reason, filepath))
 
         filepath = select_genome_config(filepath, check_exist=False)
         if not isinstance(filepath, str):
-            raise TypeError("Could not determine a valid path to "
-                            "initialize a configuration file: {}".format(str(filepath)))
+            raise TypeError(
+                f"Could not determine a valid path to initialize a "
+                f"configuration file: {filepath}"
+            )
         if os.path.exists(filepath):
             _write_fail_err("file exists")
         if not is_writable(filepath, check_exist=False):
@@ -156,7 +295,12 @@ def _write_fail_err(reason):
         self.make_writable(filepath)
         self.write()
         self.make_readonly()
-        _LOGGER.info("Initialized genome configuration file: {}".format(filepath))
+        _LOGGER.info(f"Initialized genome configuration file: {filepath}")
+        os.makedirs(self.data_dir, exist_ok=True)
+        os.makedirs(self.alias_dir, exist_ok=True)
+        _LOGGER.info(
+            f"Created directories:\n - {self.data_dir}" f"\n - {self.alias_dir}"
+        )
         return filepath
 
     def list(self, genome=None, order=None, include_tags=False):
@@ -170,17 +314,114 @@ def list(self, genome=None, order=None, include_tags=False):
         collection of available asset names.
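+
+        A minimal usage sketch (genome name and output are illustrative):
+
+            rgc.list(genome="hg38", include_tags=True)
+            # OrderedDict([('hg38', ['fasta:default'])])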
""" self.run_plugins(PRE_LIST_HOOK) - refgens = _select_genomes(sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome) + refgens = self._select_genomes(genome=genome, order=order) if include_tags: self.run_plugins(POST_LIST_HOOK) return OrderedDict( - [(g, sorted(_make_asset_tags_product(self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY], ":"), key=order)) - for g in refgens]) + [ + ( + g, + sorted( + _make_asset_tags_product( + self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY], ":" + ), + key=order, + ), + ) + for g in refgens + ] + ) self.run_plugins(POST_LIST_HOOK) - return OrderedDict([(g, sorted(list(self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY].keys()), key=order)) - for g in refgens]) + return OrderedDict( + [ + ( + g, + sorted( + list(self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY].keys()), key=order + ), + ) + for g in refgens + ] + ) + + def get_asset_table( + self, + genomes=None, + server_url=None, + get_json_url=lambda s, i: construct_request_url(s, i, PRIVATE_API), + ): + """ + Get a rich.Table object representing assets available locally + + :param list[str] genomes: genomes to restrict the results with + :param str server_url: server URL to query for the remote genome data + :param function(str, str) -> str get_json_url: how to build URL from + genome server URL base, genome, and asset + :return rich.table.Table: table of assets available locally + """ + + def _fill_table_with_genomes_data(rgc, genomes_data, table, genomes=None): + it = "([italic]{}[/italic])" + if genomes: + table.add_column("genome") + table.add_column("asset " + it.format("seek_keys")) + table.add_column("tags") + for g in genomes: + genome = rgc.get_genome_alias_digest(alias=g, fallback=True) + if genome not in genomes_data: + _LOGGER.error(f"Genome {g} ({genome}) not found") + continue + genome_dict = genomes_data[genome] + for asset, asset_dict in genome_dict[CFG_ASSETS_KEY].items(): + tags = list(asset_dict[CFG_ASSET_TAGS_KEY].keys()) + seek_keys = list( + asset_dict[CFG_ASSET_TAGS_KEY][tags[0]][ + CFG_SEEK_KEYS_KEY + ].keys() + ) + table.add_row( + ", ".join(genome_dict[CFG_ALIASES_KEY]), + "{} ".format(asset) + it.format(", ".join(seek_keys)), + ", ".join(tags), + ) + else: + table.add_column("genome") + table.add_column("assets") + for genome in list(genomes_data.keys()): + genome_dict = genomes_data[genome] + table.add_row( + ", ".join(genome_dict[CFG_ALIASES_KEY]), + ", ".join(list(genome_dict[CFG_ASSETS_KEY].keys())), + ) + return table + + if server_url is None: + genomes_data = self[CFG_GENOMES_KEY] + title = ( + f"Local refgenie assets\nServer subscriptions: " + f"{', '.join(self[CFG_SERVERS_KEY])}" + ) + else: + genomes_data = download_json(get_json_url(server_url, API_ID_GENOMES_DICT)) + title = f"Remote refgenie assets\nServer URL: {server_url}" + c = ( + f"use refgenie list{'r' if server_url is not None else ''} " + f"-g for more detailed view" + if genomes is None + else "" + ) + return _fill_table_with_genomes_data( + self, genomes_data, Table(title=title, min_width=70, caption=c), genomes + ) - def assets_str(self, offset_text=" ", asset_sep=", ", genome_assets_delim="/ ", genome=None, order=None): + def assets_str( + self, + offset_text=" ", + asset_sep=", ", + genome_assets_delim="/ ", + genome=None, + order=None, + ): """ Create a block of text representing genome-to-asset mapping. 
@@ -195,10 +436,18 @@ def assets_str(self, offset_text="  ", asset_sep=", ", genome_assets_delim="/ ",
             names for sort
         :return str: text representing genome-to-asset mapping
         """
-        refgens = _select_genomes(sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome)
-        make_line = partial(_make_genome_assets_line, offset_text=offset_text, genome_assets_delim=genome_assets_delim,
-                            asset_sep=asset_sep, order=order)
-        return "\n".join([make_line(g, self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY]) for g in refgens])
+        refgens = self._select_genomes(genome=genome, order=order)
+        make_line = partial(
+            _make_genome_assets_line,
+            offset_text=offset_text,
+            genome_assets_delim=genome_assets_delim,
+            asset_sep=asset_sep,
+            order=order,
+            rjust=max(map(len, refgens) or [0]) + 2,
+        )
+        return "\n".join(
+            [make_line(g, self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY]) for g in refgens]
+        )
 
     def add(self, path, genome, asset, tag=None, seek_keys=None, force=False):
         """
@@ -212,31 +461,46 @@ def add(self, path, genome, asset, tag=None, seek_keys=None, force=False):
         :param dict seek_keys: seek keys to add
         :param bool force: whether to force existing asset overwrite
         """
+        try:
+            genome = self.get_genome_alias_digest(alias=genome, fallback=True)
+        except yacman.UndefinedAliasError:
+            _LOGGER.error(
+                "No digest defined for '{}'. Set an alias or pull an"
+                " asset to initialize.".format(genome)
+            )
+            return False
         tag = tag or self.get_default_tag(genome, asset)
         abspath = os.path.join(self[CFG_FOLDER_KEY], path)
         remove = False
         if not os.path.exists(abspath) or not os.path.isabs(abspath):
-            raise OSError("Provided path must exist and be relative to the"
-                          " genome_folder: {}".format(self[CFG_FOLDER_KEY]))
+            raise OSError(
+                "Provided path must exist and be relative to the"
+                " genome_folder: {}".format(self[CFG_FOLDER_KEY])
+            )
         try:
             _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag)
         except Exception:
             pass
         else:
-            if not force and not \
-                    query_yes_no("'{}/{}:{}' exists. Do you want to overwrite?".
-                                 format(genome, asset, tag)):
+            if not force and not query_yes_no(
+                "'{}/{}:{}' exists. Do you want to overwrite?".format(
+                    genome, asset, tag
+                )
+            ):
                 _LOGGER.info("Aborted by the user, asset not added")
                 return False
             remove = True
             _LOGGER.info("Will remove existing to overwrite")
         tag_data = {
             CFG_ASSET_PATH_KEY: path,
-            CFG_ASSET_CHECKSUM_KEY: get_dir_digest(path) or ""
+            CFG_ASSET_CHECKSUM_KEY: get_dir_digest(abspath) or "",
         }
         msg = "Added asset: {}/{}:{} {}".format(
-            genome, asset, tag, "" if not seek_keys else "with seek keys: {}".
- format(seek_keys)) + genome, + asset, + tag, + "" if not seek_keys else "with seek keys: {}".format(seek_keys), + ) if not self.file_path: if remove: self.cfg_remove_assets(genome, asset, tag) @@ -244,15 +508,125 @@ def add(self, path, genome, asset, tag=None, seek_keys=None, force=False): self.update_seek_keys(genome, asset, tag, seek_keys or {asset: "."}) self.set_default_pointer(genome, asset, tag) _LOGGER.info(msg) - return True - with self as rgc: - if remove: - rgc.cfg_remove_assets(genome, asset, tag) - rgc.update_tags(genome, asset, tag, tag_data) - rgc.update_seek_keys(genome, asset, tag, seek_keys or {asset: "."}) - rgc.set_default_pointer(genome, asset, tag) - _LOGGER.info(msg) - return True + else: + with self as rgc: + if remove: + rgc.cfg_remove_assets(genome, asset, tag) + rgc.update_tags(genome, asset, tag, tag_data) + rgc.update_seek_keys(genome, asset, tag, seek_keys or {asset: "."}) + rgc.set_default_pointer(genome, asset, tag) + _LOGGER.info(msg) + self._symlink_alias(genome, asset, tag) + return True + + def get_symlink_paths(self, genome, asset=None, tag=None, all_aliases=False): + """ + Get path to the alias directory for the selected genome-asset-tag + + :param str genome: reference genome ID + :param str asset: asset name + :param str tag: tag name + :param bool all_aliases: whether to return a collection of symbolic + links or just the first one from the alias list + :return dict: + """ + try: + defined_aliases = self.get_genome_alias( + genome, fallback=True, all_aliases=all_aliases + ) + except yacman.UndefinedAliasError: + return {} + alias = _make_list_of_str(defined_aliases) + if asset: + tag = tag or self.get_default_tag(genome, asset) + return { + a: os.path.join(self.alias_dir, a, asset, tag) + if asset + else os.path.join(self.alias_dir, a) + for a in alias + } + + def _symlink_alias( + self, genome, asset=None, tag=None, link_fun=lambda t, s: os.symlink(t, s) + ): + """ + Go through the files in the asset directory and recreate the asset + directory tree, but instead of copying files, create symbolic links + + :param str genome: reference genome ID + :param str asset: asset name + :param str tag: tag name + :param callable link_fun: function to use to link files, e.g os.symlink + or os.link + """ + + def _rpl(str): + """ + RePLace genome digest with human-readable genome ID, if exists + """ + return str.replace(genome_digest, alias) + + if not callable(link_fun) or len(finspect(link_fun).args) != 2: + raise TypeError( + "Linking function must be a two-arg function " "(target, destination)" + ) + created = [] + genome_digest = self.get_genome_alias_digest(genome, fallback=True) + if asset: + tag = tag or self.get_default_tag(genome, asset) + src_path = self.seek_src(genome, asset, tag, enclosing_dir=True) + else: + src_path = os.path.join(self.data_dir, genome_digest) + target_paths_mapping = self.get_symlink_paths( + genome_digest, asset, tag, all_aliases=True + ) + for alias, path in target_paths_mapping.items(): + if not os.path.exists(path): + os.makedirs(path, exist_ok=True) + for root, dirs, files in os.walk(src_path): + appendix = os.path.relpath(root, src_path) + for dir in dirs: + try: + os.makedirs(os.path.join(path, appendix, _rpl(dir))) + except FileExistsError: + continue + for file in files: + try: + rel = os.path.relpath( + os.path.join(root, file), os.path.join(path) + ) + link_fun(rel, os.path.join(path, appendix, _rpl(file))) + except FileExistsError: + _LOGGER.warning( + f"Could not create link, file exists: " + f"{os.path.join(path, 
appendix, _rpl(file))}" + ) + continue + created.append(path) + if created: + _LOGGER.info( + "Created alias directories: \n - {}".format("\n - ".join(created)) + ) + + @staticmethod + def _remove_symlink_alias(symlink_dict, aliases_to_remove): + """ + Remove the symlink directories + + :param list[str] | str aliases_to_remove: collection of aliases to + remove the symlink directories for + :param dict symlink_dict: a dictionary mapping alias names to the + respective symlink directories + """ + dirs_to_remove = [symlink_dict[k] for k in _make_list_of_str(aliases_to_remove)] + for d in dirs_to_remove: + shutil.rmtree(d) + if dirs_to_remove: + _LOGGER.info( + "Removed alias directories: \n - {}".format( + "\n - ".join(dirs_to_remove) + ) + ) def filepath(self, genome, asset, tag, ext=".tgz", dir=False): """ @@ -265,7 +639,7 @@ def filepath(self, genome, asset, tag, ext=".tgz", dir=False): :param bool dir: whether to return the enclosing directory instead of the file :return str: path to asset for given genome and asset kind/name """ - tag_dir = os.path.join(self[CFG_FOLDER_KEY], genome, asset, tag) + tag_dir = os.path.join(self.data_dir, genome, asset, tag) return os.path.join(tag_dir, asset + "__" + tag + ext) if not dir else tag_dir def genomes_list(self, order=None): @@ -275,7 +649,13 @@ def genomes_list(self, order=None): :return Iterable[str]: list of this configuration's reference genome assembly IDs """ - return sorted(list(self[CFG_GENOMES_KEY].keys()), key=order) + return sorted( + [ + self.get_genome_alias(x, fallback=True) + for x in self[CFG_GENOMES_KEY].keys() + ], + key=order, + ) def genomes_str(self, order=None): """ @@ -287,9 +667,101 @@ def genomes_str(self, order=None): """ return ", ".join(self.genomes_list(order)) - def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, - strict_exists=None, enclosing_dir=False, - check_exist=lambda p: os.path.exists(p) or is_url(p)): + def seek( + self, + genome_name, + asset_name, + tag_name=None, + seek_key=None, + strict_exists=None, + enclosing_dir=False, + all_aliases=False, + check_exist=lambda p: os.path.exists(p) or is_url(p), + ): + """ + Seek path to a specified genome-asset-tag alias + + :param str genome_name: name of a reference genome assembly of interest + :param str asset_name: name of the particular asset to fetch + :param str tag_name: name of the particular asset tag to fetch + :param str seek_key: name of the particular subasset to fetch + :param bool | NoneType strict_exists: how to handle case in which + path doesn't exist; True to raise IOError, False to raise + RuntimeWarning, and None to do nothing at all. + Default: None (do not check). + :param function(callable) -> bool check_exist: how to check for + asset/path existence + :param bool enclosing_dir: whether a path to the entire enclosing + directory should be returned, e.g. 
for a fasta asset that has 3 + seek_keys pointing to 3 files in an asset dir, that asset dir + is returned + :param bool all_aliases: whether to return paths to all asset aliases or + just the one for the specified 'genome_name` argument + :return str: path to the asset + :raise TypeError: if the existence check is not a one-arg function + :raise refgenconf.MissingGenomeError: if the named assembly isn't known + to this configuration instance + :raise refgenconf.MissingAssetError: if the names assembly is known to + this configuration instance, but the requested asset is unknown + """ + tag_name = tag_name or self.get_default_tag(genome_name, asset_name) + genome_digest = self.get_genome_alias_digest(genome_name, fallback=True) + genome_ids = _make_list_of_str( + self.get_genome_alias(genome_digest, fallback=True, all_aliases=True) + ) + idx = 0 + if genome_name in genome_ids: + idx = genome_ids.index(genome_name) + self._assert_gat_exists(genome_name, asset_name, tag_name) + asset_tag_data = self[CFG_GENOMES_KEY][genome_name][CFG_ASSETS_KEY][asset_name][ + CFG_ASSET_TAGS_KEY + ][tag_name] + if not seek_key: + if asset_name in asset_tag_data[CFG_SEEK_KEYS_KEY]: + seek_val = asset_tag_data[CFG_SEEK_KEYS_KEY][asset_name] + else: + seek_val = "" + else: + try: + seek_val = asset_tag_data[CFG_SEEK_KEYS_KEY][seek_key] + except KeyError: + raise MissingSeekKeyError( + f"Seek key '{seek_key}' not defined for: " + f"'{genome_name}.{asset_name}:{tag_name}'" + ) + if enclosing_dir: + seek_val = "" + fullpath = os.path.join( + self.alias_dir, genome_digest, asset_name, tag_name, seek_val + ) + fullpaths = [fullpath.replace(genome_digest, gid) for gid in genome_ids] + paths_existence = [check_exist(fp) for fp in fullpaths] + if all(paths_existence): + return fullpaths if all_aliases else fullpaths[idx] + nonexistent_pths = [ + fullpaths[p] for p in [i for i, x in enumerate(paths_existence) if not x] + ] + msg = "For genome '{}' alias to the asset '{}/{}:{}' doesn't exist: {}".format( + genome_name, asset_name, seek_key, tag_name, ", ".join(nonexistent_pths) + ) + if strict_exists is None: + _LOGGER.debug(msg) + if strict_exists is True: + raise OSError(msg) + else: + warnings.warn(msg, RuntimeWarning) + return fullpaths if all_aliases else fullpaths[idx] + + def seek_src( + self, + genome_name, + asset_name, + tag_name=None, + seek_key=None, + strict_exists=None, + enclosing_dir=False, + check_exist=lambda p: os.path.exists(p) or is_url(p), + ): """ Seek path to a specified genome-asset-tag @@ -299,11 +771,14 @@ def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, :param str seek_key: name of the particular subasset to fetch :param bool | NoneType strict_exists: how to handle case in which path doesn't exist; True to raise IOError, False to raise - RuntimeWarning, and None to do nothing at all. Default: None (do not check). + RuntimeWarning, and None to do nothing at all. + Default: None (do not check). :param function(callable) -> bool check_exist: how to check for asset/path existence - :param bool enclosing_dir: whether a path to the entire enclosing directory should be returned, e.g. - for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned + :param bool enclosing_dir: whether a path to the entire enclosing + directory should be returned, e.g. 
for a fasta asset that has 3 + seek_keys pointing to 3 files in an asset dir, that asset dir + is returned :return str: path to the asset :raise TypeError: if the existence check is not a one-arg function :raise refgenconf.MissingGenomeError: if the named assembly isn't known @@ -313,16 +788,23 @@ def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, """ tag_name = tag_name or self.get_default_tag(genome_name, asset_name) _LOGGER.debug( - "getting asset: '{}/{}.{}:{}'".format(genome_name, asset_name, seek_key, - tag_name)) + "getting asset: '{}/{}.{}:{}'".format( + genome_name, asset_name, seek_key, tag_name + ) + ) if not callable(check_exist) or len(finspect(check_exist).args) != 1: raise TypeError("Asset existence check must be a one-arg function.") # 3 'path' key options supported # option1: absolute path # get just the saute path value from the config path_val = _genome_asset_path( - self[CFG_GENOMES_KEY], genome_name, asset_name, tag_name, - enclosing_dir=True, no_tag=True, seek_key=None + self[CFG_GENOMES_KEY], + genome_name, + asset_name, + tag_name, + enclosing_dir=True, + no_tag=True, + seek_key=None, ) _LOGGER.debug("Trying absolute path: {}".format(path_val)) if seek_key: @@ -331,28 +813,52 @@ def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, path = path_val if os.path.isabs(path) and check_exist(path): return path + genome_name = self.get_genome_alias_digest(genome_name, fallback=True) # option2: relative to genome_folder/{genome} (default, canonical) - path = _genome_asset_path(self[CFG_GENOMES_KEY], genome_name, asset_name, - tag_name, seek_key, enclosing_dir) - fullpath = os.path.join(self[CFG_FOLDER_KEY], genome_name, path) - _LOGGER.debug("Trying relative to genome_folder/genome ({}/{}): {}". - format(self[CFG_FOLDER_KEY], genome_name, fullpath)) + path = _genome_asset_path( + self[CFG_GENOMES_KEY], + genome_name, + asset_name, + tag_name, + seek_key, + enclosing_dir, + ) + fullpath = os.path.join(self.data_dir, genome_name, path) + _LOGGER.debug( + "Trying relative to genome_folder/genome/_data ({}/{}/{}): {}".format( + self[CFG_FOLDER_KEY], genome_name, DATA_DIR, fullpath + ) + ) if check_exist(fullpath): return fullpath # option3: relative to the genome_folder (if option2 does not exist) gf_relpath = os.path.join( self[CFG_FOLDER_KEY], - _genome_asset_path(self[CFG_GENOMES_KEY], genome_name, asset_name, - tag_name, seek_key, enclosing_dir, no_tag=True) + _genome_asset_path( + self[CFG_GENOMES_KEY], + genome_name, + asset_name, + tag_name, + seek_key, + enclosing_dir, + no_tag=True, + ), + ) + _LOGGER.debug( + "Trying path relative to genome_folder ({}): {}".format( + self[CFG_FOLDER_KEY], gf_relpath + ) ) - _LOGGER.debug("Trying path relative to genome_folder ({}): {}". - format(self[CFG_FOLDER_KEY], gf_relpath)) if check_exist(gf_relpath): return gf_relpath - msg = "For genome '{}' the asset '{}.{}:{}' doesn't exist; tried: {}".\ - format(genome_name, asset_name, seek_key, tag_name, - ",".join([path, gf_relpath, fullpath])) + msg = "For genome '{}' the asset '{}.{}:{}' doesn't exist; tried: {}".format( + genome_name, + asset_name, + seek_key, + tag_name, + ", ".join([path, gf_relpath, fullpath]), + ) # return option2 if existence not enforced if strict_exists is None: _LOGGER.debug(msg) @@ -364,55 +870,91 @@ def seek(self, genome_name, asset_name, tag_name=None, seek_key=None, def get_default_tag(self, genome, asset, use_existing=True): """ - Determine the asset tag to use as default. 
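A sketch contrasting the two lookups, assuming a `RefGenConf` instance `rgc` and an illustrative `hg38` fasta asset pulled beforehand; `seek` resolves through the human-readable alias tree under `alias/`, while `seek_src` resolves the digest-named source under `data/`:

rgc.pull("hg38", "fasta", tag="default")  # fetch the asset from a subscribed server
alias_path = rgc.seek("hg38", "fasta", seek_key="fasta")  # .../alias/hg38/fasta/default/...
src_path = rgc.seek_src("hg38", "fasta", seek_key="fasta")  # .../data/<digest>/fasta/default/...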
The one indicated by the 'default_tag' key in the asset - section is returned. - If no 'default_tag' key is found, by default the first listed tag is returned with a RuntimeWarning. - This behavior can be turned off with use_existing=False + Determine the asset tag to use as default. The one indicated by + the 'default_tag' key in the asset section is returned. + If no 'default_tag' key is found, by default the first listed tag is returned + with a RuntimeWarning. This behavior can be turned off with use_existing=False :param str genome: name of a reference genome assembly of interest :param str asset: name of the particular asset of interest - :param bool use_existing: whether the first tag in the config should be returned in case there is no default - tag defined for an asset + :param bool use_existing: whether the first tag in the config should be + returned in case there is no default tag defined for an asset :return str: name of the tag to use as the default one """ try: - _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset) + self._assert_gat_exists(genome, asset) except RefgenconfError: - _LOGGER.info("Using '{}' as the default tag for '{}/{}'".format(DEFAULT_TAG, genome, asset)) + _LOGGER.info( + "Using '{}' as the default tag for '{}/{}'".format( + DEFAULT_TAG, genome, asset + ) + ) return DEFAULT_TAG try: - return self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_DEFAULT_TAG_KEY] + return self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_DEFAULT_TAG_KEY + ] except KeyError: - alt = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY].keys()[0] if use_existing\ + alt = ( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ].keys()[0] + if use_existing else DEFAULT_TAG + ) if isinstance(alt, str): if alt != DEFAULT_TAG: - warnings.warn("Could not find the '{}' key for asset '{}/{}'. " - "Used the first one in the config instead: '{}'. " - "Make sure it does not corrupt your workflow." - .format(CFG_ASSET_DEFAULT_TAG_KEY, genome, asset, alt), RuntimeWarning) + warnings.warn( + "Could not find the '{}' key for asset '{}/{}'. " + "Used the first one in the config instead: '{}'. " + "Make sure it does not corrupt your workflow.".format( + CFG_ASSET_DEFAULT_TAG_KEY, genome, asset, alt + ), + RuntimeWarning, + ) else: - warnings.warn("Could not find the '{}' key for asset '{}/{}'. " - "Returning '{}' instead. Make sure it does not corrupt your workflow." - .format(CFG_ASSET_DEFAULT_TAG_KEY, genome, asset, alt), RuntimeWarning) + warnings.warn( + "Could not find the '{}' key for asset '{}/{}'. Returning '{}' " + "instead. 
Make sure it does not corrupt your workflow.".format( + CFG_ASSET_DEFAULT_TAG_KEY, genome, asset, alt + ), + RuntimeWarning, + ) return alt except TypeError: - _raise_not_mapping(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], "Asset section ") + _raise_not_mapping( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], "Asset section " + ) - def set_default_pointer(self, genome, asset, tag, force=False): + def set_default_pointer(self, genome, asset, tag, force=False, force_digest=None): """ Point to the selected tag by default :param str genome: name of a reference genome assembly of interest :param str asset: name of the particular asset of interest :param str tag: name of the particular asset tag to point to by default - :param bool force: whether the default tag change should be forced (even if it exists) - """ - _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag) - if CFG_ASSET_DEFAULT_TAG_KEY not in self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] or \ - len(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_DEFAULT_TAG_KEY]) == 0 or force: - self.update_assets(genome, asset, {CFG_ASSET_DEFAULT_TAG_KEY: tag}) - _LOGGER.info("Default tag for '{}/{}' set to: {}".format(genome, asset, tag)) + :param str force_digest: digest to force update of. The alias will + not be converted to the digest, even if provided. + :param bool force: whether the default tag change should be + forced (even if it exists) + """ + self._assert_gat_exists(genome, asset, tag) + asset_dict = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] + if ( + CFG_ASSET_DEFAULT_TAG_KEY in asset_dict + and len(asset_dict[CFG_ASSET_DEFAULT_TAG_KEY]) > 0 + ): + if not force: + return + if asset == "fasta": + raise NotImplementedError( + "Can't change the default tag for fasta assets, " + "this would lead to genome identity issues" + ) + self.update_assets( + genome, asset, {CFG_ASSET_DEFAULT_TAG_KEY: tag}, force_digest=force_digest + ) + _LOGGER.info(f"Default tag for '{genome}/{asset}' set to: {tag}") def list_assets_by_genome(self, genome=None, order=None, include_tags=False): """ @@ -422,14 +964,20 @@ def list_assets_by_genome(self, genome=None, order=None, include_tags=False): if omitted, the full mapping from genome to asset names :param function(str) -> object order: how to key genome IDs and asset names for sort - :param bool include_tags: whether asset tags should be included in the returned dict + :param bool include_tags: whether asset tags should be included in the + returned dict :return Iterable[str] | Mapping[str, Iterable[str]]: collection of asset type names available for particular reference assembly if one is provided, else the full mapping between assembly ID and collection available asset type names """ - return self.list(genome, order, include_tags=include_tags)[genome] if genome is not None \ + if genome: + genome = self.get_genome_alias(digest=genome, fallback=True) + return ( + self.list(genome, order, include_tags=include_tags)[genome] + if genome is not None else self.list(order, include_tags=include_tags) + ) def list_genomes_by_asset(self, asset=None, order=None): """ @@ -444,9 +992,18 @@ def list_genomes_by_asset(self, asset=None, order=None): collection of assembly names for which the asset key is available will be returned. 
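The listing helpers now resolve digests to aliases before reporting; a brief sketch with illustrative genome and asset names:

rgc.genomes_list()  # e.g. ['hg38', 'mm10']
rgc.list_assets_by_genome("hg38")  # asset names available for one genome
rgc.list_genomes_by_asset("fasta")  # genomes that carry a fasta asset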
""" - return self._invert_genomes(order) if not asset else \ - sorted([g for g, data in self[CFG_GENOMES_KEY].items() - if asset in data.get(CFG_ASSETS_KEY)], key=order) + return ( + self._invert_genomes(order) + if not asset + else sorted( + [ + self.get_genome_alias(g, fallback=True) + for g, data in self[CFG_GENOMES_KEY].items() + if asset in data.get(CFG_ASSETS_KEY) + ], + key=order, + ) + ) def get_local_data_str(self, genome=None, order=None): """ @@ -459,20 +1016,25 @@ def get_local_data_str(self, genome=None, order=None): """ exceptions = [] if genome is not None: - if isinstance(genome, str): - genome = [genome] + genome = _make_list_of_str(genome) for g in genome: try: - _assert_gat_exists(self[CFG_GENOMES_KEY], g) + self._assert_gat_exists(gname=g) except MissingGenomeError as e: exceptions.append(e) if exceptions: raise MissingGenomeError(", ".join(map(str, exceptions))) - genomes_str = self.genomes_str(order=order) if genome is None \ - else ", ".join(_select_genomes(sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome)) - return genomes_str, self.assets_str(genome=genome, order=order) + return ( + ", ".join(self._select_genomes(genome=genome, order=order)), + self.assets_str(genome=genome, order=order), + ) - def get_remote_data_str(self, genome=None, order=None, get_url=lambda server, id: construct_request_url(server, id)): + def get_remote_data_str( + self, + genome=None, + order=None, + get_url=lambda server, id: construct_request_url(server, id), + ): """ List genomes and assets available remotely. @@ -485,11 +1047,17 @@ def get_remote_data_str(self, genome=None, order=None, get_url=lambda server, id """ warnings.warn( "Please use listr method instead; get_remote_data_str will be " - "removed in the next release.", category=DeprecationWarning + "removed in the next release.", + category=DeprecationWarning, ) return self.listr(genome, order, get_url) - def listr(self, genome=None, order=None, get_url=lambda server, id: construct_request_url(server, id), as_str=False): + def listr( + self, + genome=None, + get_url=lambda server, id: construct_request_url(server, id), + as_digests=False, + ): """ List genomes and assets available remotely on all servers the object subscribes to @@ -503,12 +1071,46 @@ def listr(self, genome=None, order=None, get_url=lambda server, id: construct_re keyed by genome keyed by source server endpoint """ data_by_server = {} + for url in self[CFG_SERVERS_KEY]: - url = get_url(url, API_ID_ASSETS) - data_by_server[url] = _list_remote(url, genome, order, as_str=as_str) + aliases_url = get_url(url, API_ID_ALIASES_DICT) + assets_url = get_url(url, API_ID_ASSETS) + if assets_url is None or aliases_url is None: + continue + + aliases_by_digest = download_json(aliases_url) + # convert the original, condensed mapping to a data structure with optimal time complexity + digests_by_alias = {} + for k, v in aliases_by_digest.items(): + for alias in v: + digests_by_alias[alias] = k + + genome_digests = None + genomes = genome if isinstance(genome, list) else [genome] + if genome is not None: + genome_digests = [ + g + if g in aliases_by_digest.keys() + else digests_by_alias.get(g, None) + for g in genomes + ] + if genome_digests is None: + _LOGGER.info(f"{genome} not found on server: {url}") + continue + + server_data = self._list_remote( + url=assets_url, + genome=genome_digests, + ) + data_by_server[assets_url] = ( + server_data + if as_digests + else {aliases_by_digest[k][0]: v for k, v in server_data.items()} + ) + return data_by_server - def tag(self, 
genome, asset, tag, new_tag, files=True): + def tag(self, genome, asset, tag, new_tag, files=True, force=False): """ Retags the asset selected by the tag with the new_tag. Prompts if default already exists and overrides upon confirmation. @@ -529,14 +1131,19 @@ def tag(self, genome, asset, tag, new_tag, files=True): :return bool: a logical indicating whether the tagging was successful """ self.run_plugins(PRE_TAG_HOOK) - ori_path = self.seek(genome, asset, tag, enclosing_dir=True, strict_exists=True) + ori_path = self.seek_src( + genome, asset, tag, enclosing_dir=True, strict_exists=True + ) + alias_ori_path = self.seek( + genome, asset, tag, enclosing_dir=True, strict_exists=True + ) new_path = os.path.abspath(os.path.join(ori_path, os.pardir, new_tag)) if self.file_path: with self as r: - if not r.cfg_tag_asset(genome, asset, tag, new_tag): + if not r.cfg_tag_asset(genome, asset, tag, new_tag, force): sys.exit(0) else: - if not self.cfg_tag_asset(genome, asset, tag, new_tag): + if not self.cfg_tag_asset(genome, asset, tag, new_tag, force): sys.exit(0) if not files: self.run_plugins(POST_TAG_HOOK) @@ -545,47 +1152,63 @@ def tag(self, genome, asset, tag, new_tag, files=True): if os.path.exists(new_path): _remove(new_path) os.rename(ori_path, new_path) + _LOGGER.info("Renamed directory: {}".format(new_path)) + self._symlink_alias(genome, asset, new_tag) + _remove(alias_ori_path) except FileNotFoundError: - _LOGGER.warning("Could not rename original asset tag directory '{}'" - " to the new one '{}'".format(ori_path, new_path)) + _LOGGER.warning( + "Could not rename original asset tag directory '{}'" + " to the new one '{}'".format(ori_path, new_path) + ) else: if self.file_path: with self as r: r.cfg_remove_assets(genome, asset, tag, relationships=False) else: self.cfg_remove_assets(genome, asset, tag, relationships=False) - _LOGGER.debug("Asset '{}/{}' tagged with '{}' has been removed from" - " the genome config".format(genome, asset, tag)) - _LOGGER.debug("Original asset has been moved from '{}' to '{}'". - format(ori_path, new_path)) + _LOGGER.debug( + "Asset '{}/{}' tagged with '{}' has been removed from" + " the genome config".format(genome, asset, tag) + ) + _LOGGER.debug( + "Original asset has been moved from '{}' to '{}'".format( + ori_path, new_path + ) + ) self.run_plugins(POST_TAG_HOOK) - def cfg_tag_asset(self, genome, asset, tag, new_tag): + def cfg_tag_asset(self, genome, asset, tag, new_tag, force=False): """ Retags the asset selected by the tag with the new_tag. Prompts if default already exists and overrides upon confirmation. - This method does not override the original asset entry in the RefGenConf object. It creates its copy and tags - it with the new_tag. - Additionally, if the retagged asset has any children their parent will be retagged as new_tag that was - introduced upon this method execution. + This method does not override the original asset entry in the + RefGenConf object. It creates its copy and tags it with the new_tag. + Additionally, if the retagged asset has any children their parent will + be retagged as new_tag that was introduced upon this method execution. 
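Retagging in one call, with illustrative names; `force=True` skips the confirmation prompts while parent/child relationship data is still updated and the alias symlinks are recreated:

rgc.tag("hg38", "bowtie2_index", tag="default", new_tag="primary", force=True)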
:param str genome: name of a reference genome assembly of interest :param str asset: name of particular asset of interest :param str tag: name of the tag that identifies the asset of interest :param str new_tag: name of particular the new tag + :param bool force: force any actions that require approval :raise ValueError: when the original tag is not specified :return bool: a logical indicating whether the tagging was successful """ - _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag) + self._assert_gat_exists(genome, asset, tag) asset_mapping = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] if tag is None: - raise ValueError("You must explicitly specify the tag of the asset " - "you want to reassign. Currently defined " - "tags for '{}/{}' are: {}".format(genome, asset, ", ".join(get_asset_tags(asset_mapping)))) + ts = ", ".join(get_asset_tags(asset_mapping)) + raise ValueError( + f"You must explicitly specify the tag of the asset" + f" you want to reassign. Currently defined tags " + f"for '{genome}/{asset}' are: {ts}" + ) if new_tag in asset_mapping[CFG_ASSET_TAGS_KEY]: - if not query_yes_no("You already have a '{}' asset tagged as '{}', do you wish to override?". - format(asset, new_tag)): + if not force and not query_yes_no( + f"You already have a '{asset}' asset tagged as " + f"'{new_tag}', do you wish to override?" + ): _LOGGER.info("Tag action aborted by the user") return children = [] @@ -595,64 +1218,127 @@ def cfg_tag_asset(self, genome, asset, tag, new_tag): if CFG_ASSET_PARENTS_KEY in asset_mapping[CFG_ASSET_TAGS_KEY][tag]: parents = asset_mapping[CFG_ASSET_TAGS_KEY][tag][CFG_ASSET_PARENTS_KEY] if len(children) > 0 or len(parents) > 0: - if not query_yes_no("The asset '{}/{}:{}' has {} children and {} parents. Refgenie will update the " - "relationship data. Do you want to proceed?".format(genome, asset, tag, len(children), - len(parents))): + if not force and not query_yes_no( + f"The asset '{genome}/{asset}:{tag}' has {len(children)} " + f"children and {len(parents)} parents. Refgenie will update" + f" the relationship data. Do you want to proceed?" + ): _LOGGER.info("Tag action aborted by the user") return False # updates children's parents - self._update_relatives_tags(genome, asset, tag, new_tag, children, update_children=False) + self._update_relatives_tags( + genome, asset, tag, new_tag, children, update_children=False + ) # updates parents' children - self._update_relatives_tags(genome, asset, tag, new_tag, parents, update_children=True) - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][new_tag] = \ - asset_mapping[CFG_ASSET_TAGS_KEY][tag] - if CFG_ASSET_DEFAULT_TAG_KEY in asset_mapping and asset_mapping[CFG_ASSET_DEFAULT_TAG_KEY] == tag: + self._update_relatives_tags( + genome, asset, tag, new_tag, parents, update_children=True + ) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][ + new_tag + ] = asset_mapping[CFG_ASSET_TAGS_KEY][tag] + if ( + CFG_ASSET_DEFAULT_TAG_KEY in asset_mapping + and asset_mapping[CFG_ASSET_DEFAULT_TAG_KEY] == tag + ): self.set_default_pointer(genome, asset, new_tag, force=True) self.cfg_remove_assets(genome, asset, tag) return True - def _update_relatives_tags(self, genome, asset, tag, new_tag, relatives, update_children): + def _update_relatives_tags( + self, genome, asset, tag, new_tag, relatives, update_children + ): """ - Internal method used for tags updating in the 'asset_parents' section in the list of children. 
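For a config-only retag that moves no files, `cfg_tag_asset` can be called under the write lock, mirroring how the class uses it internally (a sketch; names illustrative):

with rgc as r:  # lock the config file for writing
    r.cfg_tag_asset("hg38", "fasta", tag="default", new_tag="v1", force=True)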
+ Internal method used for tags updating in the 'asset_parents' section in the + list of children. :param str genome: name of a reference genome assembly of interest :param str asset: name of particular asset of interest :param str tag: name of the tag that identifies the asset of interest :param str new_tag: name of particular the new tag - :param list[str] relatives: relatives to be updated. Format: ["asset_name:tag", "asset_name1:tag1"] - :param bool update_children: whether the children of the selected relatives should be updated. + :param list[str] relatives: relatives to be updated. Format: ["asset_name:tag", + "asset_name1:tag1"] + :param bool update_children: whether the children of the selected relatives + should be updated. """ - relative_key = CFG_ASSET_CHILDREN_KEY if update_children else CFG_ASSET_PARENTS_KEY + relative_key = ( + CFG_ASSET_CHILDREN_KEY if update_children else CFG_ASSET_PARENTS_KEY + ) for r in relatives: - _LOGGER.debug("updating {} in '{}'".format("children" if update_children else "parents", r)) + _LOGGER.debug( + "updating {} in '{}'".format( + "children" if update_children else "parents", r + ) + ) r_data = prp(r) try: - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][CFG_ASSET_TAGS_KEY][r_data["tag"]] + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][ + CFG_ASSET_TAGS_KEY + ][r_data["tag"]] except KeyError: - _LOGGER.warning("The {} asset of '{}/{}' does not exist: {}". - format("parent" if update_children else "child", genome, asset, r)) + _LOGGER.warning( + "The {} asset of '{}/{}' does not exist: {}".format( + "parent" if update_children else "child", genome, asset, r + ) + ) continue updated_relatives = [] - if relative_key in \ - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][CFG_ASSET_TAGS_KEY][r_data["tag"]]: - relatives = \ - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][CFG_ASSET_TAGS_KEY][r_data["tag"]]\ - [relative_key] + if ( + relative_key + in self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][ + CFG_ASSET_TAGS_KEY + ][r_data["tag"]] + ): + relatives = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][ + r_data["item"] + ][CFG_ASSET_TAGS_KEY][r_data["tag"]][relative_key] for relative in relatives: ori_relative_data = prp(relative) - if ori_relative_data["item"] == asset and ori_relative_data["tag"] == tag: + ori_relative_data["namespace"] = self.get_genome_alias_digest( + alias=ori_relative_data["namespace"], fallback=True + ) + if ( + ori_relative_data["item"] == asset + and ori_relative_data["tag"] == tag + ): ori_relative_data["tag"] = new_tag - updated_relatives.append("{}/{}:{}".format(genome, asset, new_tag)) + updated_relatives.append( + "{}/{}:{}".format( + ori_relative_data["namespace"], asset, new_tag + ) + ) else: - updated_relatives.append("{}/{}:{}".format(ori_relative_data["namespace"], - ori_relative_data["item"], ori_relative_data["tag"])) - self.update_relatives_assets(genome, r_data["item"], r_data["tag"], updated_relatives, update_children) - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][CFG_ASSET_TAGS_KEY][r_data["tag"]]\ - [relative_key] = updated_relatives - - def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, size_cutoff=10, - get_json_url=lambda server, operation_id: construct_request_url(server, operation_id), - build_signal_handler=_handle_sigint): + updated_relatives.append( + "{}/{}:{}".format( + ori_relative_data["namespace"], + ori_relative_data["item"], + ori_relative_data["tag"], + ) + ) + 
self.update_relatives_assets( + genome, + r_data["item"], + r_data["tag"], + updated_relatives, + update_children, + ) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][ + CFG_ASSET_TAGS_KEY + ][r_data["tag"]][relative_key] = updated_relatives + + def pull( + self, + genome, + asset, + tag, + unpack=True, + force=None, + force_large=None, + size_cutoff=10, + get_json_url=lambda server, operation_id: construct_request_url( + server, operation_id + ), + build_signal_handler=_handle_sigint, + ): """ Download and possibly unpack one or more assets for a given ref gen. @@ -684,30 +1370,55 @@ def pull(self, genome, asset, tag, unpack=True, force=None, force_large=None, si a non-writable state """ self.run_plugins(PRE_PULL_HOOK) - missing_vars = unbound_env_vars(self[CFG_FOLDER_KEY]) - if missing_vars: - raise UnboundEnvironmentVariablesError(", ".join(missing_vars)) def _null_return(): self.run_plugins(POST_PULL_HOOK) return gat, None, None def _raise_unpack_error(): - raise NotImplementedError("Option to not extract tarballs is not yet supported.") + raise NotImplementedError( + "Option to not extract tarballs is not yet supported." + ) num_servers = 0 bad_servers = [] no_asset_json = [] + alias = genome + gat = [genome, asset, tag] if CFG_SERVERS_KEY not in self or self[CFG_SERVERS_KEY] is None: _LOGGER.error("You are not subscribed to any asset servers") return _null_return() - for server_url in self[CFG_SERVERS_KEY]: + + good_servers = [ + s for s in self[CFG_SERVERS_KEY] if get_json_url(s, API_ID_DIGEST) + ] + + _LOGGER.info(f"Compatible refgenieserver instances: {good_servers}") + + for server_url in good_servers: + try: + genome = self.get_genome_alias_digest(alias=alias) + except yacman.UndefinedAliasError: + _LOGGER.info(f"No local digest for genome alias: {genome}") + if not self.set_genome_alias( + genome=alias, servers=[server_url], create_genome=True + ): + continue + genome = self.get_genome_alias_digest(alias=alias) + num_servers += 1 try: - determined_tag = _download_json(get_json_url(server_url, API_ID_DEFAULT_TAG).format(genome=genome, asset=asset)) \ - if tag is None else tag + determined_tag = ( + download_json( + get_json_url(server_url, API_ID_DEFAULT_TAG).format( + genome=genome, asset=asset + ) + ) + if tag is None + else tag + ) except DownloadJsonError: - _LOGGER.warning("Could not retrieve JSON from: {}".format(server_url)) + _LOGGER.warning(f"Could not retrieve JSON from: {server_url}") bad_servers.append(server_url) continue else: @@ -715,77 +1426,103 @@ def _raise_unpack_error(): _LOGGER.debug("Determined tag: {}".format(determined_tag)) unpack or _raise_unpack_error() gat = [genome, asset, determined_tag] - url_asset_attrs = get_json_url(server_url, API_ID_ASSET_ATTRS).format(genome=genome, asset=asset) - url_genome_attrs = get_json_url(server_url, API_ID_GENOME_ATTRS).format(genome=genome) - url_archive = get_json_url(server_url, API_ID_ARCHIVE).format(genome=genome, asset=asset) + url_asset_attrs = get_json_url(server_url, API_ID_ASSET_ATTRS).format( + genome=genome, asset=asset + ) + url_genome_attrs = get_json_url(server_url, API_ID_GENOME_ATTRS).format( + genome=genome + ) + url_archive = get_json_url(server_url, API_ID_ARCHIVE).format( + genome=genome, asset=asset + ) try: - archive_data = _download_json(url_asset_attrs, params={"tag": determined_tag}) + archive_data = download_json( + url_asset_attrs, params={"tag": determined_tag} + ) except DownloadJsonError: no_asset_json.append(server_url) - if num_servers == len(self[CFG_SERVERS_KEY]): - 
_LOGGER.error("Asset '{}/{}:{}' not available on any of the following servers: {}". - format(genome, asset, determined_tag, ", ".join(self[CFG_SERVERS_KEY]))) + if num_servers == len(good_servers): + _LOGGER.error( + f"'{alias}/{asset}:{determined_tag}' not " + f"available on any of the following servers: " + f"{', '.join(self[CFG_SERVERS_KEY])}" + ) return _null_return() continue else: _LOGGER.debug("Determined server URL: {}".format(server_url)) - genome_archive_data = _download_json(url_genome_attrs) + genome_archive_data = download_json(url_genome_attrs) if sys.version_info[0] == 2: archive_data = asciify_json_dict(archive_data) # local directory that the asset data will be stored in tag_dir = os.path.dirname(self.filepath(*gat)) - # local directory the downloaded archive will be temporarily saved in - genome_dir_path = os.path.join(self[CFG_FOLDER_KEY], genome) - # local path to the temporarily saved archive - filepath = os.path.join(genome_dir_path, asset + "__" + determined_tag + ".tgz") + # local target path for the saved archive + tardir = os.path.join(self.data_dir, genome, asset) + tarpath = os.path.join(tardir, asset + "__" + determined_tag + ".tgz") # check if the genome/asset:tag exists and get request user decision if os.path.exists(tag_dir): + def preserve(): - _LOGGER.info("Preserving existing: {}".format(tag_dir)) + _LOGGER.info(f"Preserving existing: {tag_dir}") return _null_return() + if force is False: return preserve() elif force is None: - if not query_yes_no("Replace existing ({})?".format(tag_dir), "no"): + if not query_yes_no(f"Replace existing ({tag_dir})?", "no"): return preserve() else: - _LOGGER.debug("Overwriting: {}".format(tag_dir)) + _LOGGER.debug(f"Overwriting: {tag_dir}") else: - _LOGGER.debug("Overwriting: {}".format(tag_dir)) + _LOGGER.debug(f"Overwriting: {tag_dir}") # check asset digests local-server match for each parent - [self._chk_digest_if_avail(genome, x, server_url) - for x in archive_data[CFG_ASSET_PARENTS_KEY] if CFG_ASSET_PARENTS_KEY in archive_data] - - bundle_name = '{}/{}:{}'.format(*gat) + [ + self._chk_digest_if_avail( + genome, x, archive_data[CFG_ASSET_CHECKSUM_KEY] + ) + for x in archive_data[CFG_ASSET_PARENTS_KEY] + if CFG_ASSET_PARENTS_KEY in archive_data + ] + + bundle_name = "{}/{}:{}".format(*gat) archsize = archive_data[CFG_ARCHIVE_SIZE_KEY] - _LOGGER.debug("'{}' archive size: {}".format(bundle_name, archsize)) + _LOGGER.debug(f"'{bundle_name}' archive size: {archsize}") if not force_large and _is_large_archive(archsize, size_cutoff): if force_large is False: - _LOGGER.info("Skipping pull of {}/{}:{}; size: {}". - format(*gat, archsize)) + _LOGGER.info( + "Skipping pull of {}/{}:{}; size: {}".format(*gat, archsize) + ) return _null_return() - if not query_yes_no("This archive exceeds the size cutoff ({} > {:.1f}GB) " - "Do you want to proceed?".format(archsize, size_cutoff)): - _LOGGER.info("Skipping pull of {}/{}:{}; size: {}". - format(*gat, archsize)) + if not query_yes_no( + "This archive exceeds the size cutoff ({} > {:.1f}GB). 
" + "Do you want to proceed?".format(archsize, size_cutoff) + ): + _LOGGER.info( + "Skipping pull of {}/{}:{}; size: {}".format(*gat, archsize) + ) return _null_return() - if not os.path.exists(genome_dir_path): - _LOGGER.debug("Creating directory: {}".format(genome_dir_path)) - os.makedirs(genome_dir_path) + if not os.path.exists(tardir): + _LOGGER.debug(f"Creating directory: {tardir}") + os.makedirs(tardir) # Download the file from `url` and save it locally under `filepath`: - _LOGGER.info("Downloading URL: {}".format(url_archive)) + _LOGGER.info(f"Downloading URL: {url_archive}") try: - signal.signal(signal.SIGINT, build_signal_handler(filepath)) - _download_url_progress(url_archive, filepath, bundle_name, params={"tag": determined_tag}) + signal.signal(signal.SIGINT, build_signal_handler(tarpath)) + _download_url_progress( + url_archive, tarpath, bundle_name, params={"tag": determined_tag} + ) except HTTPError: - _LOGGER.error("Asset archive '{}/{}:{}' is missing on the server: {s}".format(*gat, s=server_url)) + _LOGGER.error( + "Asset archive '{}/{}:{}' is missing on the " + "server: {s}".format(*gat, s=server_url) + ) if server_url == self[CFG_SERVERS_KEY][-1]: # it this was the last server on the list, return return _null_return() @@ -796,58 +1533,320 @@ def preserve(): continue except ConnectionRefusedError as e: _LOGGER.error(str(e)) - _LOGGER.error("Server {}/{} refused download. " - "Check your internet settings". - format(server_url, API_VERSION)) + _LOGGER.error( + f"Server {server_url}/{API_VERSION} refused " + f"download. Check your internet settings" + ) return _null_return() except ContentTooShortError as e: _LOGGER.error(str(e)) - _LOGGER.error("'{}' download incomplete".format(bundle_name)) + _LOGGER.error(f"'{bundle_name}' download incomplete") return _null_return() else: - _LOGGER.info("Download complete: {}".format(filepath)) + _LOGGER.info(f"Download complete: {tarpath}") - new_checksum = checksum(filepath) + new_checksum = checksum(tarpath) old_checksum = archive_data and archive_data.get(CFG_ARCHIVE_CHECKSUM_KEY) if old_checksum and new_checksum != old_checksum: - _LOGGER.error("Downloaded archive ('{}') checksum mismatch: ({}, {})". 
-                    format(filepath, new_checksum, old_checksum))
+                _LOGGER.error(
+                    f"Downloaded archive ('{tarpath}') checksum "
+                    f"mismatch: ({new_checksum}, {old_checksum})"
+                )
                 return _null_return()
             else:
-                _LOGGER.debug("Matched checksum: '{}'".format(old_checksum))
-            # successfully downloaded and moved tarball; untar it
-            if unpack and filepath.endswith(".tgz"):
-                _LOGGER.info("Extracting asset tarball and saving to: {}".format(tag_dir))
-                with TemporaryDirectory(dir=genome_dir_path) as tmpdir:
-                    # here we suspect the unarchived asset to be an asset-named
-                    # directory with the asset data inside and we transfer it
-                    # to the tag-named subdirectory
-                    untar(filepath, tmpdir)
-                    if os.path.isdir(tag_dir):
-                        shutil.rmtree(tag_dir)
-                        _LOGGER.info("Removed existing directory: {}".format(tag_dir))
-                    shutil.move(os.path.join(tmpdir, asset), tag_dir)
-                    if os.path.isfile(filepath):
-                        os.remove(filepath)
+                _LOGGER.debug(f"Matched checksum: '{old_checksum}'")
+            # successfully downloaded tarball; untar it
+            if unpack and tarpath.endswith(".tgz"):
+                _LOGGER.info(f"Extracting asset tarball: {tarpath}")
+                untar(tarpath, tardir)
+                os.remove(tarpath)
             if self.file_path:
                 with self as rgc:
-                    [rgc.chk_digest_update_child(gat[0], x, "{}/{}:{}".format(*gat), server_url)
-                     for x in archive_data[CFG_ASSET_PARENTS_KEY] if CFG_ASSET_PARENTS_KEY in archive_data]
-                    rgc.update_tags(*gat, data={attr: archive_data[attr]
-                                                for attr in ATTRS_COPY_PULL if attr in archive_data})
+                    [
+                        rgc.chk_digest_update_child(
+                            gat[0], x, "{}/{}:{}".format(*gat), server_url
+                        )
+                        for x in archive_data[CFG_ASSET_PARENTS_KEY]
+                        if CFG_ASSET_PARENTS_KEY in archive_data
+                    ]
+                    rgc.update_tags(
+                        *gat,
+                        data={
+                            attr: archive_data[attr]
+                            for attr in ATTRS_COPY_PULL
+                            if attr in archive_data
+                        },
+                    )
                     rgc.set_default_pointer(*gat)
                     rgc.update_genomes(genome=genome, data=genome_archive_data)
             else:
-                [self.chk_digest_update_child(gat[0], x, "{}/{}:{}".format(*gat), server_url)
-                 for x in archive_data[CFG_ASSET_PARENTS_KEY] if CFG_ASSET_PARENTS_KEY in archive_data]
-                self.update_tags(*gat, data={attr: archive_data[attr]
-                                             for attr in ATTRS_COPY_PULL if attr in archive_data})
+                [
+                    self.chk_digest_update_child(
+                        gat[0], x, "{}/{}:{}".format(*gat), server_url
+                    )
+                    for x in archive_data[CFG_ASSET_PARENTS_KEY]
+                    if CFG_ASSET_PARENTS_KEY in archive_data
+                ]
+                self.update_tags(
+                    *gat,
+                    data={
+                        attr: archive_data[attr]
+                        for attr in ATTRS_COPY_PULL
+                        if attr in archive_data
+                    },
+                )
                 self.set_default_pointer(*gat)
                 self.update_genomes(genome=genome, data=genome_archive_data)
+            if asset == "fasta":
+                self.initialize_genome(
+                    fasta_path=self.seek_src(*gat), alias=alias, fasta_unzipped=True
+                )
             self.run_plugins(POST_PULL_HOOK)
+            self._symlink_alias(*gat)
             return gat, archive_data, server_url

+    def get_genome_alias_digest(self, alias, fallback=False):
+        """
+        Get the genome digest for a human-readable alias
+
+        :param str alias: alias to find digest for
+        :param bool fallback: whether to return the query alias in case
+            of failure and in case it is one of the digests
+        :return str: genome digest
+        :raise UndefinedAliasError: if the specified alias has not been
+            assigned to any digest
+        """
+        try:
+            return self[CFG_GENOMES_KEY].get_key(alias=alias)
+        except (yacman.UndefinedAliasError, AttributeError):
+            if not fallback:
+                raise
+            if alias in self.genome_aliases.values():
+                return alias
+            raise
+
+    def get_genome_alias(self, digest, fallback=False, all_aliases=False):
+        """
+        Get the human readable alias for a genome digest
+
+        :param str digest: digest to find human-readable alias for
+        :param bool fallback: whether to return the query digest in case
+            of failure
+        :param bool all_aliases: whether to return all aliases instead of just
+            the first one
+        :return str | list[str]: human-readable aliases
+        :raise GenomeConfigFormatError: if "genome_digests" section does
+            not exist in the config
+        :raise UndefinedAliasError: if no alias has been defined for the
+            requested digest
+        """
+        try:
+            res = self[CFG_GENOMES_KEY].get_aliases(key=digest)
+            return res if all_aliases else res[0]
+        except (yacman.UndefinedAliasError, AttributeError):
+            if not fallback:
+                raise
+            if digest in self.genome_aliases.keys():
+                return digest
+            raise
+
+    def remove_genome_aliases(self, digest, aliases=None):
+        """
+        Remove aliases for a specified genome digest. This method will remove the
+        digest both from the genomes object and from the aliases mapping
+        in the config
+
+        :param str digest: genome digest to remove an alias for
+        :param list[str] aliases: a collection of aliases to remove for the
+            genome. If not provided, all aliases for the digest will be removed
+        :return list[str]: aliases that were removed
+        """
+
+        def _check_and_remove_alias(rgc, d, a):
+            """
+            Remove genome alias only if the alias can be removed successfully and
+            the genome exists
+            """
+            if rgc[CFG_GENOMES_KEY]:
+                rmd = rgc[CFG_GENOMES_KEY].remove_aliases(key=d, aliases=a)
+                if not rmd:
+                    return rmd
+                try:
+                    rgc[CFG_GENOMES_KEY][d][CFG_ALIASES_KEY] = rgc[
+                        CFG_GENOMES_KEY
+                    ].get_aliases(d)
+                except KeyError:
+                    return []
+                except yacman.UndefinedAliasError:
+                    rgc[CFG_GENOMES_KEY][d][CFG_ALIASES_KEY] = []
+                return rmd
+
+        # get the symlink mapping before the removal for _remove_symlink_alias
+        symlink_mapping = self.get_symlink_paths(genome=digest, all_aliases=True)
+        if self.file_path:
+            with self as r:
+                removed_aliases = _check_and_remove_alias(r, digest, aliases)
+        else:
+            removed_aliases = _check_and_remove_alias(self, digest, aliases)
+        if not removed_aliases:
+            return [], []
+        self._remove_symlink_alias(symlink_mapping, removed_aliases)
+        return removed_aliases
+
+    def set_genome_alias(
+        self,
+        genome,
+        digest=None,
+        servers=None,
+        overwrite=False,
+        reset_digest=False,
+        create_genome=False,
+        no_write=False,
+        get_json_url=lambda server: construct_request_url(server, API_ID_ALIAS_DIGEST),
+    ):
+        """
+        Assign a human-readable alias to a genome identifier.
+
+        Genomes are identified by a unique identifier which is derived from the
+        FASTA file (part of the fasta asset). This way we can ensure genome
+        provenance and compatibility with the server. This function maps a
+        human-readable identifier to make referring to the genomes easier.
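Both directions of the alias mapping in a short sketch (alias names illustrative):

digest = rgc.get_genome_alias_digest("hg38", fallback=True)  # alias -> digest
aliases = rgc.get_genome_alias(digest, all_aliases=True)  # digest -> all aliases
rgc.remove_genome_aliases(digest, aliases=["hg38_old"])  # drops the alias and its symlink tree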
+ + :param str genome: name of the genome to assign to an identifier + :param str digest: identifier to use + :param bool overwrite: whether all the previously set aliases should be + removed and just the current one stored + :param bool no_write: whether to skip writing the alias to the file + :return bool: whether the alias has been established + """ + + def _check_and_set_alias(rgc, d, a, create=False): + """ + Set genome alias only if the key alias can be set successfully and + genome exists or genome creation is forced + """ + try: + _assert_gat_exists(rgc[CFG_GENOMES_KEY], gname=digest) + except MissingGenomeError: + if not create: + raise + rgc[CFG_GENOMES_KEY][d] = PXAM() + + sa, ra = rgc[CFG_GENOMES_KEY].set_aliases( + aliases=a, key=d, overwrite=overwrite, reset_key=reset_digest + ) + try: + rgc[CFG_GENOMES_KEY][d][CFG_ALIASES_KEY] = rgc[ + CFG_GENOMES_KEY + ].get_aliases(d) + except KeyError: + return [], [] + _LOGGER.info( + f"Set genome alias ({d}: {', '.join(a) if isinstance(a, list) else a})" + ) + return sa, ra + + if not digest: + if isinstance(genome, list): + if len(genome) > 1: + raise NotImplementedError("Can look up just one digest at a time") + else: + genome = genome[0] + cnt = 0 + if servers is None: + servers = self[CFG_SERVERS_KEY] + for server in servers: + cnt += 1 + url_alias_template = get_json_url(server=server) + if url_alias_template is None: + continue + url_alias = url_alias_template.format(alias=genome) + _LOGGER.info( + "Setting '{}' identity with server: {}".format(genome, url_alias) + ) + try: + digest = download_json(url_alias) + except DownloadJsonError: + if cnt == len(servers): + _LOGGER.error( + "Genome '{}' not available on any of the following " + "servers: {}".format(genome, ", ".join(servers)) + ) + return False + continue + _LOGGER.info( + "Determined server digest for local genome alias ({}): {}".format( + genome, digest + ) + ) + break + + # get the symlink mapping before the removal for _remove_symlink_alias + symlink_mapping = self.get_symlink_paths(genome=digest, all_aliases=True) + if self.file_path and not no_write: + with self as r: + set_aliases, removed_aliases = _check_and_set_alias( + rgc=r, d=digest, a=genome, create=create_genome + ) + else: + set_aliases, removed_aliases = _check_and_set_alias( + rgc=self, d=digest, a=genome, create=create_genome + ) + if not set_aliases: + return False + self._remove_symlink_alias(symlink_mapping, removed_aliases) + self._symlink_alias(genome=digest) + return True + + def initialize_genome( + self, fasta_path, alias, fasta_unzipped=False, skip_alias_write=False + ): + """ + Initialize a genome + + Create a JSON file with Annotated Sequence Digests (ASDs) + for the FASTA file in the genome directory. 
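Two common calls in a sketch (names illustrative): resolve the digest from a subscribed server, or attach an extra alias to a digest already known locally.

rgc.set_genome_alias(genome="hg19")  # digest looked up on subscribed servers
digest = rgc.get_genome_alias_digest("hg38", fallback=True)  # reuse a known digest
rgc.set_genome_alias(genome="GRCh38", digest=digest)  # add another alias for it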
+ + :param str fasta_path: path to a FASTA file to initialize genome with + :param str alias: alias to set for the genome + :param bool skip_alias_write: whether to skip writing the alias to the file + :return str, list[dict[]]: human-readable name for the genome + """ + _LOGGER.info("Initializing genome: {}".format(alias)) + if not os.path.isfile(fasta_path): + raise FileNotFoundError( + "Can't initialize genome; FASTA file does " + "not exist: {}".format(fasta_path) + ) + ssc = SeqColClient({}) + d, _ = ssc.load_fasta(fasta_path, gzipped=not fasta_unzipped) + # retrieve annotated sequence digests list to save in a JSON file + asdl = ssc.retrieve(druid=d) + pth = self.get_asds_path(d) + os.makedirs(os.path.dirname(pth), exist_ok=True) + with open(pth, "w") as jfp: + json.dump(asdl, jfp) + _LOGGER.debug("Saved ASDs to JSON: {}".format(pth)) + self.set_genome_alias( + genome=alias, + digest=d, + overwrite=True, + create_genome=True, + no_write=skip_alias_write, + ) + return d, asdl + + def get_asds_path(self, genome): + """ + Get path to the Annotated Sequence Digests JSON file for a given genome. + Note that the path and/or genome may not exist. + + :param str genome: genome name + :return str: ASDs path + """ + return os.path.join(self.data_dir, genome, f"{genome}__ASDs.json") + def remove_asset_from_relatives(self, genome, asset, tag): """ Remove any relationship links associated with the selected asset @@ -856,42 +1855,65 @@ def remove_asset_from_relatives(self, genome, asset, tag): :param str asset: asset to be removed from its relatives' relatives list :param str tag: tag to be removed from its relatives' relatives list """ - to_remove = "{}/{}:{}".format(genome, asset, tag) + to_remove = "{}/{}:{}".format( + self.get_genome_alias_digest(alias=genome, fallback=True), asset, tag + ) for rel_type in CFG_ASSET_RELATIVES_KEYS: - tmp = CFG_ASSET_RELATIVES_KEYS[len(CFG_ASSET_RELATIVES_KEYS) - 1 - CFG_ASSET_RELATIVES_KEYS.index(rel_type)] - tag_data = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag] + tmp = CFG_ASSET_RELATIVES_KEYS[ + len(CFG_ASSET_RELATIVES_KEYS) + - 1 + - CFG_ASSET_RELATIVES_KEYS.index(rel_type) + ] + tag_data = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag] if rel_type not in tag_data: continue for rel in tag_data[rel_type]: parsed = prp(rel) _LOGGER.debug("Removing '{}' from '{}' {}".format(to_remove, rel, tmp)) try: - self[CFG_GENOMES_KEY][parsed["namespace"] or genome][CFG_ASSETS_KEY][parsed["item"]]\ - [CFG_ASSET_TAGS_KEY][parsed["tag"]][tmp].remove(to_remove) + self[CFG_GENOMES_KEY][parsed["namespace"] or genome][ + CFG_ASSETS_KEY + ][parsed["item"]][CFG_ASSET_TAGS_KEY][parsed["tag"]][tmp].remove( + to_remove + ) except (KeyError, ValueError): pass - def update_relatives_assets(self, genome, asset, tag=None, data=None, children=False): + def update_relatives_assets( + self, genome, asset, tag=None, data=None, children=False + ): """ - A convenience method which wraps the update assets and uses it to update the asset relatives of an asset. + A convenience method which wraps the update assets and uses it to update the + asset relatives of an asset. 
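Initializing from a local FASTA file in a sketch (file path and alias illustrative); the returned digest also locates the ASDs JSON:

digest, asds = rgc.initialize_genome(
    fasta_path="hg38.fa", alias="hg38", fasta_unzipped=True
)
print(rgc.get_asds_path(digest))  # <data_dir>/<digest>/<digest>__ASDs.json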
:param str genome: genome to be added/updated :param str asset: asset to be added/updated :param str tag: tag to be added/updated :param list data: asset parents to be added/updated - :param bool children: a logical indicating whether the relationship to be added is 'children' + :param bool children: a logical indicating whether the relationship to be + added is 'children' :return RefGenConf: updated object """ tag = tag or self.get_default_tag(genome, asset) relationship = CFG_ASSET_CHILDREN_KEY if children else CFG_ASSET_PARENTS_KEY if _check_insert_data(data, list, "data"): - self.update_tags(genome, asset, tag) # creates/asserts the genome/asset:tag combination - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag].setdefault(relationship, list()) - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag][relationship] = \ - _extend_unique(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag] - [relationship], data) - - def update_seek_keys(self, genome, asset, tag=None, keys=None): + # creates/asserts the genome/asset:tag combination + self.update_tags(genome, asset, tag) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][ + tag + ].setdefault(relationship, list()) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][ + tag + ][relationship] = _extend_unique( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag][relationship], + data, + ) + + def update_seek_keys(self, genome, asset, tag=None, keys=None, force_digest=None): """ A convenience method which wraps the updated assets and uses it to update the seek keys for a tagged asset. @@ -899,19 +1921,20 @@ def update_seek_keys(self, genome, asset, tag=None, keys=None): :param str genome: genome to be added/updated :param str asset: asset to be added/updated :param str tag: tag to be added/updated + :param str force_digest: digest to force update of. The alias will + not be converted to the digest, even if provided. :param Mapping keys: seek_keys to be added/updated :return RefGenConf: updated object """ tag = tag or self.get_default_tag(genome, asset) if _check_insert_data(keys, Mapping, "keys"): - self.update_tags(genome, asset, tag) + self.update_tags(genome, asset, tag, force_digest=force_digest) asset = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] - _safe_setdef(asset[CFG_ASSET_TAGS_KEY][tag], CFG_SEEK_KEYS_KEY, - PXAM()) + _safe_setdef(asset[CFG_ASSET_TAGS_KEY][tag], CFG_SEEK_KEYS_KEY, PXAM()) asset[CFG_ASSET_TAGS_KEY][tag][CFG_SEEK_KEYS_KEY].update(keys) return self - def update_tags(self, genome, asset=None, tag=None, data=None): + def update_tags(self, genome, asset=None, tag=None, data=None, force_digest=None): """ Updates the genomes in RefGenConf object at any level. If a requested genome-asset-tag mapping is missing, it will be created @@ -919,48 +1942,69 @@ def update_tags(self, genome, asset=None, tag=None, data=None): :param str genome: genome to be added/updated :param str asset: asset to be added/updated :param str tag: tag to be added/updated + :param str force_digest: digest to force update of. The alias will + not be converted to the digest, even if provided. 
:param Mapping data: data to be added/updated :return RefGenConf: updated object """ if _check_insert_data(genome, str, "genome"): + genome = force_digest or self.get_genome_alias_digest( + alias=genome, fallback=True + ) _safe_setdef(self[CFG_GENOMES_KEY], genome, PXAM()) if _check_insert_data(asset, str, "asset"): - _safe_setdef(self[CFG_GENOMES_KEY][genome], CFG_ASSETS_KEY, - PXAM()) - _safe_setdef(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], - asset, PXAM()) + _safe_setdef(self[CFG_GENOMES_KEY][genome], CFG_ASSETS_KEY, PXAM()) + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset, PXAM() + ) if _check_insert_data(tag, str, "tag"): - _safe_setdef(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY] - [asset], CFG_ASSET_TAGS_KEY, PXAM()) - _safe_setdef(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY] - [asset][CFG_ASSET_TAGS_KEY], tag, PXAM()) + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], + CFG_ASSET_TAGS_KEY, + PXAM(), + ) + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ], + tag, + PXAM(), + ) if _check_insert_data(data, Mapping, "data"): - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag].update(data) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag].update(data) return self - def update_assets(self, genome, asset=None, data=None): + def update_assets(self, genome, asset=None, data=None, force_digest=None): """ Updates the genomes in RefGenConf object at any level. If a requested genome-asset mapping is missing, it will be created :param str genome: genome to be added/updated :param str asset: asset to be added/updated + :param str force_digest: digest to force update of. The alias will + not be converted to the digest, even if provided. :param Mapping data: data to be added/updated :return RefGenConf: updated object """ if _check_insert_data(genome, str, "genome"): + genome = force_digest or self.get_genome_alias_digest( + alias=genome, fallback=True + ) _safe_setdef(self[CFG_GENOMES_KEY], genome, PXAM()) if _check_insert_data(asset, str, "asset"): - _safe_setdef(self[CFG_GENOMES_KEY][genome], CFG_ASSETS_KEY, - PXAM()) - _safe_setdef(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], - asset, PXAM()) + _safe_setdef(self[CFG_GENOMES_KEY][genome], CFG_ASSETS_KEY, PXAM()) + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset, PXAM() + ) if _check_insert_data(data, Mapping, "data"): - self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset].\ - update(data) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset].update(data) return self - def remove(self, genome, asset, tag=None, relationships=True, files=True, force=False): + def remove( + self, genome, asset, tag=None, relationships=True, files=True, force=False + ): """ Remove data associated with a specified genome:asset:tag combination. If no tags are specified, the entire asset is removed from the genome. @@ -982,26 +2026,42 @@ def remove(self, genome, asset, tag=None, relationships=True, files=True, force= """ tag = tag or self.get_default_tag(genome, asset, use_existing=False) if files: - req_dict = {"genome": genome, "asset": asset, "tag": tag} + req_dict = { + "genome": self.get_genome_alias_digest(genome, fallback=True), + "asset": asset, + "tag": tag, + } _LOGGER.debug("Attempting removal: {}".format(req_dict)) - if not force and \ - not query_yes_no("Remove '{genome}/{asset}:{tag}'?". 
- format(**req_dict)): + if not force and not query_yes_no( + "Remove '{}/{}:{}'?".format(genome, asset, tag) + ): _LOGGER.info("Action aborted by the user") return removed = [] - asset_path = self.seek(genome, asset, tag, enclosing_dir=True, strict_exists=False) + asset_path = self.seek_src( + genome, asset, tag, enclosing_dir=True, strict_exists=False + ) + alias_asset_paths = self.seek( + genome, + asset, + tag, + enclosing_dir=True, + strict_exists=False, + all_aliases=True, + ) if os.path.exists(asset_path): removed.append(_remove(asset_path)) + removed.extend([_remove(p) for p in alias_asset_paths]) if self.file_path: with self as r: r.cfg_remove_assets(genome, asset, tag, relationships) else: self.cfg_remove_assets(genome, asset, tag, relationships) else: - _LOGGER.warning("Selected asset does not exist on disk ({}). " - "Removing from genome config.". - format(asset_path)) + _LOGGER.warning( + "Selected asset does not exist on disk ({}). " + "Removing from genome config.".format(asset_path) + ) if self.file_path: with self as r: r.cfg_remove_assets(genome, asset, tag, relationships) @@ -1012,15 +2072,25 @@ def remove(self, genome, asset, tag=None, relationships=True, files=True, force= try: self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] except (KeyError, TypeError): - asset_dir = os.path.abspath( - os.path.join(asset_path, os.path.pardir)) + asset_dir = os.path.abspath(os.path.join(asset_path, os.path.pardir)) + alias_asset_dirs = [ + os.path.abspath(os.path.join(p, os.path.pardir)) + for p in alias_asset_paths + ] _entity_dir_removal_log(asset_dir, "asset", req_dict, removed) + removed.extend([_remove(p) for p in alias_asset_dirs]) try: self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY] except (KeyError, TypeError): - genome_dir = \ - os.path.abspath(os.path.join(asset_dir, os.path.pardir)) + genome_dir = os.path.abspath( + os.path.join(asset_dir, os.path.pardir) + ) + alias_genome_dirs = [ + os.path.abspath(os.path.join(p, os.path.pardir)) + for p in alias_asset_dirs + ] _entity_dir_removal_log(genome_dir, "genome", req_dict, removed) + removed.extend([_remove(p) for p in alias_genome_dirs]) try: if self.file_path: with self as r: @@ -1030,9 +2100,11 @@ def remove(self, genome, asset, tag=None, relationships=True, files=True, force= except (KeyError, TypeError): _LOGGER.debug( "Could not remove genome '{}' from the config; it " - "does not exist".format(genome)) - _LOGGER.info("Successfully removed entities:\n- {}". - format("\n- ".join(removed))) + "does not exist".format(genome) + ) + _LOGGER.info( + "Successfully removed entities:\n- {}".format("\n- ".join(removed)) + ) else: if self.file_path: with self as r: @@ -1058,6 +2130,7 @@ def cfg_remove_assets(self, genome, asset, tag=None, relationships=True): :raise TypeError: if genome argument type is not a list or str :return RefGenConf: updated object """ + def _del_if_empty(obj, attr, alt=None): """ Internal function for Mapping attribute deleting. 
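A short sketch of the removal entry point above; passing `force=True` bypasses the `query_yes_no` confirmation, and `files=False` prunes only the config entry (names invented, `rgc` as before):

```python
# Delete the asset's files (digest- and alias-based paths) plus its config entry:
rgc.remove("hg38", "bowtie2_index", tag="default", force=True)

# Or keep files on disk and only prune the configuration:
rgc.remove("hg38", "salmon_index", tag="default", files=False, force=True)
```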
@@ -1081,35 +2154,52 @@ def _del_if_empty(obj, attr, alt=None): if _check_insert_data(tag, str, "tag"): if relationships: self.remove_asset_from_relatives(genome, asset, tag) - del self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag] - _del_if_empty(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], CFG_ASSET_TAGS_KEY, - [self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset]) + del self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag] + _del_if_empty( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], + CFG_ASSET_TAGS_KEY, + [self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset], + ) _del_if_empty(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset) - _del_if_empty(self[CFG_GENOMES_KEY][genome], CFG_ASSETS_KEY, [self[CFG_GENOMES_KEY], genome]) + _del_if_empty( + self[CFG_GENOMES_KEY][genome], + CFG_ASSETS_KEY, + [self[CFG_GENOMES_KEY], genome], + ) _del_if_empty(self[CFG_GENOMES_KEY], genome) try: - default_tag = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_DEFAULT_TAG_KEY] + default_tag = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][ + asset + ][CFG_ASSET_DEFAULT_TAG_KEY] except KeyError: pass else: if default_tag == tag: - del self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_DEFAULT_TAG_KEY] + del self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_DEFAULT_TAG_KEY + ] if len(self[CFG_GENOMES_KEY]) == 0: self[CFG_GENOMES_KEY] = None return self - def update_genomes(self, genome, data=None): + def update_genomes(self, genome, data=None, force_digest=None): """ Updates the genomes in RefGenConf object at any level. If a requested genome is missing, it will be added :param str genome: genome to be added/updated + :param str force_digest: digest to force update of. The alias will + not be converted to the digest, even if provided. :param Mapping data: data to be added/updated :return RefGenConf: updated object """ if _check_insert_data(genome, str, "genome"): - _safe_setdef(self[CFG_GENOMES_KEY], genome, - PXAM({CFG_ASSETS_KEY: PXAM()})) + genome = force_digest or self.get_genome_alias_digest( + alias=genome, fallback=True + ) + _safe_setdef(self[CFG_GENOMES_KEY], genome, PXAM({CFG_ASSETS_KEY: PXAM()})) if _check_insert_data(data, Mapping, "data"): self[CFG_GENOMES_KEY][genome].update(data) return self @@ -1118,19 +2208,22 @@ def _update_genome_servers(self, url, reset=False): """ Update the list of genome_servers. - Use reset argument to overwrite the current list. Otherwise the current one will be appended to. + Use reset argument to overwrite the current list. Otherwise the current + one will be appended to. :param list[str] | str url: url(s) to update the genome_servers list with :param bool reset: whether the current list should be overwritten """ - urls = _make_list_of_str(url) if CFG_SERVERS_KEY in self: - if reset: - self[CFG_SERVERS_KEY] = _extend_unique([], urls) - else: - self[CFG_SERVERS_KEY] = _extend_unique(self[CFG_SERVERS_KEY], urls) + self[CFG_SERVERS_KEY] = _extend_unique( + [] if reset else self[CFG_SERVERS_KEY], _make_list_of_str(url) + ) else: - raise GenomeConfigFormatError("The '{}' is missing. Can't update the server list".format(CFG_SERVERS_KEY)) + raise GenomeConfigFormatError( + "The '{}' is missing. 
Can't update the server list".format( + CFG_SERVERS_KEY + ) + ) def subscribe(self, urls, reset=False): """ @@ -1162,7 +2255,9 @@ def unsubscribe(self, urls): ori_servers.remove(s) unsub_list.append(s) except ValueError: - _LOGGER.warning("URL '{}' not in genome_servers list: {}".format(s, ori_servers)) + _LOGGER.warning( + "URL '{}' not in genome_servers list: {}".format(s, ori_servers) + ) if self.file_path: with self as r: r._update_genome_servers(ori_servers, reset=True) @@ -1183,16 +2278,17 @@ def getseq(self, genome, locus, as_str=False): :return str | pyfaidx.FastaRecord | pyfaidx.Sequence: selected sequence """ import pyfaidx - fa = pyfaidx.Fasta(self.seek(genome, "fasta", strict_exists=True)) + + fa = pyfaidx.Fasta(self.seek_src(genome, "fasta", strict_exists=True)) locus_split = locus.split(":") chr = fa[locus_split[0]] if len(locus_split) == 1: return str(chr) if as_str else chr start, end = locus_split[1].split("-") - _LOGGER.debug("chr: '{}', start: '{}', end: '{}'". - format(locus_split[0], start, end)) - return str(chr[int(start):int(end)]) \ - if as_str else chr[int(start):int(end)] + _LOGGER.debug( + "chr: '{}', start: '{}', end: '{}'".format(locus_split[0], start, end) + ) + return str(chr[int(start) : int(end)]) if as_str else chr[int(start) : int(end)] def get_genome_attributes(self, genome): """ @@ -1202,8 +2298,11 @@ def get_genome_attributes(self, genome): :param str genome: genome to get the attributes dict for :return Mapping[str, str]: available genome attributes """ - return {k: self[CFG_GENOMES_KEY][genome][k] - for k in CFG_GENOME_ATTRS_KEYS if k in self[CFG_GENOMES_KEY][genome]} + return { + k: self[CFG_GENOMES_KEY][genome][k] + for k in CFG_GENOME_ATTRS_KEYS + if k in self[CFG_GENOMES_KEY][genome] + } def is_asset_complete(self, genome, asset, tag): """ @@ -1215,11 +2314,13 @@ def is_asset_complete(self, genome, asset, tag): :param str tag: tag to be checked :return bool: the decision """ - tag_data = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][tag] + tag_data = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag] return all([r in tag_data for r in REQ_TAG_ATTRS]) def _invert_genomes(self, order=None): - """ Map each asset type/kind/name to a collection of assemblies. + """Map each asset type/kind/name to a collection of assemblies. A configuration file encodes assets by genome, but in some use cases it's helpful to invert the direction of this mapping. The value of the @@ -1240,42 +2341,42 @@ def _invert_genomes(self, order=None): assets = sorted(genomes.keys(), key=order) return OrderedDict([(a, sorted(genomes[a], key=order)) for a in assets]) - def _chk_digest_if_avail(self, genome, remote_asset_name, server_url): + def _chk_digest_if_avail(self, genome, remote_asset_name, remote_digest): """ - Check local asset digest against the remote one and populate children of the asset with the provided asset:tag. + Check local asset digest against the remote one and populate children of the + asset with the provided asset:tag. 
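The `getseq` method above accepts either a bare sequence name or a `chr:start-end` locus; a brief usage sketch (genome alias and coordinates invented, `rgc` as before):

```python
# Requires pyfaidx and a local fasta asset for the genome.
record = rgc.getseq("hg38", "chr1")                     # pyfaidx record
piece = rgc.getseq("hg38", "chr1:1-100", as_str=True)   # plain string slice
```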
-    In case the local asset does not exist, the config is populated with the remote asset digest and children data
+        In case the local asset does not exist, the config is populated with the
+        remote asset digest and children data

         :param str genome: name of the genome to check the asset digests for
         :param str remote_asset_name: asset and tag names, formatted like: asset:tag
-        :param str server_url: addres of the server to query for the digests
+        :param str remote_digest: remote digest to check the local asset against
         :raise RefgenconfError: if the local digest does not match its remote counterpart
         """
         remote_asset_data = prp(remote_asset_name)
         asset = remote_asset_data["item"]
         tag = remote_asset_data["tag"]
-        asset_digest_url = construct_request_url(server_url, API_ID_DIGEST).\
-            format(genome=genome, asset=asset, tag=tag)
-        try:
-            remote_digest = _download_json(asset_digest_url)
-        except DownloadJsonError:
-            _LOGGER.warning("Parent asset ({}/{}:{}) not found on the server. The asset provenance was not verified.".
-                            format(genome, asset, tag))
-            return
         try:
             local_digest = self.id(genome, asset, tag)
             if remote_digest != local_digest:
                 raise RemoteDigestMismatchError(asset, local_digest, remote_digest)
         except RefgenconfError:
-            _LOGGER.debug("Could not find '{}/{}:{}' digest. Digest for this parent will be populated "
-                          "with the server one after the pull".format(genome, asset, tag))
+            _LOGGER.debug(
+                f"Could not find '{genome}/{asset}:{tag}' digest. Digest for this "
+                f"parent will be populated with the server one after the pull"
+            )
             return

-    def chk_digest_update_child(self, genome, remote_asset_name, child_name, server_url):
+    def chk_digest_update_child(
+        self, genome, remote_asset_name, child_name, server_url
+    ):
         """
-        Check local asset digest against the remote one and populate children of the asset with the provided asset:tag.
+        Check local asset digest against the remote one and populate children of the
+        asset with the provided asset:tag. 
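The provenance check above boils down to comparing the locally recorded digest with one obtained from a server; a hedged sketch using the `id` method defined just below, with `remote_digest` standing in for a server-provided value:

```python
from refgenconf.exceptions import RemoteDigestMismatchError

remote_digest = "..."  # placeholder for a digest fetched from a refgenieserver
local_digest = rgc.id("hg38", "fasta", "default")
if remote_digest != local_digest:
    raise RemoteDigestMismatchError("fasta", local_digest, remote_digest)
```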
- In case the local asset does not exist, the config is populated with the remote asset digest and children data + In case the local asset does not exist, the config is populated with the remote + asset digest and children data :param str genome: name of the genome to check the asset digests for :param str remote_asset_name: asset and tag names, formatted like: asset:tag @@ -1286,27 +2387,41 @@ def chk_digest_update_child(self, genome, remote_asset_name, child_name, server_ remote_asset_data = prp(remote_asset_name) asset = remote_asset_data["item"] tag = remote_asset_data["tag"] - asset_digest_url = construct_request_url(server_url, API_ID_DIGEST).\ - format(genome=genome, asset=asset, tag=tag) + asset_digest_url = construct_request_url(server_url, API_ID_DIGEST).format( + genome=genome, asset=asset, tag=tag + ) try: - remote_digest = _download_json(asset_digest_url) + remote_digest = download_json(asset_digest_url) except DownloadJsonError: return try: - # we need to allow for missing seek_keys section so that the digest is respected even from the previously - # populated 'incomplete asset' from the server - _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag, - allow_incomplete=not self.is_asset_complete(genome, asset, tag)) + # we need to allow for missing seek_keys section so that the digest is + # respected even from the previously populated 'incomplete asset' from + # the server + self._assert_gat_exists( + genome, + asset, + tag, + allow_incomplete=not self.is_asset_complete(genome, asset, tag), + ) except (KeyError, MissingAssetError, MissingGenomeError, MissingSeekKeyError): - self.update_tags(genome, asset, tag, {CFG_ASSET_CHECKSUM_KEY: remote_digest}) - _LOGGER.info("Could not find '{}/{}:{}' digest. Populating with server data".format(genome, asset, tag)) + self.update_tags( + genome, asset, tag, {CFG_ASSET_CHECKSUM_KEY: remote_digest} + ) + _LOGGER.info( + f"Could not find '{genome}/{asset}:{tag}' digest. " + f"Populating with server data" + ) else: - local_digest = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY] \ - [tag][CFG_ASSET_CHECKSUM_KEY] + local_digest = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag][CFG_ASSET_CHECKSUM_KEY] if remote_digest != local_digest: raise RemoteDigestMismatchError(asset, local_digest, remote_digest) finally: - self.update_relatives_assets(genome, asset, tag, [child_name], children=True) + self.update_relatives_assets( + genome, asset, tag, [child_name], children=True + ) def id(self, genome, asset, tag=None): """ @@ -1318,13 +2433,55 @@ def id(self, genome, asset, tag=None): :param str tag: tag identifier :return str: asset digest for the tag """ - _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag) + self._assert_gat_exists(genome, asset, tag) tag = tag or self.get_default_tag(genome, asset) a = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] if CFG_ASSET_CHECKSUM_KEY in a[CFG_ASSET_TAGS_KEY][tag]: return a[CFG_ASSET_TAGS_KEY][tag][CFG_ASSET_CHECKSUM_KEY] - raise MissingConfigDataError("Digest does not exist for: {}/{}:{}". - format(genome, asset, tag)) + raise MissingConfigDataError( + "Digest does not exist for: {}/{}:{}".format(genome, asset, tag) + ) + + def compare(self, genome1, genome2, explain=False): + """ + Check genomes compatibility level. 
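As quick orientation for the `compare` method whose body follows, a usage sketch; both genomes must have been initialized so their ASD JSON files exist locally, and the aliases are invented:

```python
# Returns an int compatibility code; explain=True also prints a breakdown.
code = rgc.compare("hg38", "hg38_primary", explain=True)
```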
+
+        Compares Annotated Sequence Digests (ASDs) -- digested sequences and
+        metadata
+        :param str genome1: name of the first genome to compare
+        :param str genome2: name of the second genome to compare
+        :param bool explain: whether the returned code explanation should
+            be displayed
+        :return int: compatibility code
+        """
+
+        def _get_asds_for_genome(rgc, genome):
+            """
+            Read JSON file containing ASDs for a specified genome
+            :param refgenconf.RefGenConf rgc: object to find the genome for
+            :param str genome: genome to find the file for
+            :return list[dict]: list of ASDs, ready to compare
+            """
+            g = rgc.get_genome_alias(genome, fallback=True)
+            error_msg = (
+                f"File containing Annotated Sequence Digests (ASDs) not "
+                f"found for genome: {g}. Must pull or build '{g}/fasta' again to "
+                f"check the compatibility."
+            )
+            try:
+                rgc.seek_src(genome, "fasta", strict_exists=True)
+            except MissingSeekKeyError:
+                raise MissingSeekKeyError(error_msg)
+            json_file = rgc.get_asds_path(genome)
+            if not os.path.exists(json_file):
+                raise OSError(error_msg)
+            with open(json_file, "r") as jfp:
+                return json.load(jfp)
+
+        return SeqColClient({}).compare_asds(
+            _get_asds_for_genome(self, self.get_genome_alias_digest(genome1, True)),
+            _get_asds_for_genome(self, self.get_genome_alias_digest(genome2, True)),
+            explain=explain,
+        )

     def run_plugins(self, hook):
         """
@@ -1342,52 +2499,312 @@ def write(self, filepath=None):
         If pre- and post-update plugins are defined, they will be executed automatically

         :param str filepath: a file path to write to
-        :raise OSError: when the object has been created in a read only mode or other process has locked the file
+        :raise OSError: when the object has been created in a read only mode or other
+            process has locked the file
         :raise TypeError: when the filepath cannot be determined.
-            This takes place only if YacAttMap initialized with a Mapping as an input, not read from file.
+            This takes place only if YacAttMap was initialized with a Mapping as an
+            input, not read from file.
         :raise OSError: when the write is called on an object with no write capabilities
             or when writing to a file that is locked by a different object
         :return str: the path to the created files
         """
         self.run_plugins(PRE_UPDATE_HOOK)
-        path = super(RefGenConf, self).write(filepath=filepath)
+        try:
+            path = super(RefGenConf, self).write(filepath=filepath, exclude_case=True)
+        except ValidationError:
+            _LOGGER.error("The changes were not written to the file")
+            raise
         self.run_plugins(POST_UPDATE_HOOK)
         return path

-
-class DownloadProgressBar(tqdm):
-    """
-    from: https://github.com/tqdm/tqdm#hooks-and-callbacks
-    """
-    def update_to(self, b=1, bsize=1, tsize=None):
+    def _genome_asset_path(self, gname, aname, tname, seek_key, enclosing_dir):
+        """
+        Retrieve the raw path value for a particular asset for a particular genome.
+
+        :param Mapping[str, Mapping[str, Mapping[str, object]]] genomes: nested
+            collection of key-value pairs, keyed at top level on genome ID, then by
+            asset name, then by asset attribute
+        :param str gname: top level key to query -- genome ID, e.g. mm10
+        :param str aname: second-level key to query -- asset name, e.g. fasta
+        :param str tname: third-level key to query -- tag name, e.g. default
+        :param str seek_key: fourth-level key to query -- tag name, e.g. chrom_sizes
+        :param bool enclosing_dir: whether a path to the entire enclosing directory
+            should be returned, e.g. for a fasta asset that has 3 seek_keys pointing
+            to 3 files in an asset dir, that asset dir is returned
+        :return str: raw path value for a particular asset for a particular genome
+        :raise MissingGenomeError: if the given key-value pair collection does not
+            contain as a top-level key the given genome ID
+        :raise MissingAssetError: if the given key-value pair collection does
+            contain the given genome ID, but that key's mapping doesn't contain
+            the given asset name as a key
+        :raise GenomeConfigFormatError: if it's discovered during the query that
+            the structure of the given genomes mapping suggests that it was
+            parsed from an improperly formatted/structured genome config file.
+        """
+        self._assert_gat_exists(gname, aname, tname)
+        asset_tag_data = self[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY][aname][
+            CFG_ASSET_TAGS_KEY
+        ][tname]
+        if enclosing_dir:
+            return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname)
+        if seek_key is None:
+            if aname in asset_tag_data[CFG_SEEK_KEYS_KEY]:
+                seek_key = aname
+            else:
+                return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname)
+        try:
+            seek_key_value = asset_tag_data[CFG_SEEK_KEYS_KEY][seek_key]
+            appendix = "" if seek_key_value == "." else seek_key_value
+            return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname, appendix)
+        except KeyError:
+            raise MissingSeekKeyError(
+                f"genome/asset:tag bundle '{gname}/{aname}:{tname}' exists, but "
+                f"seek_key '{seek_key}' is missing"
+            )
+
+    def _get_genome_id(self, gname):
+        """
+        Get the actual genome name used in the object regardless of whether
+        the query name is an actual name or an alias
+
+        :param str gname: genome query name, can be the actual key or its alias
+        :return str: genome id
+        """
+        self._assert_gat_exists(gname)
+        if gname in self[CFG_GENOMES_KEY].keys():
+            return gname
+        return self[CFG_GENOMES_KEY].get_key(alias=gname)
+
+    def _assert_gat_exists(self, gname, aname=None, tname=None, allow_incomplete=False):
+        """
+        Make sure the genome/asset:tag combination exists in the provided mapping and
+        has any seek keys defined.
+        Seek keys are required for the asset completeness.
+
+        :param Mapping[str, Mapping[str, Mapping[str, object]]] genomes: nested
+            collection of key-value pairs, keyed at top level on genome ID, then by
+            asset name, then by asset attribute
+        :param str gname: top level key to query -- genome ID, e.g. mm10
+        :param str aname: second-level key to query -- asset name, e.g. fasta
+        :param str tname: third-level key to query -- tag name, e.g. default
+        :raise MissingGenomeError: if the given key-value pair collection does not
+            contain as a top-level key the given genome ID
+        :raise MissingAssetError: if the given key-value pair collection does
+            contain the given genome ID, but that key's mapping doesn't contain
+            the given asset name as a key
+        :raise GenomeConfigFormatError: if it's discovered during the query that
+            the structure of the given genomes mapping suggests that it was
+            parsed from an improperly formatted/structured genome config file. 
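The errors enumerated above form a natural try/except ladder for callers; a hedged sketch (asset name invented, `rgc` as before):

```python
from refgenconf import MissingAssetError, MissingGenomeError

try:
    path = rgc.seek("hg38", "bwa_index")
except MissingGenomeError:
    print("genome is not in the config")
except MissingAssetError:
    print("genome is known, but the asset is missing -- pull or build it")
```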
+ """ + _LOGGER.debug(f"checking existence of: {gname}/{aname}:{tname}") + try: + genome = self[CFG_GENOMES_KEY][gname] + except KeyError: + raise MissingGenomeError(f"Your genomes do not include '{gname}'") + if aname is not None: + try: + asset_data = genome[CFG_ASSETS_KEY][aname] + except KeyError: + raise MissingAssetError( + f"Genome '{gname}' exists, but asset '{aname}' is missing" + ) + except TypeError: + _raise_not_mapping(asset_data, "Asset section ") + if tname is not None: + try: + tag_data = asset_data[CFG_ASSET_TAGS_KEY][tname] + except KeyError: + raise MissingTagError( + f"genome/asset bundle '{gname}/{aname}' exists, but tag " + f"'{tname}' is missing" + ) + except TypeError: + _raise_not_mapping(asset_data, "Asset section ") + try: + tag_data[CFG_SEEK_KEYS_KEY] + except KeyError: + if not allow_incomplete: + raise MissingSeekKeyError( + f"Asset incomplete. No seek keys are defined for " + f"'{gname}/{aname}:{tname}'. Build or pull the asset again." + ) + + def _list_remote( + self, + url, + genome, + ): """ - Update the progress bar + List genomes and assets available remotely. - :param int b: number of blocks transferred so far - :param int bsize: size of each block (in tqdm units) - :param int tsize: total size (in tqdm units) + :param url: location or ref genome config data + :return str, str: text reps of remotely available genomes and assets """ - if tsize is not None: - self.total = tsize - self.update(b * bsize - self.n) + genomes_data = download_json(url, params={"includeSeekKeys": True}) + return ( + {g: data for g, data in genomes_data.items() if g in genome} + if genome is not None + else genomes_data + ) + def _select_genomes( + self, genome=None, strict=False, order=None, external_genomes=None + ): + """ + Safely select a subset of genomes -def _download_json(url, params=None): + :param list[str] | str genome: genomes that the assets should be found for + :param bool strict: whether a non-existent genome should lead to a warning. + Specific genome request is disregarded otherwise + :param function(str) -> object order: a way to order the genomes in the output + :param list external_genomes: a collection of genomes to use instead + of the one defined in the object + :raise TypeError: if genome argument type is not a list or str + :return list: selected subset of genomes + """ + if external_genomes: + # expects remote genomes to be supplied as aliases; no conversion + genomes = sorted(external_genomes, key=order) + else: + genomes = [ + self.get_genome_alias(x, fallback=True) + for x in sorted(self[CFG_GENOMES_KEY].keys(), key=order) + ] + if not genome: + return genomes + genome = [ + self.get_genome_alias(digest=x, fallback=True) + for x in _make_list_of_str(genome) + ] + if strict: + missing = [] + filtered = [] + for g in genome: + if g in genomes: + filtered.append(g) + else: + missing.append(g) + if missing: + _LOGGER.warning(f"Genomes do not include: {', '.join(missing)}") + return None if not filtered else filtered + return genomes if not all(x in genomes for x in genome) else genome + + +def upgrade_config( + target_version, + filepath, + force=False, + get_json_url=lambda s, i: s + _get_server_endpoints_mapping(s)[i], + link_fun=lambda s, t: os.symlink(s, t), +): """ - Safely connect to the provided API endpoint and download JSON data. + Upgrade the config to a selected target version. 
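A usage sketch for the upgrade entry point whose docstring begins above; the config path is invented, and `force=False` keeps the interactive confirmation prompts:

```python
# Migrate a v0.3 genome config to the v0.4 digest-based layout.
from refgenconf import upgrade_config

upgrade_config(target_version="0.4", filepath="genome_config.yaml", force=False)
```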
-
-    :param str url: server API endpoint
-    :param dict params: query parameters
-    :return dict: served data
+
+    Convert the config file to target_version format, update file structure
+    inside genome_folder. Drop genomes for which genome_digest is not available
+    on any of the servers and which do not have a fasta asset locally.
+
+    :param str target_version: the version to upgrade the config to
+    :param str filepath: path to config file
+    :param bool force: whether to skip the confirmation prompts
+    :param function(str, str) -> str get_json_url: how to build URL from
+        genome server URL base, genome, and asset
+    :param callable link_fun: function to use to link files, e.g. os.symlink or os.link
     """
-    import requests
-    _LOGGER.debug("Downloading JSON data; querying URL: '{}'".format(url))
-    resp = requests.get(url, params=params)
-    if resp.ok:
-        return resp.json()
-    elif resp.status_code == 404:
-        resp = None
-    raise DownloadJsonError(resp)
+    # init rgc obj with provided config
+    current_version = yacman.YacAttMap(filepath=filepath)[CFG_VERSION_KEY]
+
+    if current_version == 0.3:
+        from .refgenconf_v03 import _RefGenConfV03 as OldRefGenConf
+
+        rgc = OldRefGenConf(filepath=filepath, writable=True)
+
+        if target_version == "0.4":
+            from .helpers import format_config_03_04 as format_config
+            from .helpers import alter_file_tree_03_04 as alter_file_tree
+    else:
+        raise NotImplementedError(
+            f"Did not upgrade. Upgrade from v{current_version} config is not "
+            f"implemented."
+        )
+
+    if target_version not in CFG_UPGRADE[str(rgc[CFG_VERSION_KEY])]:
+        raise NotImplementedError(
+            f"Did not upgrade. Can't upgrade to the requested target "
+            f"version ({target_version}). Available target versions for "
+            f"{str(rgc[CFG_VERSION_KEY])} are "
+            f"{CFG_UPGRADE[str(rgc[CFG_VERSION_KEY])]}"
+        )
+
+    # prompt the user
+    url = "http://refgenie.databio.org/en/latest/upgrade_config/"
+    if not force and not query_yes_no(
+        f"Upgrading config to v{target_version}. Current genome identifiers"
+        f" will be replaced with sequence-derived digests and contents of "
+        f"'{rgc[CFG_FOLDER_KEY]}' will be moved to '{DATA_DIR}' and "
+        f"'{ALIAS_DIR}' directories. For more info visit: {url}. Would you "
+        f"like to proceed?"
+    ):
+        _LOGGER.info("Action aborted by the user.")
+        return False
+
+    # test server(s) and prompt
+    cnt = 0
+    incompat_servers = []
+    for server in rgc[CFG_SERVERS_KEY]:
+        cnt += 1
+        try:
+            get_json_url(server, API_VERSION + API_ID_ASSETS)
+        except (KeyError, ConnectionError, DownloadJsonError):
+            incompat_servers.append(server)
+    if incompat_servers:
+        _LOGGER.info(
+            f"The following refgenieserver instances are not "
+            f"compatible or do not exist: {incompat_servers}"
+        )
+
+    # check digest availability
+    missing_digest = []
+    for genome, genome_v in rgc[CFG_GENOMES_KEY].items():
+        try:
+            tag = rgc.get_default_tag(genome, "fasta")
+            asset_path = rgc.seek(genome, "fasta", tag, "fasta")
+            if not os.path.exists(asset_path):
+                raise FileNotFoundError
+        except (MissingAssetError, FileNotFoundError):
+            cnt = 0
+            servers = rgc[CFG_SERVERS_KEY]
+            for server in servers:
+                cnt += 1
+                try:
+                    get_json_url(s=server, i=API_ID_ALIAS_DIGEST).format(alias=genome)
+                    break
+                except (KeyError, ConnectionError, DownloadJsonError) as e:
+                    if cnt == len(servers):
+                        missing_digest.append(genome)
+                        continue
+
+    if (
+        not force
+        and missing_digest
+        and not query_yes_no(
+            f"The following genomes will be lost due to the lack of local fasta "
+            f"assets and remote genome digests: {', '.join(missing_digest)}. "
+            f"Would you like to proceed?" 
+ ) + ): + _LOGGER.info("Action aborted by the user.") + return False + + # reformat config file + format_config(rgc, get_json_url=get_json_url) + # alter genome_folder structure + alter_file_tree(rgc, link_fun=link_fun) + # change the config_version + rgc[CFG_VERSION_KEY] = target_version + # write over the config file + rgc.write() + return True def _download_url_progress(url, output_path, name, params=None): @@ -1399,12 +2816,49 @@ def _download_url_progress(url, output_path, name, params=None): :param str name: name to display in front of the progress bar :param dict params: query parameters to be added to the request """ - url = url if params is None else url + "?{}".format(urllib.parse.urlencode(params)) - with DownloadProgressBar(unit_scale=True, desc=name, unit="B", bar_format=CUSTOM_BAR_FMT, leave=False) as dpb: - urllib.request.urlretrieve(url, filename=output_path, reporthook=dpb.update_to) + class _HookProgress(Progress): + """ + Internal class to connect progress bar with URL retrieval context manager + """ + + @staticmethod + def rep_hook(count, blockSize, totalSize): + """ + Needs to take three arguments in this order + """ + progress.update(task_id, advance=blockSize) + + def _get_content_len(x): + """ + Get length of remote content + """ + f = urlopen(x) + content_len = f.info().get("Content-length") + f.close() + return int(content_len) + + progress = _HookProgress( + TextColumn("{task.fields[n]}", justify="right"), + BarColumn(bar_width=None), + "[magenta]{task.percentage:>3.1f}%", + "•", + _DownloadColumn(), + "•", + _TransferSpeedColumn(), + "•", + _TimeRemainingColumn(), + ) + + url = url if params is None else url + "?{}".format(urlencode(params)) + task_id = progress.add_task("download", n=name, total=_get_content_len(url)) + with progress as p: + urlretrieve(url, filename=output_path, reporthook=p.rep_hook) -def _genome_asset_path(genomes, gname, aname, tname, seek_key, enclosing_dir, no_tag=False): + +def _genome_asset_path( + genomes, gname, aname, tname, seek_key, enclosing_dir, no_tag=False +): """ Retrieve the raw path value for a particular asset for a particular genome. @@ -1415,8 +2869,9 @@ def _genome_asset_path(genomes, gname, aname, tname, seek_key, enclosing_dir, no :param str aname: second-level key to query -- asset name, e.g. fasta :param str tname: third-level key to query -- tag name, e.g. default :param str seek_key: fourth-level key to query -- tag name, e.g. chrom_sizes - :param bool enclosing_dir: whether a path to the entire enclosing directory should be returned, e.g. - for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned + :param bool enclosing_dir: whether a path to the entire enclosing directory should + be returned, e.g. for a fasta asset that has 3 seek_keys pointing to 3 files + in an asset dir, that asset dir is returned :return str: raw path value for a particular asset for a particular genome :raise MissingGenomeError: if the given key-value pair collection does not contain as a top-level key the given genome ID @@ -1443,8 +2898,10 @@ def _genome_asset_path(genomes, gname, aname, tname, seek_key, enclosing_dir, no try: seek_key_value = asset_tag_data[CFG_SEEK_KEYS_KEY][seek_key] except KeyError: - raise MissingSeekKeyError("genome/asset:tag bundle '{}/{}:{}' exists, but seek_key '{}' is missing". 
- format(gname, aname, tname, seek_key)) + raise MissingSeekKeyError( + f"genome/asset:tag bundle '{gname}/{aname}:{tname}' exists, but " + f"seek_key '{seek_key}' is missing" + ) else: if no_tag: return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], seek_key_value) @@ -1453,8 +2910,8 @@ def _genome_asset_path(genomes, gname, aname, tname, seek_key, enclosing_dir, no def _assert_gat_exists(genomes, gname, aname=None, tname=None, allow_incomplete=False): """ - Make sure the genome/asset:tag combination exists in the provided mapping and has any seek keys defined. - Seek keys are required for the asset completeness. + Make sure the genome/asset:tag combination exists in the provided mapping and has + any seek keys defined. Seek keys are required for the asset completeness. :param Mapping[str, Mapping[str, Mapping[str, object]]] genomes: nested collection of key-value pairs, keyed at top level on genome ID, then by @@ -1471,16 +2928,18 @@ def _assert_gat_exists(genomes, gname, aname=None, tname=None, allow_incomplete= the structure of the given genomes mapping suggests that it was parsed from an improperly formatted/structured genome config file. """ - _LOGGER.debug("checking existence of: {}/{}:{}".format(gname, aname, tname)) + _LOGGER.debug(f"checking existence of: {gname}/{aname}:{tname}") try: genome = genomes[gname] except KeyError: - raise MissingGenomeError("Your genomes do not include '{}'".format(gname)) + raise MissingGenomeError(f"Your genomes do not include '{gname}'") if aname is not None: try: asset_data = genome[CFG_ASSETS_KEY][aname] except KeyError: - raise MissingAssetError("Genome '{}' exists, but asset '{}' is missing".format(gname, aname)) + raise MissingAssetError( + f"Genome '{gname}' exists, but asset '{aname}' is missing" + ) except TypeError: _raise_not_mapping(asset_data, "Asset section ") if tname is not None: @@ -1488,15 +2947,19 @@ def _assert_gat_exists(genomes, gname, aname=None, tname=None, allow_incomplete= tag_data = asset_data[CFG_ASSET_TAGS_KEY][tname] except KeyError: raise MissingTagError( - "genome/asset bundle '{}/{}' exists, but tag '{}' is missing".format(gname, aname, tname)) + f"genome/asset bundle '{gname}/{aname}' exists, " + f"but tag '{tname}' is missing" + ) except TypeError: _raise_not_mapping(asset_data, "Asset section ") try: tag_data[CFG_SEEK_KEYS_KEY] except KeyError: if not allow_incomplete: - raise MissingSeekKeyError("Asset incomplete. No seek keys are defined for '{}/{}:{}'. " - "Build or pull the asset again.".format(gname, aname, tname)) + raise MissingSeekKeyError( + f"Asset incomplete. No seek keys are defined for " + f"'{gname}/{aname}:{tname}'. Build or pull the asset again." 
+ ) def _is_large_archive(size, cutoff=10): @@ -1506,48 +2969,33 @@ def _is_large_archive(size, cutoff=10): :param str size: size string :return bool: the decision """ + def _str2float(x): """ Remove any letters from the file size string and cast the remainder to float """ - return float("".join(c for c in x if c in '0123456789.')) + return float("".join(c for c in x if c in "0123456789.")) - _LOGGER.debug("Checking archive size: '{}'".format(size)) + _LOGGER.debug(f"Checking archive size: '{size}'") if size.endswith("MB"): # convert to gigs - size = '{0:f}GB'.format(_str2float(size) / 1000) + size = "{0:f}GB".format(_str2float(size) / 1000) if size.endswith("KB"): # convert to gigs - size = '{0:f}GB'.format(_str2float(size) / 1000**2) + size = "{0:f}GB".format(_str2float(size) / 1000 ** 2) return size.endswith("TB") or (size.endswith("GB") and _str2float(size) > cutoff) -def _list_remote(url, genome, order=None, as_str=True): - """ - List genomes and assets available remotely. - - :param url: location or ref genome config data - :param function(str) -> object order: how to key genome IDs and asset - names for sort - :return str, str: text reps of remotely available genomes and assets - """ - genomes_data = _read_remote_data(url) - refgens = _select_genomes(sorted(genomes_data.keys(), key=order), genome, - strict=True) - if not refgens: - return None, None if as_str else dict() - filtered_genomes_data = OrderedDict( - [(rg, sorted(genomes_data[rg], key=order)) for rg in refgens] - ) - if not as_str: - return filtered_genomes_data - asset_texts = ["{}/ {}".format(g.rjust(20), ", ".join(a)) - for g, a in filtered_genomes_data.items()] - return ", ".join(refgens), "\n".join(asset_texts) - - -def _make_genome_assets_line(gen, assets, offset_text=" ", genome_assets_delim="/ ", asset_sep=", ", order=None, - asset_tag_delim=":"): +def _make_genome_assets_line( + gen, + assets, + offset_text=" ", + genome_assets_delim="/ ", + asset_sep=", ", + order=None, + asset_tag_delim=":", + rjust=20, +): """ Build a line of text for display of assets by genome @@ -1560,8 +3008,12 @@ def _make_genome_assets_line(gen, assets, offset_text=" ", genome_assets_delim= :param function(str) -> object order: how to key asset names for sort :return str: text representation of a single assembly's name and assets """ - tagged_assets = asset_sep.join(sorted(_make_asset_tags_product(assets, asset_tag_delim), key=order)) - return "{}{}{}{}".format(gen.rjust(20), genome_assets_delim, offset_text, tagged_assets) + tagged_assets = asset_sep.join( + sorted(_make_asset_tags_product(assets, asset_tag_delim), key=order) + ) + return "{}{}{}{}".format( + gen.rjust(rjust), genome_assets_delim, offset_text, tagged_assets + ) def _make_asset_tags_product(assets, asset_tag_delim=":", asset_sk_delim="."): @@ -1580,32 +3032,27 @@ def _make_asset_tags_product(assets, asset_tag_delim=":", asset_sk_delim="."): seek_keys = get_tag_seek_keys(tag) # proceed only if asset is 'complete' -- has seek_keys if seek_keys is not None: - # add seek_keys if exist and different from the asset name, otherwise just the asset name - sk_assets.extend([asset_sk_delim.join([aname, sk]) if sk != aname else aname for sk in seek_keys]) + # add seek_keys if exist and different from the asset name, + # otherwise just the asset name + sk_assets.extend( + [ + asset_sk_delim.join([aname, sk]) if sk != aname else aname + for sk in seek_keys + ] + ) # add tags to the asset.seek_key list - tagged_assets.extend([asset_tag_delim.join(i) for i in 
itertools.product(sk_assets, [tname])]) + tagged_assets.extend( + [asset_tag_delim.join(i) for i in itertools.product(sk_assets, [tname])] + ) return tagged_assets -def _read_remote_data(url): - """ - Read as JSON data from a URL request response. - - :param str url: data request - :return dict: JSON parsed from the response from given URL request - """ - with urllib.request.urlopen(url) as response: - encoding = response.info().get_content_charset('utf8') - return json.loads(response.read().decode(encoding)) - - def _check_insert_data(obj, datatype, name): """ Checks validity of an object """ if obj is None: return False if not isinstance(obj, datatype): - raise TypeError("{} must be {}; got {}".format( - name, datatype.__name__, type(obj).__name__)) + raise TypeError(f"{name} must be {datatype.__name__}; got {type(obj).__name__}") return True @@ -1617,8 +3064,12 @@ def _make_list_of_str(arg): :return list: list of strings :raise TypeError: if a fault argument was provided """ + def _raise_faulty_arg(): - raise TypeError("Provided argument has to be a list[str] or a str, got '{}'".format(arg.__class__.__name__)) + raise TypeError( + f"Provided argument has to be a list[str] or a str, " + f"got '{arg.__class__.__name__}'" + ) if isinstance(arg, str): return [arg] @@ -1642,40 +3093,12 @@ def _extend_unique(l1, l2): return l1 + list(set(l2) - set(l1)) -def _select_genomes(genomes, genome=None, strict=False): - """ - Safely select a subset of genomes - - :param list[str] | str genome: genomes that the assets should be found for - :param bool strict: whether a non-existent genome should lead to a warning. - Specific genome request is disregarded otherwise - :raise TypeError: if genome argument type is not a list or str - :return list: selected subset of genomes - """ - if genome: - genome = _make_list_of_str(genome) - else: - return genomes - if strict: - missing = [] - filtered = [] - for g in genome: - if g in genomes: - filtered.append(g) - else: - missing.append(g) - if missing: - _LOGGER.warning("Genomes do not include: {}".format(", ".join(missing))) - return None if not filtered else filtered - return genomes if not all(x in genomes for x in genome) else genome - - def get_asset_tags(asset): """ Return a list of asset tags. - These need an accession function since under the tag name key there are not only tag names, but also the - default tag pointer + These need an accession function since under the tag name key there are not only + tag names, but also the default tag pointer :param Mapping asset: a single asset part of the RefGenConf :return list: asset tags @@ -1693,39 +3116,61 @@ def get_tag_seek_keys(tag): return [s for s in tag[CFG_SEEK_KEYS_KEY]] if CFG_SEEK_KEYS_KEY in tag else None -def construct_request_url(server_url, operation_id): +def construct_request_url(server_url, operation_id, api_prefix=API_VERSION): """ Create a request URL based on a openAPI description :param str server_url: server URL :param str operation_id: the operationId of the endpoint + :param str api_prefix: a string to prepend to the operation id :return str: a complete URL for the request """ + exception_str = f"'{server_url}' is not a compatible refgenieserver instance. 
" try: - return server_url + _get_server_endpoints_mapping(server_url)[operation_id] + return ( + server_url + + _get_server_endpoints_mapping(server_url)[api_prefix + operation_id] + ) + except MissingSchema as e: + _LOGGER.error( + exception_str + f"Could not fetch OpenAPI schema: {server_url}/openapi.json" + ) except KeyError as e: - _LOGGER.error("'{}' is not a compatible refgenieserver instance. " - "Could not determine API endpoint defined by ID: {}".format(server_url, e)) - sys.exit(1) + _LOGGER.error( + exception_str + f"Could not determine API endpoint defined by ID: {e}" + ) def _get_server_endpoints_mapping(url): """ - Establishes the API with the server using operationId field in the openAPI JSON description + Establishes the API with the server using operationId field in the openAPI + JSON description :param str url: server URL :return dict: endpoints mapped by their operationIds """ - json = _download_json(url + "/openapi.json") - return map_paths_by_id(asciify_json_dict(json) if sys.version_info[0] == 2 else json) + json = download_json(url + "/openapi.json") + return map_paths_by_id( + asciify_json_dict(json) if sys.version_info[0] == 2 else json + ) def map_paths_by_id(json_dict): # check the required input dict characteristics to construct the mapping - if "openapi" not in json_dict or not isinstance(json_dict["openapi"], str) \ - or "paths" not in json_dict or not isinstance(json_dict["paths"], dict): - raise ValueError("The provided mapping is not a valid representation of a JSON openAPI description") - return {values["get"]["operationId"]: endpoint for endpoint, values in json_dict["paths"].items()} + if ( + "openapi" not in json_dict + or not isinstance(json_dict["openapi"], str) + or "paths" not in json_dict + or not isinstance(json_dict["paths"], dict) + ): + raise ValueError( + "The provided mapping is not a valid representation of a " + "JSON openAPI description" + ) + return { + values["get"]["operationId"]: endpoint + for endpoint, values in json_dict["paths"].items() + } def _remove(path): @@ -1736,12 +3181,13 @@ def _remove(path): :return str: removed path """ from shutil import rmtree + if os.path.isfile(path): os.remove(path) elif os.path.isdir(path): rmtree(path) else: - raise ValueError("path '{}' is neither a file nor a dir.".format(path)) + raise ValueError(f"path '{path}' is neither a file nor a dir.") return path @@ -1756,12 +3202,18 @@ def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entitie """ subclass = "asset" if entity_class == "genome" else "tag" if os.path.basename(directory) == asset_dict[entity_class]: - _LOGGER.info("Last {sub} for {ec} '{en}' has been removed, removing {ec} directory". - format(sub=subclass, ec=entity_class, en=asset_dict[entity_class])) + _LOGGER.info( + "Last {sub} for {ec} '{en}' has been removed, " + "removing {ec} directory".format( + sub=subclass, ec=entity_class, en=asset_dict[entity_class] + ) + ) removed_entities.append(_remove(directory)) else: - _LOGGER.debug("Didn't remove '{}' since it does not match the {} name: {}". 
- format(directory, entity_class, asset_dict[entity_class])) + _LOGGER.debug( + f"Didn't remove '{directory}' since it does not match the {entity_class} " + f"name: {asset_dict[entity_class]}" + ) def _safe_setdef(mapping, attr, val): @@ -1778,12 +3230,12 @@ def _safe_setdef(mapping, attr, val): try: mapping.setdefault(attr, val) except (TypeError, AttributeError): - _raise_not_mapping(mapping, "Cannot update; Section '{}' ".format(attr)) + _raise_not_mapping(mapping, f"Cannot update; Section '{attr}' ") return mapping def _raise_not_mapping(mapping, prefix=""): raise GenomeConfigFormatError( - prefix + "is not a mapping but '{}'. This is usually a result of " - "a previous error".format(type(mapping).__name__) - ) \ No newline at end of file + prefix + f"is not a mapping but '{type(mapping).__name__}'. " + f"This is usually a result of a previous error" + ) diff --git a/refgenconf/refgenconf_v03.py b/refgenconf/refgenconf_v03.py new file mode 100644 index 00000000..e470eb97 --- /dev/null +++ b/refgenconf/refgenconf_v03.py @@ -0,0 +1,2202 @@ +#!/usr/bin/env python + +import sys +import urllib.request +import itertools +import logging +import os +import signal +import warnings +import shutil +import json +import yacman + +from collections import Iterable, Mapping, OrderedDict +from functools import partial +from inspect import getfullargspec as finspect +from urllib.error import HTTPError, ContentTooShortError +from tqdm import tqdm +from pkg_resources import iter_entry_points +from tempfile import TemporaryDirectory + +from attmap import PathExAttMap as PXAM +from ubiquerg import ( + checksum, + is_url, + query_yes_no, + untar, + is_writable, + parse_registry_path as prp, +) + +from .const import * +from .helpers import ( + unbound_env_vars, + asciify_json_dict, + select_genome_config, + get_dir_digest, +) +from .exceptions import * + +_LOGGER = logging.getLogger(__name__) + +__all__ = ["_RefGenConfV03"] + + +def _handle_sigint(filepath): + def handle(sig, frame): + _LOGGER.warning("\nThe download was interrupted: {}".format(filepath)) + try: + os.remove(filepath) + except OSError: + _LOGGER.debug("'{}' not found, can't remove".format(filepath)) + else: + _LOGGER.info("Incomplete file '{}' was removed".format(filepath)) + sys.exit(0) + + return handle + + +class _RefGenConfV03(yacman.YacAttMap): + """ A sort of oracle of available reference genome assembly assets """ + + def __init__( + self, + filepath=None, + entries=None, + writable=False, + wait_max=60, + skip_read_lock=False, + ): + """ + Create the config instance by with a filepath or key-value pairs. + :param str filepath: a path to the YAML file to read + :param Iterable[(str, object)] | Mapping[str, object] entries: + config filepath or collection of key-value pairs + :param bool writable: whether to create the object with write capabilities + :param int wait_max: how long to wait for creating an object when the + file that data will be read from is locked + :param bool skip_read_lock: whether the file should not be locked for + reading when object is created in read only mode + :raise refgenconf.MissingConfigDataError: if a required configuration + item is missing + :raise ValueError: if entries is given as a string and is not a file + """ + + def _missing_key_msg(key, value): + _LOGGER.debug("Config lacks '{}' key. 
Setting to: {}".format(key, value)) + + super(_RefGenConfV03, self).__init__( + filepath=filepath, + entries=entries, + writable=writable, + wait_max=wait_max, + skip_read_lock=skip_read_lock, + ) + genomes = self.setdefault(CFG_GENOMES_KEY, PXAM()) + if not isinstance(genomes, PXAM): + if genomes: + _LOGGER.warning( + "'{k}' value is a {t_old}, not a {t_new}; setting to empty {t_new}".format( + k=CFG_GENOMES_KEY, + t_old=type(genomes).__name__, + t_new=PXAM.__name__, + ) + ) + self[CFG_GENOMES_KEY] = PXAM() + if CFG_FOLDER_KEY not in self: + self[CFG_FOLDER_KEY] = ( + os.path.dirname(entries) if isinstance(entries, str) else os.getcwd() + ) + _missing_key_msg(CFG_FOLDER_KEY, self[CFG_FOLDER_KEY]) + try: + version = self[CFG_VERSION_KEY] + except KeyError: + _missing_key_msg(CFG_VERSION_KEY, REQ_CFG_VERSION) + self[CFG_VERSION_KEY] = REQ_CFG_VERSION + else: + try: + version = float(version) + except ValueError: + _LOGGER.warning( + "Cannot parse config version as numeric: {}".format(version) + ) + else: + if version < 0.3: + msg = ( + "This genome config (v{}) is not compliant with v{} standards." + " To use it, please downgrade refgenie: " + "'pip install refgenie=={}'.".format( + self[CFG_VERSION_KEY], + str(REQ_CFG_VERSION), + REFGENIE_BY_CFG[str(version)], + ) + ) + raise ConfigNotCompliantError(msg) + else: + _LOGGER.debug("Config version is compliant: {}".format(version)) + if CFG_SERVERS_KEY not in self and CFG_SERVER_KEY in self: + # backwards compatibility after server config key change + self[CFG_SERVERS_KEY] = self[CFG_SERVER_KEY] + del self[CFG_SERVER_KEY] + _LOGGER.debug( + "Moved servers list from '{}' to '{}'".format( + CFG_SERVER_KEY, CFG_SERVERS_KEY + ) + ) + try: + if isinstance(self[CFG_SERVERS_KEY], list): + tmp_list = [ + server_url.rstrip("/") for server_url in self[CFG_SERVERS_KEY] + ] + self[CFG_SERVERS_KEY] = tmp_list + else: # Logic in pull_asset expects a list, even for a single server + self[CFG_SERVERS_KEY] = self[CFG_SERVERS_KEY].rstrip("/") + self[CFG_SERVERS_KEY] = [self[CFG_SERVERS_KEY]] + except KeyError: + _missing_key_msg(CFG_SERVERS_KEY, str([DEFAULT_SERVER])) + self[CFG_SERVERS_KEY] = [DEFAULT_SERVER] + + def __bool__(self): + minkeys = set(self.keys()) == set(RGC_REQ_KEYS) + return not minkeys or bool(self[CFG_GENOMES_KEY]) + + __nonzero__ = __bool__ + + @property + def plugins(self): + """ + Plugins registered by entry points in the current Python env + :return dict[dict[function(refgenconf.RefGenConf)]]: dict which keys + are names of all possible hooks and values are dicts mapping + registered functions names to their values + """ + return { + h: {ep.name: ep.load() for ep in iter_entry_points("refgenie.hooks." 
+ h)} + for h in HOOKS + } + + @property + def file_path(self): + """ + Path to the genome configuration file + + :return str: path to the genome configuration file + """ + return self[yacman.IK][yacman.FILEPATH_KEY] + + def initialize_config_file(self, filepath=None): + """ + Initialize genome configuration file on disk + :param str filepath: a valid path where the configuration file should be initialized + :return str: the filepath the file was initialized at + :raise OSError: in case the file could not be initialized due to insufficient permissions or pre-existence + :raise TypeError: if no valid filepath cat be determined + """ + + def _write_fail_err(reason): + raise OSError("Can't initialize, {}: {} ".format(reason, filepath)) + + filepath = select_genome_config(filepath, check_exist=False) + if not isinstance(filepath, str): + raise TypeError( + "Could not determine a valid path to " + "initialize a configuration file: {}".format(str(filepath)) + ) + if os.path.exists(filepath): + _write_fail_err("file exists") + if not is_writable(filepath, check_exist=False): + _write_fail_err("insufficient permissions") + self.make_writable(filepath) + self.write() + self.make_readonly() + _LOGGER.info("Initialized genome configuration file: {}".format(filepath)) + return filepath + + def list(self, genome=None, order=None, include_tags=False): + """ + List local assets; map each namespace to a list of available asset names + :param callable(str) -> object order: how to key genome IDs for sort + :param list[str] | str genome: genomes that the assets should be found for + :param bool include_tags: whether asset tags should be included in the returned dict + :return Mapping[str, Iterable[str]]: mapping from assembly name to + collection of available asset names. + """ + self.run_plugins(PRE_LIST_HOOK) + refgens = _select_genomes( + sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome + ) + if include_tags: + self.run_plugins(POST_LIST_HOOK) + return OrderedDict( + [ + ( + g, + sorted( + _make_asset_tags_product( + self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY], ":" + ), + key=order, + ), + ) + for g in refgens + ] + ) + self.run_plugins(POST_LIST_HOOK) + return OrderedDict( + [ + ( + g, + sorted( + list(self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY].keys()), key=order + ), + ) + for g in refgens + ] + ) + + def assets_str( + self, + offset_text=" ", + asset_sep=", ", + genome_assets_delim="/ ", + genome=None, + order=None, + ): + """ + Create a block of text representing genome-to-asset mapping. 
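The archived class is kept so `upgrade_config` can read pre-0.4 configs; a brief sketch of listing assets through it (the config path is invented):

```python
# Read-only peek at an old-style config via the archived v0.3 class.
from refgenconf.refgenconf_v03 import _RefGenConfV03

old_rgc = _RefGenConfV03(filepath="old_genome_config.yaml")
print(old_rgc.list(include_tags=True))  # e.g. {'hg38': ['fasta:default', ...]}
```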
+ :param str offset_text: text that begins each line of the text + representation that's produced + :param str asset_sep: the delimiter between names of types of assets, + within each genome line + :param str genome_assets_delim: the delimiter to place between + reference genome assembly name and its list of asset names + :param list[str] | str genome: genomes that the assets should be found for + :param function(str) -> object order: how to key genome IDs and asset + names for sort + :return str: text representing genome-to-asset mapping + """ + refgens = _select_genomes( + sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome + ) + make_line = partial( + _make_genome_assets_line, + offset_text=offset_text, + genome_assets_delim=genome_assets_delim, + asset_sep=asset_sep, + order=order, + ) + return "\n".join( + [make_line(g, self[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY]) for g in refgens] + ) + + def add(self, path, genome, asset, tag=None, seek_keys=None, force=False): + """ + Add an external asset to the config + :param str path: a path to the asset to add; must exist and be relative + to the genome_folder + :param str genome: genome name + :param str asset: asset name + :param str tag: tag name + :param dict seek_keys: seek keys to add + :param bool force: whether to force existing asset overwrite + """ + tag = tag or self.get_default_tag(genome, asset) + abspath = os.path.join(self[CFG_FOLDER_KEY], path) + remove = False + if not os.path.exists(abspath) or not os.path.isabs(abspath): + raise OSError( + "Provided path must exist and be relative to the" + " genome_folder: {}".format(self[CFG_FOLDER_KEY]) + ) + try: + _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag) + except Exception: + pass + else: + if not force and not query_yes_no( + "'{}/{}:{}' exists. Do you want to overwrite?".format( + genome, asset, tag + ) + ): + _LOGGER.info("Aborted by a user, asset no added") + return False + remove = True + _LOGGER.info("Will remove existing to overwrite") + tag_data = { + CFG_ASSET_PATH_KEY: path, + CFG_ASSET_CHECKSUM_KEY: get_dir_digest(path) or "", + } + msg = "Added asset: {}/{}:{} {}".format( + genome, + asset, + tag, + "" if not seek_keys else "with seek keys: {}".format(seek_keys), + ) + if not self.file_path: + if remove: + self.cfg_remove_assets(genome, asset, tag) + self.update_tags(genome, asset, tag, tag_data) + self.update_seek_keys(genome, asset, tag, seek_keys or {asset: "."}) + self.set_default_pointer(genome, asset, tag) + _LOGGER.info(msg) + return True + with self as rgc: + if remove: + rgc.cfg_remove_assets(genome, asset, tag) + rgc.update_tags(genome, asset, tag, tag_data) + rgc.update_seek_keys(genome, asset, tag, seek_keys or {asset: "."}) + rgc.set_default_pointer(genome, asset, tag) + _LOGGER.info(msg) + return True + + def filepath(self, genome, asset, tag, ext=".tgz", dir=False): + """ + Determine path to a particular asset for a particular genome. + :param str genome: reference genome ID + :param str asset: asset name + :param str tag: tag name + :param str ext: file extension + :param bool dir: whether to return the enclosing directory instead of the file + :return str: path to asset for given genome and asset kind/name + """ + tag_dir = os.path.join(self[CFG_FOLDER_KEY], genome, asset, tag) + return os.path.join(tag_dir, asset + "__" + tag + ext) if not dir else tag_dir + + def genomes_list(self, order=None): + """ + Get a list of this configuration's reference genome assembly IDs. 
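The v0.3 `filepath` helper above composes archive and directory paths directly from the genome folder; a short sketch reusing `old_rgc` from the previous example (names invented):

```python
tag_dir = old_rgc.filepath("hg38", "bowtie2_index", "default", dir=True)
# <genome_folder>/hg38/bowtie2_index/default
tarball = old_rgc.filepath("hg38", "bowtie2_index", "default")
# <genome_folder>/hg38/bowtie2_index/default/bowtie2_index__default.tgz
```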
+        :return Iterable[str]: list of this configuration's reference genome
+            assembly IDs
+        """
+        return sorted(list(self[CFG_GENOMES_KEY].keys()), key=order)
+
+    def genomes_str(self, order=None):
+        """
+        Get as single string this configuration's reference genome assembly IDs.
+        :param function(str) -> object order: how to key genome IDs for sort
+        :return str: single string that lists this configuration's known
+            reference genome assembly IDs
+        """
+        return ", ".join(self.genomes_list(order))
+
+    def seek(
+        self,
+        genome_name,
+        asset_name,
+        tag_name=None,
+        seek_key=None,
+        strict_exists=None,
+        enclosing_dir=False,
+        check_exist=lambda p: os.path.exists(p) or is_url(p),
+    ):
+        """
+        Seek path to a specified genome-asset-tag
+        :param str genome_name: name of a reference genome assembly of interest
+        :param str asset_name: name of the particular asset to fetch
+        :param str tag_name: name of the particular asset tag to fetch
+        :param str seek_key: name of the particular subasset to fetch
+        :param bool | NoneType strict_exists: how to handle the case in which the
+            path doesn't exist; True to raise IOError, False to raise
+            RuntimeWarning, and None to do nothing at all. Default: None (do not check).
+        :param function(callable) -> bool check_exist: how to check for
+            asset/path existence
+        :param bool enclosing_dir: whether a path to the entire enclosing directory should be returned, e.g.
+            for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned
+        :return str: path to the asset
+        :raise TypeError: if the existence check is not a one-arg function
+        :raise refgenconf.MissingGenomeError: if the named assembly isn't known
+            to this configuration instance
+        :raise refgenconf.MissingAssetError: if the named assembly is known to
+            this configuration instance, but the requested asset is unknown
+        """
+        tag_name = tag_name or self.get_default_tag(genome_name, asset_name)
+        _LOGGER.debug(
+            "getting asset: '{}/{}.{}:{}'".format(
+                genome_name, asset_name, seek_key, tag_name
+            )
+        )
+        if not callable(check_exist) or len(finspect(check_exist).args) != 1:
+            raise TypeError("Asset existence check must be a one-arg function.")
+        # 3 'path' key options supported
+        # option1: absolute path
+        # get just the raw path value from the config
+        path_val = _genome_asset_path(
+            self[CFG_GENOMES_KEY],
+            genome_name,
+            asset_name,
+            tag_name,
+            enclosing_dir=True,
+            no_tag=True,
+            seek_key=None,
+        )
+        _LOGGER.debug("Trying absolute path: {}".format(path_val))
+        if seek_key:
+            path = os.path.join(path_val, seek_key)
+        else:
+            path = path_val
+        if os.path.isabs(path) and check_exist(path):
+            return path
+        # option2: relative to genome_folder/{genome} (default, canonical)
+        path = _genome_asset_path(
+            self[CFG_GENOMES_KEY],
+            genome_name,
+            asset_name,
+            tag_name,
+            seek_key,
+            enclosing_dir,
+        )
+        fullpath = os.path.join(self[CFG_FOLDER_KEY], genome_name, path)
+        _LOGGER.debug(
+            "Trying relative to genome_folder/genome ({}/{}): {}".format(
+                self[CFG_FOLDER_KEY], genome_name, fullpath
+            )
+        )
+        if check_exist(fullpath):
+            return fullpath
+        # option3: relative to the genome_folder (if option2 does not exist)
+        gf_relpath = os.path.join(
+            self[CFG_FOLDER_KEY],
+            _genome_asset_path(
+                self[CFG_GENOMES_KEY],
+                genome_name,
+                asset_name,
+                tag_name,
+                seek_key,
+                enclosing_dir,
+                no_tag=True,
+            ),
+        )
+        _LOGGER.debug(
+            "Trying path relative to genome_folder ({}): {}".format(
+                self[CFG_FOLDER_KEY], gf_relpath
+            )
+        )
+        if check_exist(gf_relpath):
+            return gf_relpath
+
+        msg = "For genome '{}' the asset '{}.{}:{}' doesn't exist; tried: {}".format(
"For genome '{}' the asset '{}.{}:{}' doesn't exist; tried: {}".format( + genome_name, + asset_name, + seek_key, + tag_name, + ",".join([path, gf_relpath, fullpath]), + ) + # return option2 if existence not enforced + if strict_exists is None: + _LOGGER.debug(msg) + elif strict_exists is True: + raise OSError(msg) + else: + warnings.warn(msg, RuntimeWarning) + return fullpath + + def get_default_tag(self, genome, asset, use_existing=True): + """ + Determine the asset tag to use as default. The one indicated by the 'default_tag' key in the asset + section is returned. + If no 'default_tag' key is found, by default the first listed tag is returned with a RuntimeWarning. + This behavior can be turned off with use_existing=False + :param str genome: name of a reference genome assembly of interest + :param str asset: name of the particular asset of interest + :param bool use_existing: whether the first tag in the config should be returned in case there is no default + tag defined for an asset + :return str: name of the tag to use as the default one + """ + try: + _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset) + except RefgenconfError: + _LOGGER.info( + "Using '{}' as the default tag for '{}/{}'".format( + DEFAULT_TAG, genome, asset + ) + ) + return DEFAULT_TAG + try: + return self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_DEFAULT_TAG_KEY + ] + except KeyError: + alt = ( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ].keys()[0] + if use_existing + else DEFAULT_TAG + ) + if isinstance(alt, str): + if alt != DEFAULT_TAG: + warnings.warn( + "Could not find the '{}' key for asset '{}/{}'. " + "Used the first one in the config instead: '{}'. " + "Make sure it does not corrupt your workflow.".format( + CFG_ASSET_DEFAULT_TAG_KEY, genome, asset, alt + ), + RuntimeWarning, + ) + else: + warnings.warn( + "Could not find the '{}' key for asset '{}/{}'. " + "Returning '{}' instead. Make sure it does not corrupt your workflow.".format( + CFG_ASSET_DEFAULT_TAG_KEY, genome, asset, alt + ), + RuntimeWarning, + ) + return alt + except TypeError: + _raise_not_mapping( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], "Asset section " + ) + + def set_default_pointer(self, genome, asset, tag, force=False): + """ + Point to the selected tag by default + :param str genome: name of a reference genome assembly of interest + :param str asset: name of the particular asset of interest + :param str tag: name of the particular asset tag to point to by default + :param bool force: whether the default tag change should be forced (even if it exists) + """ + _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag) + if ( + CFG_ASSET_DEFAULT_TAG_KEY + not in self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] + or len( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_DEFAULT_TAG_KEY + ] + ) + == 0 + or force + ): + self.update_assets(genome, asset, {CFG_ASSET_DEFAULT_TAG_KEY: tag}) + _LOGGER.info( + "Default tag for '{}/{}' set to: {}".format(genome, asset, tag) + ) + + def list_assets_by_genome(self, genome=None, order=None, include_tags=False): + """ + List types/names of assets that are available for one--or all--genomes. 
+        :param str | NoneType genome: reference genome assembly ID, optional;
+            if omitted, the full mapping from genome to asset names is returned
+        :param function(str) -> object order: how to key genome IDs and asset
+            names for sort
+        :param bool include_tags: whether asset tags should be included in the returned dict
+        :return Iterable[str] | Mapping[str, Iterable[str]]: collection of
+            asset type names available for particular reference assembly if
+            one is provided, else the full mapping between assembly ID and
+            collection of available asset type names
+        """
+        return (
+            self.list(genome, order, include_tags=include_tags)[genome]
+            if genome is not None
+            else self.list(order=order, include_tags=include_tags)
+        )
+
+    def list_genomes_by_asset(self, asset=None, order=None):
+        """
+        List assemblies for which a particular asset is available.
+        :param str | NoneType asset: name of type of asset of interest, optional
+        :param function(str) -> object order: how to key genome IDs and asset
+            names for sort
+        :return Iterable[str] | Mapping[str, Iterable[str]]: collection of
+            assemblies for which the given asset is available; if asset
+            argument is omitted, the full mapping from name of asset type to
+            collection of assembly names for which the asset key is available
+            will be returned.
+        """
+        return (
+            self._invert_genomes(order)
+            if not asset
+            else sorted(
+                [
+                    g
+                    for g, data in self[CFG_GENOMES_KEY].items()
+                    if asset in data.get(CFG_ASSETS_KEY)
+                ],
+                key=order,
+            )
+        )
+
+    def get_local_data_str(self, genome=None, order=None):
+        """
+        List locally available reference genome IDs and assets by ID.
+        :param list[str] | str genome: genomes that the assets should be found for
+        :param function(str) -> object order: how to key genome IDs and asset
+            names for sort
+        :return str, str: text reps of locally available genomes and assets
+        """
+        exceptions = []
+        if genome is not None:
+            if isinstance(genome, str):
+                genome = [genome]
+            for g in genome:
+                try:
+                    _assert_gat_exists(self[CFG_GENOMES_KEY], g)
+                except MissingGenomeError as e:
+                    exceptions.append(e)
+            if exceptions:
+                raise MissingGenomeError(", ".join(map(str, exceptions)))
+        genomes_str = (
+            self.genomes_str(order=order)
+            if genome is None
+            else ", ".join(
+                _select_genomes(sorted(self[CFG_GENOMES_KEY].keys(), key=order), genome)
+            )
+        )
+        return genomes_str, self.assets_str(genome=genome, order=order)
+
+    def get_remote_data_str(
+        self,
+        genome=None,
+        order=None,
+        get_url=lambda server, id: construct_request_url(server, id),
+    ):
+        """
+        List genomes and assets available remotely.
+ :param function(refgenconf.RefGenConf) -> str get_url: how to determine + URL request, given RefGenConf instance + :param list[str] | str genome: genomes that the assets should be found for + :param function(str) -> object order: how to key genome IDs and asset + names for sort + :return str, str: text reps of remotely available genomes and assets + """ + warnings.warn( + "Please use listr method instead; get_remote_data_str will be " + "removed in the next release.", + category=DeprecationWarning, + ) + return self.listr(genome, order, get_url) + + def listr( + self, + genome=None, + order=None, + get_url=lambda server, id: construct_request_url(server, id), + as_str=False, + ): + """ + List genomes and assets available remotely on all servers the object + subscribes to + :param function(refgenconf.RefGenConf) -> str get_url: how to determine + URL request, given RefGenConf instance + :param list[str] | str genome: genomes that the assets should be found for + :param function(str) -> object order: how to key genome IDs and asset + names for sort + :return dict[OrderedDict[list]]: remotely available genomes and assets + keyed by genome keyed by source server endpoint + """ + data_by_server = {} + for url in self[CFG_SERVERS_KEY]: + url = get_url(url, API_ID_ASSETS) + data_by_server[url] = _list_remote(url, genome, order, as_str=as_str) + return data_by_server + + def tag(self, genome, asset, tag, new_tag, files=True): + """ + Retags the asset selected by the tag with the new_tag. + Prompts if default already exists and overrides upon confirmation. + This method does not override the original asset entry in the RefGenConf + object. It creates its copy and tags it with the new_tag. + Additionally, if the retagged asset has any children their parent will + be retagged as new_tag that was introduced upon this method execution. 
+        By default, the files on disk will also be renamed to reflect the
+        genome configuration file changes
+        :param str genome: name of a reference genome assembly of interest
+        :param str asset: name of particular asset of interest
+        :param str tag: name of the tag that identifies the asset of interest
+        :param str new_tag: name of the new tag to assign
+        :param bool files: whether the asset files on disk should be renamed
+        :raise ValueError: when the original tag is not specified
+        :return bool: a logical indicating whether the tagging was successful
+        """
+        self.run_plugins(PRE_TAG_HOOK)
+        ori_path = self.seek(genome, asset, tag, enclosing_dir=True, strict_exists=True)
+        new_path = os.path.abspath(os.path.join(ori_path, os.pardir, new_tag))
+        if self.file_path:
+            with self as r:
+                if not r.cfg_tag_asset(genome, asset, tag, new_tag):
+                    sys.exit(0)
+        else:
+            if not self.cfg_tag_asset(genome, asset, tag, new_tag):
+                sys.exit(0)
+        if not files:
+            self.run_plugins(POST_TAG_HOOK)
+            return
+        try:
+            if os.path.exists(new_path):
+                _remove(new_path)
+            os.rename(ori_path, new_path)
+        except FileNotFoundError:
+            _LOGGER.warning(
+                "Could not rename original asset tag directory '{}'"
+                " to the new one '{}'".format(ori_path, new_path)
+            )
+        else:
+            if self.file_path:
+                with self as r:
+                    r.cfg_remove_assets(genome, asset, tag, relationships=False)
+            else:
+                self.cfg_remove_assets(genome, asset, tag, relationships=False)
+            _LOGGER.debug(
+                "Asset '{}/{}' tagged with '{}' has been removed from"
+                " the genome config".format(genome, asset, tag)
+            )
+            _LOGGER.debug(
+                "Original asset has been moved from '{}' to '{}'".format(
+                    ori_path, new_path
+                )
+            )
+        self.run_plugins(POST_TAG_HOOK)
+
+    def cfg_tag_asset(self, genome, asset, tag, new_tag):
+        """
+        Retags the asset selected by the tag with the new_tag.
+        Prompts if default already exists and overrides upon confirmation.
+        This method does not override the original asset entry in the RefGenConf object. It creates its copy and tags
+        it with the new_tag.
+        Additionally, if the retagged asset has any children, their parent references
+        will be updated to point at the new_tag introduced by this method.
+        :param str genome: name of a reference genome assembly of interest
+        :param str asset: name of particular asset of interest
+        :param str tag: name of the tag that identifies the asset of interest
+        :param str new_tag: name of the new tag to assign
+        :raise ValueError: when the original tag is not specified
+        :return bool: a logical indicating whether the tagging was successful
+        """
+        _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag)
+        asset_mapping = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset]
+        if tag is None:
+            raise ValueError(
+                "You must explicitly specify the tag of the asset "
+                "you want to reassign. Currently defined "
Currently defined " + "tags for '{}/{}' are: {}".format( + genome, asset, ", ".join(get_asset_tags(asset_mapping)) + ) + ) + if new_tag in asset_mapping[CFG_ASSET_TAGS_KEY]: + if not query_yes_no( + "You already have a '{}' asset tagged as '{}', do you wish to override?".format( + asset, new_tag + ) + ): + _LOGGER.info("Tag action aborted by the user") + return + children = [] + parents = [] + if CFG_ASSET_CHILDREN_KEY in asset_mapping[CFG_ASSET_TAGS_KEY][tag]: + children = asset_mapping[CFG_ASSET_TAGS_KEY][tag][CFG_ASSET_CHILDREN_KEY] + if CFG_ASSET_PARENTS_KEY in asset_mapping[CFG_ASSET_TAGS_KEY][tag]: + parents = asset_mapping[CFG_ASSET_TAGS_KEY][tag][CFG_ASSET_PARENTS_KEY] + if len(children) > 0 or len(parents) > 0: + if not query_yes_no( + "The asset '{}/{}:{}' has {} children and {} parents. Refgenie will update the " + "relationship data. Do you want to proceed?".format( + genome, asset, tag, len(children), len(parents) + ) + ): + _LOGGER.info("Tag action aborted by the user") + return False + # updates children's parents + self._update_relatives_tags( + genome, asset, tag, new_tag, children, update_children=False + ) + # updates parents' children + self._update_relatives_tags( + genome, asset, tag, new_tag, parents, update_children=True + ) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][ + new_tag + ] = asset_mapping[CFG_ASSET_TAGS_KEY][tag] + if ( + CFG_ASSET_DEFAULT_TAG_KEY in asset_mapping + and asset_mapping[CFG_ASSET_DEFAULT_TAG_KEY] == tag + ): + self.set_default_pointer(genome, asset, new_tag, force=True) + self.cfg_remove_assets(genome, asset, tag) + return True + + def _update_relatives_tags( + self, genome, asset, tag, new_tag, relatives, update_children + ): + """ + Internal method used for tags updating in the 'asset_parents' section in the list of children. + :param str genome: name of a reference genome assembly of interest + :param str asset: name of particular asset of interest + :param str tag: name of the tag that identifies the asset of interest + :param str new_tag: name of particular the new tag + :param list[str] relatives: relatives to be updated. Format: ["asset_name:tag", "asset_name1:tag1"] + :param bool update_children: whether the children of the selected relatives should be updated. 
+ """ + relative_key = ( + CFG_ASSET_CHILDREN_KEY if update_children else CFG_ASSET_PARENTS_KEY + ) + for r in relatives: + _LOGGER.debug( + "updating {} in '{}'".format( + "children" if update_children else "parents", r + ) + ) + r_data = prp(r) + try: + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][ + CFG_ASSET_TAGS_KEY + ][r_data["tag"]] + except KeyError: + _LOGGER.warning( + "The {} asset of '{}/{}' does not exist: {}".format( + "parent" if update_children else "child", genome, asset, r + ) + ) + continue + updated_relatives = [] + if ( + relative_key + in self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][ + CFG_ASSET_TAGS_KEY + ][r_data["tag"]] + ): + relatives = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][ + r_data["item"] + ][CFG_ASSET_TAGS_KEY][r_data["tag"]][relative_key] + for relative in relatives: + ori_relative_data = prp(relative) + if ( + ori_relative_data["item"] == asset + and ori_relative_data["tag"] == tag + ): + ori_relative_data["tag"] = new_tag + updated_relatives.append( + "{}/{}:{}".format(genome, asset, new_tag) + ) + else: + updated_relatives.append( + "{}/{}:{}".format( + ori_relative_data["namespace"], + ori_relative_data["item"], + ori_relative_data["tag"], + ) + ) + self.update_relatives_assets( + genome, + r_data["item"], + r_data["tag"], + updated_relatives, + update_children, + ) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][r_data["item"]][ + CFG_ASSET_TAGS_KEY + ][r_data["tag"]][relative_key] = updated_relatives + + def pull( + self, + genome, + asset, + tag, + unpack=True, + force=None, + force_large=None, + size_cutoff=10, + get_json_url=lambda server, operation_id: construct_request_url( + server, operation_id + ), + build_signal_handler=_handle_sigint, + ): + """ + Download and possibly unpack one or more assets for a given ref gen. + :param str genome: name of a reference genome assembly of interest + :param str asset: name of particular asset to fetch + :param str tag: name of particular tag to fetch + :param bool unpack: whether to unpack a tarball + :param bool | NoneType force: how to handle case in which asset path + already exists; null for prompt (on a per-asset basis), False to + effectively auto-reply No to the prompt to replace existing file, + and True to auto-replay Yes for existing asset replacement. + :param bool | NoneType force_large: how to handle case in large (> 5GB) + asset is to be pulled; null for prompt (on a per-asset basis), False + to effectively auto-reply No to the prompt, + and True to auto-replay Yes + :param float size_cutoff: maximum archive file size to download with + no prompt + :param function(str, str) -> str get_json_url: how to build URL from + genome server URL base, genome, and asset + :param function(str) -> function build_signal_handler: how to create + a signal handler to use during the download; the single argument + to this function factory is the download filepath + :return (list[str], dict, str): a list of genome, asset, tag names + and a key-value pair with which genome config file should be updated + if pull succeeds, else asset key and a null value + :raise refgenconf.UnboundEnvironmentVariablesError: if genome folder + path contains any env. var. 
that's unbound + :raise refgenconf.RefGenConfError: if the object update is requested in + a non-writable state + """ + self.run_plugins(PRE_PULL_HOOK) + missing_vars = unbound_env_vars(self[CFG_FOLDER_KEY]) + if missing_vars: + raise UnboundEnvironmentVariablesError(", ".join(missing_vars)) + + def _null_return(): + self.run_plugins(POST_PULL_HOOK) + return gat, None, None + + def _raise_unpack_error(): + raise NotImplementedError( + "Option to not extract tarballs is not yet supported." + ) + + num_servers = 0 + bad_servers = [] + no_asset_json = [] + if CFG_SERVERS_KEY not in self or self[CFG_SERVERS_KEY] is None: + _LOGGER.error("You are not subscribed to any asset servers") + return _null_return() + for server_url in self[CFG_SERVERS_KEY]: + num_servers += 1 + try: + determined_tag = ( + _download_json( + get_json_url(server_url, API_ID_DEFAULT_TAG).format( + genome=genome, asset=asset + ) + ) + if tag is None + else tag + ) + except DownloadJsonError: + _LOGGER.warning("Could not retrieve JSON from: {}".format(server_url)) + bad_servers.append(server_url) + continue + else: + determined_tag = str(determined_tag) + _LOGGER.debug("Determined tag: {}".format(determined_tag)) + unpack or _raise_unpack_error() + gat = [genome, asset, determined_tag] + url_asset_attrs = get_json_url(server_url, API_ID_ASSET_ATTRS).format( + genome=genome, asset=asset + ) + url_genome_attrs = get_json_url(server_url, API_ID_GENOME_ATTRS).format( + genome=genome + ) + url_archive = get_json_url(server_url, API_ID_ARCHIVE).format( + genome=genome, asset=asset + ) + + try: + archive_data = _download_json( + url_asset_attrs, params={"tag": determined_tag} + ) + except DownloadJsonError: + no_asset_json.append(server_url) + if num_servers == len(self[CFG_SERVERS_KEY]): + _LOGGER.error( + "Asset '{}/{}:{}' not available on any of the following servers: {}".format( + genome, + asset, + determined_tag, + ", ".join(self[CFG_SERVERS_KEY]), + ) + ) + return _null_return() + continue + else: + _LOGGER.debug("Determined server URL: {}".format(server_url)) + genome_archive_data = _download_json(url_genome_attrs) + + if sys.version_info[0] == 2: + archive_data = asciify_json_dict(archive_data) + + # local directory that the asset data will be stored in + tag_dir = os.path.dirname(self.filepath(*gat)) + # local directory the downloaded archive will be temporarily saved in + genome_dir_path = os.path.join(self[CFG_FOLDER_KEY], genome) + # local path to the temporarily saved archive + filepath = os.path.join( + genome_dir_path, asset + "__" + determined_tag + ".tgz" + ) + # check if the genome/asset:tag exists and get request user decision + if os.path.exists(tag_dir): + + def preserve(): + _LOGGER.info("Preserving existing: {}".format(tag_dir)) + return _null_return() + + if force is False: + return preserve() + elif force is None: + if not query_yes_no("Replace existing ({})?".format(tag_dir), "no"): + return preserve() + else: + _LOGGER.debug("Overwriting: {}".format(tag_dir)) + else: + _LOGGER.debug("Overwriting: {}".format(tag_dir)) + + # check asset digests local-server match for each parent + [ + self._chk_digest_if_avail(genome, x, server_url) + for x in archive_data[CFG_ASSET_PARENTS_KEY] + if CFG_ASSET_PARENTS_KEY in archive_data + ] + + bundle_name = "{}/{}:{}".format(*gat) + archsize = archive_data[CFG_ARCHIVE_SIZE_KEY] + _LOGGER.debug("'{}' archive size: {}".format(bundle_name, archsize)) + + if not force_large and _is_large_archive(archsize, size_cutoff): + if force_large is False: + _LOGGER.info( + "Skipping 
pull of {}/{}:{}; size: {}".format(*gat, archsize) + ) + return _null_return() + if not query_yes_no( + "This archive exceeds the size cutoff ({} > {:.1f}GB) " + "Do you want to proceed?".format(archsize, size_cutoff) + ): + _LOGGER.info( + "Skipping pull of {}/{}:{}; size: {}".format(*gat, archsize) + ) + return _null_return() + + if not os.path.exists(genome_dir_path): + _LOGGER.debug("Creating directory: {}".format(genome_dir_path)) + os.makedirs(genome_dir_path) + + # Download the file from `url` and save it locally under `filepath`: + _LOGGER.info("Downloading URL: {}".format(url_archive)) + try: + signal.signal(signal.SIGINT, build_signal_handler(filepath)) + _download_url_progress( + url_archive, filepath, bundle_name, params={"tag": determined_tag} + ) + except HTTPError: + _LOGGER.error( + "Asset archive '{}/{}:{}' is missing on the server: {s}".format( + *gat, s=server_url + ) + ) + if server_url == self[CFG_SERVERS_KEY][-1]: + # it this was the last server on the list, return + return _null_return() + else: + _LOGGER.info("Trying next server") + # set the tag value back to what user requested + determined_tag = tag + continue + except ConnectionRefusedError as e: + _LOGGER.error(str(e)) + _LOGGER.error( + "Server {}/{} refused download. " + "Check your internet settings".format(server_url, API_VERSION_2) + ) + return _null_return() + except ContentTooShortError as e: + _LOGGER.error(str(e)) + _LOGGER.error("'{}' download incomplete".format(bundle_name)) + return _null_return() + else: + _LOGGER.info("Download complete: {}".format(filepath)) + + new_checksum = checksum(filepath) + old_checksum = archive_data and archive_data.get(CFG_ARCHIVE_CHECKSUM_KEY) + if old_checksum and new_checksum != old_checksum: + _LOGGER.error( + "Downloaded archive ('{}') checksum mismatch: ({}, {})".format( + filepath, new_checksum, old_checksum + ) + ) + return _null_return() + else: + _LOGGER.debug("Matched checksum: '{}'".format(old_checksum)) + # successfully downloaded and moved tarball; untar it + if unpack and filepath.endswith(".tgz"): + _LOGGER.info( + "Extracting asset tarball and saving to: {}".format(tag_dir) + ) + with TemporaryDirectory(dir=genome_dir_path) as tmpdir: + # here we suspect the unarchived asset to be an asset-named + # directory with the asset data inside and we transfer it + # to the tag-named subdirectory + untar(filepath, tmpdir) + if os.path.isdir(tag_dir): + shutil.rmtree(tag_dir) + _LOGGER.info("Removed existing directory: {}".format(tag_dir)) + shutil.move(os.path.join(tmpdir, asset), tag_dir) + if os.path.isfile(filepath): + os.remove(filepath) + + if self.file_path: + with self as rgc: + [ + rgc.chk_digest_update_child( + gat[0], x, "{}/{}:{}".format(*gat), server_url + ) + for x in archive_data[CFG_ASSET_PARENTS_KEY] + if CFG_ASSET_PARENTS_KEY in archive_data + ] + rgc.update_tags( + *gat, + data={ + attr: archive_data[attr] + for attr in ATTRS_COPY_PULL + if attr in archive_data + } + ) + rgc.set_default_pointer(*gat) + rgc.update_genomes(genome=genome, data=genome_archive_data) + else: + [ + self.chk_digest_update_child( + gat[0], x, "{}/{}:{}".format(*gat), server_url + ) + for x in archive_data[CFG_ASSET_PARENTS_KEY] + if CFG_ASSET_PARENTS_KEY in archive_data + ] + self.update_tags( + *gat, + data={ + attr: archive_data[attr] + for attr in ATTRS_COPY_PULL + if attr in archive_data + } + ) + self.set_default_pointer(*gat) + self.update_genomes(genome=genome, data=genome_archive_data) + self.run_plugins(POST_PULL_HOOK) + return gat, archive_data, server_url + + 
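+    # A hypothetical usage sketch for pull(); the config path, genome, and
+    # asset names below are placeholders, not shipped defaults:
+    #
+    #     rgc = RefGenConf("genome_config.yaml")  # or a compatible config object
+    #     gat, archive_data, server = rgc.pull("hg38", "fasta", tag=None)
+    #     # on success, gat is [genome, asset, tag] and archive_data/server
+    #     # identify the source; on failure, the last two values are None
+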
def remove_asset_from_relatives(self, genome, asset, tag): + """ + Remove any relationship links associated with the selected asset + :param str genome: genome to be removed from its relatives' relatives list + :param str asset: asset to be removed from its relatives' relatives list + :param str tag: tag to be removed from its relatives' relatives list + """ + to_remove = "{}/{}:{}".format(genome, asset, tag) + for rel_type in CFG_ASSET_RELATIVES_KEYS: + tmp = CFG_ASSET_RELATIVES_KEYS[ + len(CFG_ASSET_RELATIVES_KEYS) + - 1 + - CFG_ASSET_RELATIVES_KEYS.index(rel_type) + ] + tag_data = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag] + if rel_type not in tag_data: + continue + for rel in tag_data[rel_type]: + parsed = prp(rel) + _LOGGER.debug("Removing '{}' from '{}' {}".format(to_remove, rel, tmp)) + try: + self[CFG_GENOMES_KEY][parsed["namespace"] or genome][ + CFG_ASSETS_KEY + ][parsed["item"]][CFG_ASSET_TAGS_KEY][parsed["tag"]][tmp].remove( + to_remove + ) + except (KeyError, ValueError): + pass + + def update_relatives_assets( + self, genome, asset, tag=None, data=None, children=False + ): + """ + A convenience method which wraps the update assets and uses it to update the asset relatives of an asset. + :param str genome: genome to be added/updated + :param str asset: asset to be added/updated + :param str tag: tag to be added/updated + :param list data: asset parents to be added/updated + :param bool children: a logical indicating whether the relationship to be added is 'children' + :return RefGenConf: updated object + """ + tag = tag or self.get_default_tag(genome, asset) + relationship = CFG_ASSET_CHILDREN_KEY if children else CFG_ASSET_PARENTS_KEY + if _check_insert_data(data, list, "data"): + # creates/asserts the genome/asset:tag combination + self.update_tags(genome, asset, tag) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][ + tag + ].setdefault(relationship, list()) + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][CFG_ASSET_TAGS_KEY][ + tag + ][relationship] = _extend_unique( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag][relationship], + data, + ) + + def update_seek_keys(self, genome, asset, tag=None, keys=None): + """ + A convenience method which wraps the updated assets and uses it to + update the seek keys for a tagged asset. + :param str genome: genome to be added/updated + :param str asset: asset to be added/updated + :param str tag: tag to be added/updated + :param Mapping keys: seek_keys to be added/updated + :return RefGenConf: updated object + """ + tag = tag or self.get_default_tag(genome, asset) + if _check_insert_data(keys, Mapping, "keys"): + self.update_tags(genome, asset, tag) + asset = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] + _safe_setdef(asset[CFG_ASSET_TAGS_KEY][tag], CFG_SEEK_KEYS_KEY, PXAM()) + asset[CFG_ASSET_TAGS_KEY][tag][CFG_SEEK_KEYS_KEY].update(keys) + return self + + def update_tags(self, genome, asset=None, tag=None, data=None): + """ + Updates the genomes in RefGenConf object at any level. 
+ If a requested genome-asset-tag mapping is missing, it will be created + :param str genome: genome to be added/updated + :param str asset: asset to be added/updated + :param str tag: tag to be added/updated + :param Mapping data: data to be added/updated + :return RefGenConf: updated object + """ + if _check_insert_data(genome, str, "genome"): + _safe_setdef(self[CFG_GENOMES_KEY], genome, PXAM()) + if _check_insert_data(asset, str, "asset"): + _safe_setdef(self[CFG_GENOMES_KEY][genome], CFG_ASSETS_KEY, PXAM()) + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset, PXAM() + ) + if _check_insert_data(tag, str, "tag"): + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], + CFG_ASSET_TAGS_KEY, + PXAM(), + ) + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ], + tag, + PXAM(), + ) + if _check_insert_data(data, Mapping, "data"): + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag].update(data) + return self + + def update_assets(self, genome, asset=None, data=None): + """ + Updates the genomes in RefGenConf object at any level. + If a requested genome-asset mapping is missing, it will be created + :param str genome: genome to be added/updated + :param str asset: asset to be added/updated + :param Mapping data: data to be added/updated + :return RefGenConf: updated object + """ + if _check_insert_data(genome, str, "genome"): + _safe_setdef(self[CFG_GENOMES_KEY], genome, PXAM()) + if _check_insert_data(asset, str, "asset"): + _safe_setdef(self[CFG_GENOMES_KEY][genome], CFG_ASSETS_KEY, PXAM()) + _safe_setdef( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset, PXAM() + ) + if _check_insert_data(data, Mapping, "data"): + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset].update(data) + return self + + def remove( + self, genome, asset, tag=None, relationships=True, files=True, force=False + ): + """ + Remove data associated with a specified genome:asset:tag combination. + If no tags are specified, the entire asset is removed from the genome. 
+ If no more tags are defined for the selected genome:asset after tag removal, + the parent asset will be removed as well + If no more assets are defined for the selected genome after asset removal, + the parent genome will be removed as well + :param str genome: genome to be removed + :param str asset: asset package to be removed + :param str tag: tag to be removed + :param bool relationships: whether the asset being removed should + be removed from its relatives as well + :param bool files: whether the asset files from disk should be removed + :param bool force: whether the removal prompts should be skipped + :raise TypeError: if genome argument type is not a list or str + :return RefGenConf: updated object + """ + tag = tag or self.get_default_tag(genome, asset, use_existing=False) + if files: + req_dict = {"genome": genome, "asset": asset, "tag": tag} + _LOGGER.debug("Attempting removal: {}".format(req_dict)) + if not force and not query_yes_no( + "Remove '{genome}/{asset}:{tag}'?".format(**req_dict) + ): + _LOGGER.info("Action aborted by the user") + return + removed = [] + asset_path = self.seek( + genome, asset, tag, enclosing_dir=True, strict_exists=False + ) + if os.path.exists(asset_path): + removed.append(_remove(asset_path)) + if self.file_path: + with self as r: + r.cfg_remove_assets(genome, asset, tag, relationships) + else: + self.cfg_remove_assets(genome, asset, tag, relationships) + else: + _LOGGER.warning( + "Selected asset does not exist on disk ({}). " + "Removing from genome config.".format(asset_path) + ) + if self.file_path: + with self as r: + r.cfg_remove_assets(genome, asset, tag, relationships) + return + else: + self.cfg_remove_assets(genome, asset, tag, relationships) + return + try: + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] + except (KeyError, TypeError): + asset_dir = os.path.abspath(os.path.join(asset_path, os.path.pardir)) + _entity_dir_removal_log(asset_dir, "asset", req_dict, removed) + try: + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY] + except (KeyError, TypeError): + genome_dir = os.path.abspath( + os.path.join(asset_dir, os.path.pardir) + ) + _entity_dir_removal_log(genome_dir, "genome", req_dict, removed) + try: + if self.file_path: + with self as r: + del r[CFG_GENOMES_KEY][genome] + else: + del self[CFG_GENOMES_KEY][genome] + except (KeyError, TypeError): + _LOGGER.debug( + "Could not remove genome '{}' from the config; it " + "does not exist".format(genome) + ) + _LOGGER.info( + "Successfully removed entities:\n- {}".format("\n- ".join(removed)) + ) + else: + if self.file_path: + with self as r: + r.cfg_remove_assets(genome, asset, tag, relationships) + else: + self.cfg_remove_assets(genome, asset, tag, relationships) + + def cfg_remove_assets(self, genome, asset, tag=None, relationships=True): + """ + Remove data associated with a specified genome:asset:tag combination. + If no tags are specified, the entire asset is removed from the genome. 
+ If no more tags are defined for the selected genome:asset after tag removal, + the parent asset will be removed as well + If no more assets are defined for the selected genome after asset removal, + the parent genome will be removed as well + :param str genome: genome to be removed + :param str asset: asset package to be removed + :param str tag: tag to be removed + :param bool relationships: whether the asset being removed should + be removed from its relatives as well + :raise TypeError: if genome argument type is not a list or str + :return RefGenConf: updated object + """ + + def _del_if_empty(obj, attr, alt=None): + """ + Internal function for Mapping attribute deleting. + Check if attribute exists and delete it if its length is zero. + :param Mapping obj: an object to check + :param str attr: Mapping attribute of interest + :param list[Mapping, str] alt: a list of length 2 that indicates alternative + Mapping-attribute combination to remove + """ + if attr in obj and len(obj[attr]) == 0: + if alt is None: + del obj[attr] + else: + if alt[1] in alt[0]: + del alt[0][alt[1]] + + tag = tag or self.get_default_tag(genome, asset) + if _check_insert_data(genome, str, "genome"): + if _check_insert_data(asset, str, "asset"): + if _check_insert_data(tag, str, "tag"): + if relationships: + self.remove_asset_from_relatives(genome, asset, tag) + del self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag] + _del_if_empty( + self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset], + CFG_ASSET_TAGS_KEY, + [self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset], + ) + _del_if_empty(self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY], asset) + _del_if_empty( + self[CFG_GENOMES_KEY][genome], + CFG_ASSETS_KEY, + [self[CFG_GENOMES_KEY], genome], + ) + _del_if_empty(self[CFG_GENOMES_KEY], genome) + try: + default_tag = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][ + asset + ][CFG_ASSET_DEFAULT_TAG_KEY] + except KeyError: + pass + else: + if default_tag == tag: + del self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_DEFAULT_TAG_KEY + ] + if len(self[CFG_GENOMES_KEY]) == 0: + self[CFG_GENOMES_KEY] = None + return self + + def update_genomes(self, genome, data=None): + """ + Updates the genomes in RefGenConf object at any level. + If a requested genome is missing, it will be added + :param str genome: genome to be added/updated + :param Mapping data: data to be added/updated + :return RefGenConf: updated object + """ + if _check_insert_data(genome, str, "genome"): + _safe_setdef(self[CFG_GENOMES_KEY], genome, PXAM({CFG_ASSETS_KEY: PXAM()})) + if _check_insert_data(data, Mapping, "data"): + self[CFG_GENOMES_KEY][genome].update(data) + return self + + def _update_genome_servers(self, url, reset=False): + """ + Update the list of genome_servers. + Use reset argument to overwrite the current list. Otherwise the current one will be appended to. + :param list[str] | str url: url(s) to update the genome_servers list with + :param bool reset: whether the current list should be overwritten + """ + urls = _make_list_of_str(url) + if CFG_SERVERS_KEY in self: + if reset: + self[CFG_SERVERS_KEY] = _extend_unique([], urls) + else: + self[CFG_SERVERS_KEY] = _extend_unique(self[CFG_SERVERS_KEY], urls) + else: + raise GenomeConfigFormatError( + "The '{}' is missing. Can't update the server list".format( + CFG_SERVERS_KEY + ) + ) + + def subscribe(self, urls, reset=False): + """ + Add URLs the list of genome_servers. + Use reset argument to overwrite the current list. 
+        Otherwise the current one will be appended to.
+        :param list[str] | str urls: urls to update the genome_servers list with
+        :param bool reset: whether the current list should be overwritten
+        """
+        if self.file_path:
+            with self as r:
+                r._update_genome_servers(url=urls, reset=reset)
+        else:
+            self._update_genome_servers(url=urls, reset=reset)
+        _LOGGER.info("Subscribed to: {}".format(", ".join(urls)))
+
+    def unsubscribe(self, urls):
+        """
+        Remove URLs from the list of genome_servers.
+        :param list[str] | str urls: urls to remove from the genome_servers list
+        """
+        unsub_list = []
+        ori_servers = self[CFG_SERVERS_KEY]
+        for s in urls:
+            try:
+                ori_servers.remove(s)
+                unsub_list.append(s)
+            except ValueError:
+                _LOGGER.warning(
+                    "URL '{}' not in genome_servers list: {}".format(s, ori_servers)
+                )
+        if self.file_path:
+            with self as r:
+                r._update_genome_servers(ori_servers, reset=True)
+        else:
+            self._update_genome_servers(ori_servers, reset=True)
+        if unsub_list:
+            _LOGGER.info("Unsubscribed from: {}".format(", ".join(unsub_list)))
+
+    def getseq(self, genome, locus, as_str=False):
+        """
+        Return the sequence found in a selected range and chromosome.
+        Something like the refget protocol.
+        :param str genome: name of the reference genome
+        :param str locus: coordinates of desired sequence, e.g. 'chr1:1-10'
+        :param bool as_str: whether to convert the returned object to string
+            and return just the sequence
+        :return str | pyfaidx.FastaRecord | pyfaidx.Sequence: selected sequence
+        """
+        import pyfaidx
+
+        fa = pyfaidx.Fasta(self.seek(genome, "fasta", strict_exists=True))
+        locus_split = locus.split(":")
+        # avoid shadowing the built-in 'chr'
+        chrom = fa[locus_split[0]]
+        if len(locus_split) == 1:
+            return str(chrom) if as_str else chrom
+        start, end = locus_split[1].split("-")
+        _LOGGER.debug(
+            "chr: '{}', start: '{}', end: '{}'".format(locus_split[0], start, end)
+        )
+        return str(chrom[int(start) : int(end)]) if as_str else chrom[int(start) : int(end)]
+
+    def get_genome_attributes(self, genome):
+        """
+        Get the dictionary attributes, like checksum, contents, description.
+        Does not return the assets.
+        :param str genome: genome to get the attributes dict for
+        :return Mapping[str, str]: available genome attributes
+        """
+        return {
+            k: self[CFG_GENOMES_KEY][genome][k]
+            for k in CFG_GENOME_ATTRS_KEYS
+            if k in self[CFG_GENOMES_KEY][genome]
+        }
+
+    def is_asset_complete(self, genome, asset, tag):
+        """
+        Check whether all required tag attributes are defined in the RefGenConf object.
+        This is the way we determine tag completeness.
+        :param str genome: genome to be checked
+        :param str asset: asset package to be checked
+        :param str tag: tag to be checked
+        :return bool: the decision
+        """
+        tag_data = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][
+            CFG_ASSET_TAGS_KEY
+        ][tag]
+        return all([r in tag_data for r in REQ_TAG_ATTRS])
+
+    def _invert_genomes(self, order=None):
+        """Map each asset type/kind/name to a collection of assemblies.
+        A configuration file encodes assets by genome, but in some use cases
+        it's helpful to invert the direction of this mapping. The value of the
+        asset key/name may differ by genome, so that information is
+        necessarily lost in this inversion, but we can collect genome IDs by
+        asset ID.
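+        Illustrative shape of the result (genome and asset names are placeholders):
+        OrderedDict([('bowtie2_index', ['hg38']), ('fasta', ['hg38', 'mm10'])])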
+ :param function(str) -> object order: how to key genome IDs and asset + names for sort + :return OrderedDict[str, Iterable[str]] binding between asset kind/key/name + and collection of reference genome assembly names for which the + asset type is available + """ + genomes = {} + for g, am in self[CFG_GENOMES_KEY].items(): + for a in am[CFG_ASSETS_KEY].keys(): + genomes.setdefault(a, []).append(g) + assets = sorted(genomes.keys(), key=order) + return OrderedDict([(a, sorted(genomes[a], key=order)) for a in assets]) + + def _chk_digest_if_avail(self, genome, remote_asset_name, server_url): + """ + Check local asset digest against the remote one and populate children of the asset with the provided asset:tag. + In case the local asset does not exist, the config is populated with the remote asset digest and children data + :param str genome: name of the genome to check the asset digests for + :param str remote_asset_name: asset and tag names, formatted like: asset:tag + :param str server_url: addres of the server to query for the digests + :raise RefgenconfError: if the local digest does not match its remote counterpart + """ + remote_asset_data = prp(remote_asset_name) + asset = remote_asset_data["item"] + tag = remote_asset_data["tag"] + asset_digest_url = construct_request_url(server_url, API_ID_DIGEST).format( + genome=genome, asset=asset, tag=tag + ) + try: + remote_digest = _download_json(asset_digest_url) + except DownloadJsonError: + _LOGGER.warning( + "Parent asset ({}/{}:{}) not found on the server. The asset provenance was not verified.".format( + genome, asset, tag + ) + ) + return + try: + local_digest = self.id(genome, asset, tag) + if remote_digest != local_digest: + raise RemoteDigestMismatchError(asset, local_digest, remote_digest) + except RefgenconfError: + _LOGGER.debug( + "Could not find '{}/{}:{}' digest. Digest for this parent will be populated " + "with the server one after the pull".format(genome, asset, tag) + ) + return + + def chk_digest_update_child( + self, genome, remote_asset_name, child_name, server_url + ): + """ + Check local asset digest against the remote one and populate children of the asset with the provided asset:tag. + In case the local asset does not exist, the config is populated with the remote asset digest and children data + :param str genome: name of the genome to check the asset digests for + :param str remote_asset_name: asset and tag names, formatted like: asset:tag + :param str child_name: name to be appended to the children of the parent + :param str server_url: address of the server to query for the digests + :raise RefgenconfError: if the local digest does not match its remote counterpart + """ + remote_asset_data = prp(remote_asset_name) + asset = remote_asset_data["item"] + tag = remote_asset_data["tag"] + asset_digest_url = construct_request_url(server_url, API_ID_DIGEST).format( + genome=genome, asset=asset, tag=tag + ) + try: + remote_digest = _download_json(asset_digest_url) + except DownloadJsonError: + return + try: + # we need to allow for missing seek_keys section so that the digest is respected even from the previously + # populated 'incomplete asset' from the server + _assert_gat_exists( + self[CFG_GENOMES_KEY], + genome, + asset, + tag, + allow_incomplete=not self.is_asset_complete(genome, asset, tag), + ) + except (KeyError, MissingAssetError, MissingGenomeError, MissingSeekKeyError): + self.update_tags( + genome, asset, tag, {CFG_ASSET_CHECKSUM_KEY: remote_digest} + ) + _LOGGER.info( + "Could not find '{}/{}:{}' digest. 
Populating with server data".format( + genome, asset, tag + ) + ) + else: + local_digest = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset][ + CFG_ASSET_TAGS_KEY + ][tag][CFG_ASSET_CHECKSUM_KEY] + if remote_digest != local_digest: + raise RemoteDigestMismatchError(asset, local_digest, remote_digest) + finally: + self.update_relatives_assets( + genome, asset, tag, [child_name], children=True + ) + + def id(self, genome, asset, tag=None): + """ + Returns the digest for the specified asset. + The defined default tag will be used if not provided as an argument + :param str genome: genome identifier + :param str asset: asset identifier + :param str tag: tag identifier + :return str: asset digest for the tag + """ + _assert_gat_exists(self[CFG_GENOMES_KEY], genome, asset, tag) + tag = tag or self.get_default_tag(genome, asset) + a = self[CFG_GENOMES_KEY][genome][CFG_ASSETS_KEY][asset] + if CFG_ASSET_CHECKSUM_KEY in a[CFG_ASSET_TAGS_KEY][tag]: + return a[CFG_ASSET_TAGS_KEY][tag][CFG_ASSET_CHECKSUM_KEY] + raise MissingConfigDataError( + "Digest does not exist for: {}/{}:{}".format(genome, asset, tag) + ) + + def run_plugins(self, hook): + """ + Runs all installed plugins for the specified hook. + :param str hook: hook identifier + """ + for name, func in self.plugins[hook].items(): + _LOGGER.debug("Running {} plugin: {}".format(hook, name)) + func(self) + + def write(self, filepath=None): + """ + Write the contents to a file. + If pre- and post-update plugins are defined, they will be executed automatically + :param str filepath: a file path to write to + :raise OSError: when the object has been created in a read only mode or other process has locked the file + :raise TypeError: when the filepath cannot be determined. + This takes place only if YacAttMap initialized with a Mapping as an input, not read from file. + :raise OSError: when the write is called on an object with no write capabilities + or when writing to a file that is locked by a different object + :return str: the path to the created files + """ + self.run_plugins(PRE_UPDATE_HOOK) + path = super(_RefGenConfV03, self).write(filepath=filepath) + self.run_plugins(POST_UPDATE_HOOK) + return path + + +class DownloadProgressBar(tqdm): + """ + from: https://github.com/tqdm/tqdm#hooks-and-callbacks + """ + + def update_to(self, b=1, bsize=1, tsize=None): + """ + Update the progress bar + :param int b: number of blocks transferred so far + :param int bsize: size of each block (in tqdm units) + :param int tsize: total size (in tqdm units) + """ + if tsize is not None: + self.total = tsize + self.update(b * bsize - self.n) + + +def _download_json(url, params=None): + """ + Safely connect to the provided API endpoint and download JSON data. + :param str url: server API endpoint + :param dict params: query parameters + :return dict: served data + """ + import requests + + _LOGGER.debug("Downloading JSON data; querying URL: '{}'".format(url)) + resp = requests.get(url, params=params) + if resp.ok: + return resp.json() + elif resp.status_code == 404: + resp = None + raise DownloadJsonError(resp) + + +def _download_url_progress(url, output_path, name, params=None): + """ + Download asset at given URL to given filepath, show progress along the way. 
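+    Illustrative call; the URL, output path, and display name are placeholders:
+    >>> _download_url_progress("http://example.org/archive/fasta", "/tmp/fasta__default.tgz", "hg38/fasta:default", params={"tag": "default"})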
+ :param str url: server API endpoint + :param str output_path: path to file to save download + :param str name: name to display in front of the progress bar + :param dict params: query parameters to be added to the request + """ + url = url if params is None else url + "?{}".format(urllib.parse.urlencode(params)) + with DownloadProgressBar( + unit_scale=True, desc=name, unit="B", bar_format=CUSTOM_BAR_FMT, leave=False + ) as dpb: + urllib.request.urlretrieve(url, filename=output_path, reporthook=dpb.update_to) + + +def _genome_asset_path( + genomes, gname, aname, tname, seek_key, enclosing_dir, no_tag=False +): + """ + Retrieve the raw path value for a particular asset for a particular genome. + :param Mapping[str, Mapping[str, Mapping[str, object]]] genomes: nested + collection of key-value pairs, keyed at top level on genome ID, then by + asset name, then by asset attribute + :param str gname: top level key to query -- genome ID, e.g. mm10 + :param str aname: second-level key to query -- asset name, e.g. fasta + :param str tname: third-level key to query -- tag name, e.g. default + :param str seek_key: fourth-level key to query -- tag name, e.g. chrom_sizes + :param bool enclosing_dir: whether a path to the entire enclosing directory should be returned, e.g. + for a fasta asset that has 3 seek_keys pointing to 3 files in an asset dir, that asset dir is returned + :return str: raw path value for a particular asset for a particular genome + :raise MissingGenomeError: if the given key-value pair collection does not + contain as a top-level key the given genome ID + :raise MissingAssetError: if the given key-value pair collection does + contain the given genome ID, but that key's mapping doesn't contain + the given asset name as a key + :raise GenomeConfigFormatError: if it's discovered during the query that + the structure of the given genomes mapping suggests that it was + parsed from an improperly formatted/structured genome config file. + """ + _assert_gat_exists(genomes, gname, aname, tname) + asset_tag_data = genomes[gname][CFG_ASSETS_KEY][aname][CFG_ASSET_TAGS_KEY][tname] + if enclosing_dir: + if no_tag: + return asset_tag_data[CFG_ASSET_PATH_KEY] + return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname) + if seek_key is None: + if aname in asset_tag_data[CFG_SEEK_KEYS_KEY]: + seek_key = aname + else: + if no_tag: + return asset_tag_data[CFG_ASSET_PATH_KEY] + return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname) + try: + seek_key_value = asset_tag_data[CFG_SEEK_KEYS_KEY][seek_key] + except KeyError: + raise MissingSeekKeyError( + "genome/asset:tag bundle '{}/{}:{}' exists, but seek_key '{}' is missing".format( + gname, aname, tname, seek_key + ) + ) + else: + if no_tag: + return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], seek_key_value) + return os.path.join(asset_tag_data[CFG_ASSET_PATH_KEY], tname, seek_key_value) + + +def _assert_gat_exists(genomes, gname, aname=None, tname=None, allow_incomplete=False): + """ + Make sure the genome/asset:tag combination exists in the provided mapping and has any seek keys defined. + Seek keys are required for the asset completeness. + :param Mapping[str, Mapping[str, Mapping[str, object]]] genomes: nested + collection of key-value pairs, keyed at top level on genome ID, then by + asset name, then by asset attribute + :param str gname: top level key to query -- genome ID, e.g. mm10 + :param str aname: second-level key to query -- asset name, e.g. fasta + :param str tname: third-level key to query -- tag name, e.g. 
default
+    :raise MissingGenomeError: if the given key-value pair collection does not
+        contain as a top-level key the given genome ID
+    :raise MissingAssetError: if the given key-value pair collection does
+        contain the given genome ID, but that key's mapping doesn't contain
+        the given asset name as a key
+    :raise GenomeConfigFormatError: if it's discovered during the query that
+        the structure of the given genomes mapping suggests that it was
+        parsed from an improperly formatted/structured genome config file.
+    """
+    _LOGGER.debug("checking existence of: {}/{}:{}".format(gname, aname, tname))
+    try:
+        genome = genomes[gname]
+    except KeyError:
+        raise MissingGenomeError("Your genomes do not include '{}'".format(gname))
+    if aname is not None:
+        try:
+            asset_data = genome[CFG_ASSETS_KEY][aname]
+        except KeyError:
+            raise MissingAssetError(
+                "Genome '{}' exists, but asset '{}' is missing".format(gname, aname)
+            )
+        except TypeError:
+            # 'asset_data' is never bound when the lookup itself fails, so
+            # report on the assets section instead
+            _raise_not_mapping(genome[CFG_ASSETS_KEY], "Asset section ")
+        if tname is not None:
+            try:
+                tag_data = asset_data[CFG_ASSET_TAGS_KEY][tname]
+            except KeyError:
+                raise MissingTagError(
+                    "genome/asset bundle '{}/{}' exists, but tag '{}' is missing".format(
+                        gname, aname, tname
+                    )
+                )
+            except TypeError:
+                _raise_not_mapping(asset_data, "Asset section ")
+            try:
+                tag_data[CFG_SEEK_KEYS_KEY]
+            except KeyError:
+                if not allow_incomplete:
+                    raise MissingSeekKeyError(
+                        "Asset incomplete. No seek keys are defined for '{}/{}:{}'. "
+                        "Build or pull the asset again.".format(gname, aname, tname)
+                    )
+
+
+def _is_large_archive(size, cutoff=10):
+    """
+    Determines if the file is large based on a string formatted as follows: 15.4GB
+    :param str size: size string
+    :param float cutoff: maximum size (in GB) that is not considered large
+    :return bool: the decision
+    """
+
+    def _str2float(x):
+        """
+        Remove any letters from the file size string and cast the remainder to float
+        """
+        return float("".join(c for c in x if c in "0123456789."))
+
+    _LOGGER.debug("Checking archive size: '{}'".format(size))
+    if size.endswith("MB"):
+        # convert to gigs
+        size = "{0:f}GB".format(_str2float(size) / 1000)
+    if size.endswith("KB"):
+        # convert to gigs
+        size = "{0:f}GB".format(_str2float(size) / 1000 ** 2)
+    return size.endswith("TB") or (size.endswith("GB") and _str2float(size) > cutoff)
+
+
+def _list_remote(url, genome, order=None, as_str=True):
+    """
+    List genomes and assets available remotely.
+    :param url: location of ref genome config data
+    :param list[str] | str genome: genomes that the assets should be found for
+    :param function(str) -> object order: how to key genome IDs and asset
+        names for sort
+    :param bool as_str: whether to return text representations rather than a dict
+    :return str, str: text reps of remotely available genomes and assets
+    """
+    genomes_data = _read_remote_data(url)
+    refgens = _select_genomes(
+        sorted(genomes_data.keys(), key=order), genome, strict=True
+    )
+    if not refgens:
+        # parenthesize: a bare 'None, None if as_str else dict()' always builds
+        # a tuple because of operator precedence
+        return (None, None) if as_str else dict()
+    filtered_genomes_data = OrderedDict(
+        [(rg, sorted(genomes_data[rg], key=order)) for rg in refgens]
+    )
+    if not as_str:
+        return filtered_genomes_data
+    asset_texts = [
+        "{}/ {}".format(g.rjust(20), ", ".join(a))
+        for g, a in filtered_genomes_data.items()
+    ]
+    return ", ".join(refgens), "\n".join(asset_texts)
+
+
+def _make_genome_assets_line(
+    gen,
+    assets,
+    offset_text=" ",
+    genome_assets_delim="/ ",
+    asset_sep=", ",
+    order=None,
+    asset_tag_delim=":",
+):
+    """
+    Build a line of text for display of assets by genome
+    :param str gen: reference assembly ID, e.g. hg38
+    :param Iterable[str] assets: collection of asset names for the given genome
+    :param str offset_text: prefix for the line, e.g.
a kind of whitespace + :param str genome_assets_delim: delimiter between a genome ID and text + showing names of assets for that genome + :param str asset_sep: delimiter between asset names + :param function(str) -> object order: how to key asset names for sort + :return str: text representation of a single assembly's name and assets + """ + tagged_assets = asset_sep.join( + sorted(_make_asset_tags_product(assets, asset_tag_delim), key=order) + ) + return "{}{}{}{}".format( + gen.rjust(20), genome_assets_delim, offset_text, tagged_assets + ) + + +def _make_asset_tags_product(assets, asset_tag_delim=":", asset_sk_delim="."): + """ + Make a product of assets and tags available in the provided mapping + :param Mapping assets: the assets for a selected genome + :param str asset_tag_delim: how to represent the asset-tag link + :param str asset_sk_delim: how to represent the asset-seek_key link + :return list: list representation of tagged assets + """ + tagged_assets = [] + for aname, asset in assets.items(): + for tname, tag in asset[CFG_ASSET_TAGS_KEY].items(): + sk_assets = [] + seek_keys = get_tag_seek_keys(tag) + # proceed only if asset is 'complete' -- has seek_keys + if seek_keys is not None: + # add seek_keys if exist and different from the asset name, otherwise just the asset name + sk_assets.extend( + [ + asset_sk_delim.join([aname, sk]) if sk != aname else aname + for sk in seek_keys + ] + ) + # add tags to the asset.seek_key list + tagged_assets.extend( + [asset_tag_delim.join(i) for i in itertools.product(sk_assets, [tname])] + ) + return tagged_assets + + +def _read_remote_data(url): + """ + Read as JSON data from a URL request response. + :param str url: data request + :return dict: JSON parsed from the response from given URL request + """ + with urllib.request.urlopen(url) as response: + encoding = response.info().get_content_charset("utf8") + return json.loads(response.read().decode(encoding)) + + +def _check_insert_data(obj, datatype, name): + """ Checks validity of an object """ + if obj is None: + return False + if not isinstance(obj, datatype): + raise TypeError( + "{} must be {}; got {}".format(name, datatype.__name__, type(obj).__name__) + ) + return True + + +def _make_list_of_str(arg): + """ + Convert a str to list of str or ensure a list is a list of str + :param list[str] | str arg: string or a list of strings to listify + :return list: list of strings + :raise TypeError: if a fault argument was provided + """ + + def _raise_faulty_arg(): + raise TypeError( + "Provided argument has to be a list[str] or a str, got '{}'".format( + arg.__class__.__name__ + ) + ) + + if isinstance(arg, str): + return [arg] + elif isinstance(arg, list): + if not all(isinstance(i, str) for i in arg): + _raise_faulty_arg() + else: + return arg + else: + _raise_faulty_arg() + + +def _extend_unique(l1, l2): + """ + Extend a list with no duplicates + :param list l1: original list + :param list l2: list with items to add + :return list: an extended list + """ + return l1 + list(set(l2) - set(l1)) + + +def _select_genomes(genomes, genome=None, strict=False): + """ + Safely select a subset of genomes + :param list[str] | str genome: genomes that the assets should be found for + :param bool strict: whether a non-existent genome should lead to a warning. 
+
+
+def _select_genomes(genomes, genome=None, strict=False):
+    """
+    Safely select a subset of genomes
+
+    :param list[str] genomes: superset of genomes to choose from
+    :param list[str] | str genome: genomes that the assets should be found for
+    :param bool strict: whether a non-existent genome should lead to a warning
+        and None being returned; otherwise a specific genome request that
+        cannot be satisfied is simply disregarded
+    :raise TypeError: if genome argument type is not a list or str
+    :return list: selected subset of genomes
+    """
+    if genome:
+        genome = _make_list_of_str(genome)
+    else:
+        return genomes
+    if strict:
+        missing = []
+        filtered = []
+        for g in genome:
+            if g in genomes:
+                filtered.append(g)
+            else:
+                missing.append(g)
+        if missing:
+            _LOGGER.warning("Genomes do not include: {}".format(", ".join(missing)))
+        return None if not filtered else filtered
+    return genomes if not all(x in genomes for x in genome) else genome
+
+
+def get_asset_tags(asset):
+    """
+    Return a list of asset tags.
+
+    These need an accessor function since the asset section holds not only
+    the tag names, but also the default tag pointer
+
+    :param Mapping asset: a single asset part of the RefGenConf
+    :return list: asset tags
+    """
+    return [t for t in asset[CFG_ASSET_TAGS_KEY]]
+
+
+def get_tag_seek_keys(tag):
+    """
+    Return a list of tag seek keys.
+
+    :param Mapping tag: a single tag part of the RefGenConf
+    :return list: tag seek keys
+    """
+    return [s for s in tag[CFG_SEEK_KEYS_KEY]] if CFG_SEEK_KEYS_KEY in tag else None
+
+
+def construct_request_url(server_url, operation_id):
+    """
+    Create a request URL based on an openAPI description
+
+    :param str server_url: server URL
+    :param str operation_id: the operationId of the endpoint
+    :return str: a complete URL for the request
+    """
+    try:
+        return server_url + _get_server_endpoints_mapping(server_url)[operation_id]
+    except KeyError as e:
+        _LOGGER.error(
+            "'{}' is not a compatible refgenieserver instance. "
+            "Could not determine API endpoint defined by ID: {}".format(server_url, e)
+        )
+        sys.exit(1)
+
+
+def _get_server_endpoints_mapping(url):
+    """
+    Establish the server endpoint mapping, based on the operationId fields in
+    the openAPI JSON description served by the server
+
+    :param str url: server URL
+    :return dict: endpoints mapped by their operationIds
+    """
+    json = _download_json(url + "/openapi.json")
+    return map_paths_by_id(
+        asciify_json_dict(json) if sys.version_info[0] == 2 else json
+    )
+
+
+def map_paths_by_id(json_dict):
+    """
+    Map API endpoint paths by their operationIds
+
+    :param dict json_dict: parsed openAPI JSON description
+    :return dict: endpoints keyed by their operationIds
+    """
+    # check the required input dict characteristics to construct the mapping
+    if (
+        "openapi" not in json_dict
+        or not isinstance(json_dict["openapi"], str)
+        or "paths" not in json_dict
+        or not isinstance(json_dict["paths"], dict)
+    ):
+        raise ValueError(
+            "The provided mapping is not a valid representation of a JSON openAPI description"
+        )
+    return {
+        values["get"]["operationId"]: endpoint
+        for endpoint, values in json_dict["paths"].items()
+    }
+
+
+def _remove(path):
+    """
+    Remove an asset, whether it is a directory or a file
+
+    :param str path: path to the entity to remove, either a file or a dir
+    :return str: removed path
+    """
+    from shutil import rmtree
+
+    if os.path.isfile(path):
+        os.remove(path)
+    elif os.path.isdir(path):
+        rmtree(path)
+    else:
+        raise ValueError("path '{}' is neither a file nor a dir.".format(path))
+    return path
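To make the endpoint-mapping step concrete, a minimal sketch (not part of this changeset; the path shown is invented for illustration, though the operationId matches API_ID_ARCHIVE from const.py):

    spec = {
        "openapi": "3.0.2",
        "paths": {
            "/v3/assets/archive/{genome}/{asset}": {
                "get": {"operationId": "custom_Id_archive"}
            }
        },
    }
    # the same keying map_paths_by_id performs: operationId -> endpoint path
    mapping = {v["get"]["operationId"]: ep for ep, v in spec["paths"].items()}
    # construct_request_url then simply concatenates server URL and endpoint
    print("http://refgenomes.databio.org" + mapping["custom_Id_archive"])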
+
+
+def _entity_dir_removal_log(directory, entity_class, asset_dict, removed_entities):
+    """
+    Message and save removed entity data
+
+    :param str directory: removed dir
+    :param str entity_class: class of the entity
+    :param dict asset_dict: selected genome/asset:tag combination
+    :param list removed_entities: list of the removed entities to append to
+    """
+    subclass = "asset" if entity_class == "genome" else "tag"
+    if os.path.basename(directory) == asset_dict[entity_class]:
+        _LOGGER.info(
+            "Last {sub} for {ec} '{en}' has been removed, removing {ec} directory".format(
+                sub=subclass, ec=entity_class, en=asset_dict[entity_class]
+            )
+        )
+        removed_entities.append(_remove(directory))
+    else:
+        _LOGGER.debug(
+            "Didn't remove '{}' since it does not match the {} name: {}".format(
+                directory, entity_class, asset_dict[entity_class]
+            )
+        )
+
+
+def _safe_setdef(mapping, attr, val):
+    """
+    Set a default value in a mapping, catching the errors that arise when the
+    object being updated is not in fact a mapping, and raising an informative
+    error instead.
+
+    :param Mapping mapping: mapping to update
+    :param str attr: attribute to update
+    :param val: value to assign as the default
+    :raise GenomeConfigFormatError: if mapping is of incorrect class
+    :return Mapping: updated mapping
+    """
+    try:
+        mapping.setdefault(attr, val)
+    except (TypeError, AttributeError):
+        _raise_not_mapping(mapping, "Cannot update; Section '{}' ".format(attr))
+    return mapping
+
+
+def _raise_not_mapping(mapping, prefix=""):
+    raise GenomeConfigFormatError(
+        prefix + "is not a mapping but '{}'. This is usually a result of "
+        "a previous error".format(type(mapping).__name__)
+    )
diff --git a/refgenconf/schemas/AnnotatedSequenceDigestList.yaml b/refgenconf/schemas/AnnotatedSequenceDigestList.yaml
new file mode 100644
index 00000000..10cd6a0c
--- /dev/null
+++ b/refgenconf/schemas/AnnotatedSequenceDigestList.yaml
@@ -0,0 +1,20 @@
+description: "Schema for a list of annotated sequence digests (ASDs)"
+henge_class: "AnnotatedSequenceDigestList"
+recursive: true
+type: array
+items:
+  description: "Schema for an Annotated Sequence Digest; a digested sequence plus metadata"
+  type: object
+  henge_class: ASD
+  properties:
+    name:
+      type: string
+    length:
+      type: "integer"
+    sequence:
+      description: "Sequence digest"
+      type: string
+  required:
+    - length
+    - name
+    - sequence
diff --git a/refgenconf/schemas/AnnotatedSequenceList.yaml b/refgenconf/schemas/AnnotatedSequenceList.yaml
new file mode 100644
index 00000000..320fdf9e
--- /dev/null
+++ b/refgenconf/schemas/AnnotatedSequenceList.yaml
@@ -0,0 +1,33 @@
+description: "Schema for a list of annotated sequences"
+henge_class: "AnnotatedSequenceList"
+recursive: true
+type: array
+items:
+  description: "Schema for an annotated sequence; a raw sequence plus metadata"
+  type: object
+  henge_class: ASD
+  properties:
+    name:
+      type: string
+    length:
+      type: "integer"
+    topology:
+      type: string
+      enum: ["circular", "linear"]
+      default: "linear"
+    sequence:
+      description: "Schema for a single raw sequence"
+      henge_class: sequence
+      type: object
+      properties:
+        sequence:
+          type: string
+          description: "Actual sequence content"
+      required:
+        - sequence
+  required:
+    - length
+    - name
+    - topology
+  recursive:
+    - sequence
diff --git a/refgenconf/schemas/genome_config_schema.yaml b/refgenconf/schemas/genome_config_schema.yaml
new file mode 100644
index 00000000..44917dd7
--- /dev/null
+++ b/refgenconf/schemas/genome_config_schema.yaml
@@ -0,0 +1,72 @@
+description: "refgenie genome configuration file schema"
+version: "0.4"
+properties:
+  config_version:
+    type: [string, number]
+  genome_folder:
+    type: string
+  remote_url_base:
+    type: string
+  genome_archive_folder:
+    type: string
+  genome_archive_config:
+    type: string
+  genome_servers:
+    type: array
+    items:
+      type: string
+  genomes:
+    additionalProperties: false
+    patternProperties:
+      ^.*$: { "$ref": "#/definitions/genome" }
+
+definitions:
+  genome:
+    type: object
+    properties:
+      genome_description:
+        type: string
+      aliases:
+        type: array
+        items:
+          type: string
+      assets:
+        additionalProperties: false
+        patternProperties:
+          ^.*$: { "$ref":
"#/definitions/asset" } + asset: + type: object + properties: + my_required_prop: + type: integer + asset_description: + type: string + tags: + additionalProperties: false + patternProperties: + ^.*$: { "$ref": "#/definitions/tag" } + tag: + type: object + properties: + asset_path: + type: string + asset_digest: + type: string + seek_keys: + additionalProperties: false + patternProperties: + ^.*$: { "$ref": "#/definitions/seek_key" } + asset_parents: + type: array + items: + type: string + asset_children: + type: array + items: + type: string + seek_key: + type: string + relatives: + type: array + items: + type: string \ No newline at end of file diff --git a/refgenconf/seqcol.py b/refgenconf/seqcol.py new file mode 100644 index 00000000..19864bf8 --- /dev/null +++ b/refgenconf/seqcol.py @@ -0,0 +1,249 @@ +import os +import logging +import hashlib +import binascii + +from gzip import open as gzopen + +from .henge import ITEM_TYPE, Henge +from .exceptions import RefgenconfError + + +def trunc512_digest(seq, offset=24): + digest = hashlib.sha512(seq.encode()).digest() + hex_digest = binascii.hexlify(digest[:offset]) + return hex_digest.decode() + + +# module constants +def _schema_path(name): + return os.path.join(SCHEMA_FILEPATH, name) + + +CONTENT_ALL_A_IN_B = 2 ** 0 +CONTENT_ALL_B_IN_A = 2 ** 1 +LENGTHS_ALL_A_IN_B = 2 ** 2 +LENGTHS_ALL_B_IN_A = 2 ** 3 +NAMES_ALL_A_IN_B = 2 ** 4 +NAMES_ALL_B_IN_A = 2 ** 5 +CONTENT_A_ORDER = 2 ** 6 +CONTENT_B_ORDER = 2 ** 7 +CONTENT_ANY_SHARED = 2 ** 8 +LENGTHS_ANY_SHARED = 2 ** 9 +NAMES_ANY_SHARED = 2 ** 10 + +FLAGS = { + CONTENT_ALL_A_IN_B: "CONTENT_ALL_A_IN_B", + CONTENT_ALL_B_IN_A: "CONTENT_ALL_B_IN_A", + LENGTHS_ALL_A_IN_B: "LENGTHS_ALL_A_IN_B", + LENGTHS_ALL_B_IN_A: "LENGTHS_ALL_B_IN_A", + NAMES_ALL_A_IN_B: "NAMES_ALL_A_IN_B", + NAMES_ALL_B_IN_A: "NAMES_ALL_B_IN_A", + CONTENT_ANY_SHARED: "CONTENT_ANY_SHARED", + LENGTHS_ANY_SHARED: "LENGTHS_ANY_SHARED", + NAMES_ANY_SHARED: "NAMES_ANY_SHARED", + CONTENT_A_ORDER: "CONTENT_A_ORDER", + CONTENT_B_ORDER: "CONTENT_B_ORDER", +} + +NAME_KEY = "name" +SEQ_KEY = "sequence" +LEN_KEY = "length" + +# internal schemas paths determination +ASL_NAME = "AnnotatedSequenceList" +ASDL_NAME = "AnnotatedSequenceDigestList" +SCHEMA_NAMES = [ASL_NAME, ASDL_NAME] +SCHEMA_FILEPATH = os.path.join(os.path.dirname(__file__), "schemas") +INTERNAL_SCHEMAS = [_schema_path(f"{s}.yaml") for s in SCHEMA_NAMES] + +_LOGGER = logging.getLogger(__name__) + + +class SeqColClient(Henge): + """ + Extension of henge that accommodates collections of sequences. + """ + + def __init__( + self, database, schemas=None, henges=None, checksum_function=trunc512_digest + ): + """ + A user interface to insert and retrieve decomposable recursive unique + identifiers (DRUIDs). + + :param dict database: Dict-like lookup database with sequences + and hashes + :param dict schemas: One or more jsonschema schemas describing the + data types stored by this Henge + :param function(str) -> str checksum_function: Default function to + handle the digest of the + serialized items stored in this henge. 
+ """ + assert all([os.path.exists(s) for s in INTERNAL_SCHEMAS]), RefgenconfError( + f"Missing schema files: {INTERNAL_SCHEMAS}" + ) + super(SeqColClient, self).__init__( + database=database, + schemas=schemas or INTERNAL_SCHEMAS, + henges=henges, + checksum_function=checksum_function, + ) + + def load_fasta(self, fa_file, skip_seq=False, gzipped=False): + """ + Load a sequence collection into the database + + :param str fa_file: path to the FASTA file to parse and load + :param bool skip_seq: whether to disregard the actual sequences, + load just the names and lengths + :param bool skip_seq: whether to disregard the actual sequences, + load just the names and lengths + """ + seq = "" + name = "" + init = False + aslist = [] + openfun = gzopen if gzipped else open + with openfun(fa_file, "rt") as f: + for line in f: + line = line.strip("\n") + if line.startswith(">"): + if not init: + name = line.replace(">", "") + else: + aslist.append( + { + NAME_KEY: name, + LEN_KEY: len(seq), + SEQ_KEY: "" if skip_seq else trunc512_digest(seq), + } + ) + name = line.replace(">", "") + seq = "" + continue + init = True + seq = seq + line + aslist.append( + { + NAME_KEY: name, + LEN_KEY: len(seq), + SEQ_KEY: "" if skip_seq else trunc512_digest(seq), + } + ) + + collection_checksum = self.insert(aslist, ASDL_NAME) + _LOGGER.info(f"Loaded {ASDL_NAME} ({len(aslist)} sequences)") + return collection_checksum, aslist + + @staticmethod + def compare_asds(asdA, asdB, explain=False): + """ + Compare Annotated Sequence Digests (ASDs) -- digested sequences and metadata + + :param str asdA: ASD for first sequence collection to compare. + :param str asdB: ASD for second sequence collection to compare. + :param bool explain: Print an explanation of the flag? [Default: False] + """ + + def _xp(prop, lst): + """ Extract property from a list of dicts """ + return list(map(lambda x: x[prop], lst)) + + def _index(x, lst): + """ Find an index of a sequence element in a list of dicts """ + try: + return _xp(SEQ_KEY, lst).index(x) + except: + return None + + def _get_common_content(lstA, lstB): + """ + Find the intersection between two list of dicts with sequences + """ + return list( + filter(None.__ne__, [_index(x, lstB) for x in _xp(SEQ_KEY, lstA)]) + ) + + # Not ideal, but we expect these to return lists, but if the item was + # singular only a dict is returned + if not isinstance(asdA, list): + asdA = [asdA] + if not isinstance(asdB, list): + asdB = [asdB] + + ainb = [x in _xp(SEQ_KEY, asdB) for x in _xp(SEQ_KEY, asdA)] + bina = [x in _xp(SEQ_KEY, asdA) for x in _xp(SEQ_KEY, asdB)] + + return_flag = 0 # initialize + if any(ainb): + ordA = _get_common_content(asdA, asdB) + if ordA == sorted(ordA): + return_flag += CONTENT_A_ORDER + if any(bina): + ordB = _get_common_content(asdB, asdA) + if ordB == sorted(ordB): + return_flag += CONTENT_B_ORDER + + ainb_len = [x in _xp(LEN_KEY, asdB) for x in _xp(LEN_KEY, asdA)] + bina_len = [x in _xp(LEN_KEY, asdA) for x in _xp(LEN_KEY, asdB)] + + ainb_name = [x in _xp(NAME_KEY, asdB) for x in _xp(NAME_KEY, asdA)] + bina_name = [x in _xp(NAME_KEY, asdA) for x in _xp(NAME_KEY, asdB)] + + if any(ainb): + return_flag += CONTENT_ANY_SHARED + if all(ainb): + return_flag += CONTENT_ALL_A_IN_B + if all(bina): + return_flag += CONTENT_ALL_B_IN_A + + if any(ainb_name): + return_flag += NAMES_ANY_SHARED + if all(ainb_name): + return_flag += NAMES_ALL_A_IN_B + if all(bina_name): + return_flag += NAMES_ALL_B_IN_A + + if any(ainb_len): + return_flag += LENGTHS_ANY_SHARED + if all(ainb_len): + 
+
+
+# Static functions below (these don't require a database)
+
+
+def explain_flag(flag):
+    """ Explain a compare flag """
+    print(f"Flag: {flag}\nBinary: {bin(flag)}\n")
+    # only bits 0..10 are defined in FLAGS; higher bits are never set
+    for e in range(0, 11):
+        if flag & 2 ** e:
+            print(FLAGS[2 ** e])
diff --git a/refgenie.yaml b/refgenie.yaml
deleted file mode 100644
index 8bdb5d89..00000000
--- a/refgenie.yaml
+++ /dev/null
@@ -1,22 +0,0 @@
-# Build configuration
-
-tools:
-  # absolute paths to required tools
-  bowtie2build: bowtie2-build
-  bismark_genome_preparation: bismark_genome_preparation
-  epilog_indexer: epilog_indexer.py
-  samtools: samtools
-  kallisto: kallisto
-  hisat2build: hisat2-build
-
-index:
-  bowtie2: True
-  bismark_bt1: False
-  bismark_bt2: False
-  epilog: False
-  hisat: False
-  kallisto: True
-
-param:
-  epilog:
-    context: "cg"
\ No newline at end of file
diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
index 502c6c28..27d3f420 100644
--- a/requirements/requirements-all.txt
+++ b/requirements/requirements-all.txt
@@ -1,6 +1,7 @@
-attmap>=0.12.5
 pyyaml
 requests
-tqdm>=4.38.0
-yacman>=0.6.9
+yacman>=0.8.0
+future
+jsonschema>=3.0.1
+rich>=9.0.1
 pyfaidx
\ No newline at end of file
diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt
index e92ee43f..ca6d31e8 100644
--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
@@ -3,4 +3,5 @@ pytest
 pytest-remotedata
 mock
 veracitools
-git+git://github.com/databio/refgenie_myplugin@master#egg=refgenie_myplugin
+tqdm
+git+git://github.com/databio/refgenie_myplugin@master#egg=refgenie_myplugin
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 98a129dc..f908237c 100644
--- a/setup.py
+++ b/setup.py
@@ -9,34 +9,35 @@
 for line in reqs_file:
     if not line.strip():
         continue
-    #DEPENDENCIES.append(line.split("=")[0].rstrip("<>"))
+    # DEPENDENCIES.append(line.split("=")[0].rstrip("<>"))
     DEPENDENCIES.append(line)

 # Additional keyword arguments for setup().
 extra = {"install_requires": DEPENDENCIES}

-if sys.version_info >= (3, ):
+if sys.version_info >= (3,):
     extra["use_2to3"] = True

-with open("refgenconf/_version.py", 'r') as versionfile:
+with open("refgenconf/_version.py", "r") as versionfile:
     version = versionfile.readline().split()[-1].strip("\"'\n")

 # Handle the pypi README formatting.
try: import pypandoc - long_description = pypandoc.convert_file('README.md', 'rst') + + long_description = pypandoc.convert_file("README.md", "rst") msg = "\033[032mPandoc conversion succeeded.\033[0m" -except(IOError, ImportError, OSError): +except (IOError, ImportError, OSError): msg = "\033[0;31mWarning: pandoc conversion failed!\033[0m" - long_description = open('README.md').read() + long_description = open("README.md").read() setup( name=PACKAGE_NAME, packages=[PACKAGE_NAME], version=version, - description='A standardized configuration object for reference genome assemblies', - long_description=long_description, - long_description_content_type='text/markdown', + description="A standardized configuration object for reference genome assemblies", + long_description=long_description, + long_description_content_type="text/markdown", classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: BSD License", @@ -44,16 +45,19 @@ "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", - "Topic :: Scientific/Engineering :: Bio-Informatics" - ], + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], license="BSD2", keywords="bioinformatics, sequencing, ngs", test_suite="tests", + include_package_data=True, tests_require=(["pytest"]), - setup_requires=(["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else []), - url='https://refgenie.databio.org', - author=u'Nathan Sheffield, Vince Reuter, Michal Stolarczyk', + setup_requires=( + ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] + ), + url="https://refgenie.databio.org", + author=u"Nathan Sheffield, Vince Reuter, Michal Stolarczyk", **extra ) -print(msg) \ No newline at end of file +print(msg) diff --git a/tests/conftest.py b/tests/conftest.py index 87563539..3dec2ac8 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,25 +14,36 @@ __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" +TEST_SERVER = "http://rg.databio.org" + IDX_BT2_VAL = "indexed_bowtie2" HG38_DATA = [ - ("bowtie2", IDX_BT2_VAL), ("hisat2", "indexed_hisat2"), - ("tss_annotation", "TSS.bed.gz"), ("gtf", "blah.gtf")] + ("bowtie2", IDX_BT2_VAL), + ("hisat2", "indexed_hisat2"), + ("tss_annotation", "TSS.bed.gz"), + ("gtf", "blah.gtf"), +] HG38_DATA = [ - ("bowtie2", IDX_BT2_VAL), ("hisat2", "indexed_hisat2"), - ("tss_annotation", "TSS.bed.gz"), ("gtf", "blah.gtf")] - + ("bowtie2", IDX_BT2_VAL), + ("hisat2", "indexed_hisat2"), + ("tss_annotation", "TSS.bed.gz"), + ("gtf", "blah.gtf"), +] MM10_DATA = [("bowtie2", IDX_BT2_VAL), ("blacklist", "blacklist/mm10.bed")] MITO_DATA = [("bowtie2", IDX_BT2_VAL), ("bowtie", "indexed_bowtie")] -REMOTE_ASSETS = {"rCRSd": {"bowtie2_index": ".tgz", "fasta": ".tgz"}, - "mouse_chrM2x": {"bowtie2_index": ".tgz", "fasta": ".tgz"}} -REQUESTS = [(g, a, "default") for g, ext_by_asset in REMOTE_ASSETS.items() for a in ext_by_asset] +REMOTE_ASSETS = { + "rCRSd": {"bowtie2_index": ".tgz", "fasta": ".tgz"}, + "mouse_chrM2x": {"bowtie2_index": ".tgz", "fasta": ".tgz"}, +} +REQUESTS = [ + (g, a, "default") for g, ext_by_asset in REMOTE_ASSETS.items() for a in ext_by_asset +] URL_BASE = "https://raw.githubusercontent.com/databio/refgenieserver/master/files" @@ -45,8 +56,8 @@ def lift_into_path_pair(name): CONF_DATA = [ - (g, {CFG_ASSETS_KEY: PathExAttMap(_bind_to_path(data))}) for g, data - in [("hg38", HG38_DATA), ("mm10", MM10_DATA), ("rCRSd", MITO_DATA)] + (g, {CFG_ASSETS_KEY: PathExAttMap(_bind_to_path(data))}) + for g, 
data in [("hg38", HG38_DATA), ("mm10", MM10_DATA), ("rCRSd", MITO_DATA)] ] @@ -75,7 +86,7 @@ def cfg_file(data_path): @pytest.fixture def cfg_file_old(data_path): - return os.path.join(data_path, "genomes_v2.yaml") + return os.path.join(data_path, "genomes_v3.yaml") @pytest.fixture @@ -110,8 +121,9 @@ def remove_asset_and_file(rgc, gname, aname, tname): :param str tname: tag name to remove """ try: - shutil.rmtree(rgc.seek(gname, aname, tname, strict_exists=True, - enclosing_dir=True)) + shutil.rmtree( + rgc.seek(gname, aname, tname, strict_exists=True, enclosing_dir=True) + ) except Exception as e: print("file not removed: {}".format(e)) pass @@ -126,13 +138,15 @@ def remove_asset_and_file(rgc, gname, aname, tname): def made_genome_config_file(temp_genome_config_file): """ Make the test session's genome config file. """ genome_folder = os.path.dirname(temp_genome_config_file) - extra_kv_lines = ["{}: {}".format(CFG_FOLDER_KEY, genome_folder), - "{}: {}".format(CFG_SERVERS_KEY, "https://refgenomes.databio.org/"), - "{}: {}".format(CFG_VERSION_KEY, package_version), - "{}:".format(CFG_GENOMES_KEY)] + extra_kv_lines = [ + "{}: {}".format(CFG_FOLDER_KEY, genome_folder), + "{}: {}".format(CFG_SERVERS_KEY, "http://rg.databio.org"), + "{}: {}".format(CFG_VERSION_KEY, REQ_CFG_VERSION), + "{}:".format(CFG_GENOMES_KEY), + ] gen_data_lines = PathExAttMap(CONF_DATA).get_yaml_lines() fp = temp_genome_config_file - with open(fp, 'w') as f: + with open(fp, "w") as f: f.write("\n".join(extra_kv_lines + [" " + l for l in gen_data_lines])) return fp @@ -140,7 +154,7 @@ def made_genome_config_file(temp_genome_config_file): @pytest.fixture def rgc(made_genome_config_file): """ Provide test case with a genome config instance. """ - with open(made_genome_config_file, 'r') as f: + with open(made_genome_config_file, "r") as f: return RefGenConf(entries=yaml.load(f, yaml.SafeLoader)) @@ -156,7 +170,8 @@ def ro_rgc(cfg_file): @pytest.fixture def all_genomes(ro_rgc): - return ro_rgc[CFG_GENOMES_KEY].keys() + gs = ro_rgc[CFG_GENOMES_KEY].keys() + return gs @pytest.fixture @@ -174,3 +189,11 @@ def remove_genome_folder(request): def temp_genome_config_file(tmpdir_factory): """ The genome configuration file for the test suite. 
""" return tmpdir_factory.mktemp("data").join("refgenie.yaml").strpath + + +# seqcol configuration - to be removed when we split the projects + + +@pytest.fixture +def fasta_path(data_path): + return os.path.join(data_path, "demo_fasta") diff --git a/tests/data/demo_fasta/demo.fa.fai b/tests/data/demo_fasta/demo.fa.fai new file mode 100644 index 00000000..c55c6c17 --- /dev/null +++ b/tests/data/demo_fasta/demo.fa.fai @@ -0,0 +1,2 @@ +chr1 4 6 4 5 +chr2 4 17 4 5 diff --git a/tests/data/demo_fasta/demo.fa.gz b/tests/data/demo_fasta/demo.fa.gz new file mode 100644 index 00000000..051dd921 Binary files /dev/null and b/tests/data/demo_fasta/demo.fa.gz differ diff --git a/tests/data/demo_fasta/demo2.fa b/tests/data/demo_fasta/demo2.fa new file mode 100644 index 00000000..4f095e8d --- /dev/null +++ b/tests/data/demo_fasta/demo2.fa @@ -0,0 +1,6 @@ +>chr1 +ACGT +>chr2 +TCGA +>chrX +TTCCGGAA diff --git a/tests/data/demo_fasta/demo2.fa.fai b/tests/data/demo_fasta/demo2.fa.fai new file mode 100644 index 00000000..02f100b9 --- /dev/null +++ b/tests/data/demo_fasta/demo2.fa.fai @@ -0,0 +1,3 @@ +chr1 4 6 4 5 +chr2 4 17 4 5 +chrX 8 28 8 9 diff --git a/tests/data/demo_fasta/demo3.fa b/tests/data/demo_fasta/demo3.fa new file mode 100644 index 00000000..4fb88c5c --- /dev/null +++ b/tests/data/demo_fasta/demo3.fa @@ -0,0 +1,6 @@ +>1 +ACGT +>2 +TCGA +>X +TTCCGGAA diff --git a/tests/data/demo_fasta/demo3.fa.fai b/tests/data/demo_fasta/demo3.fa.fai new file mode 100644 index 00000000..8b60b955 --- /dev/null +++ b/tests/data/demo_fasta/demo3.fa.fai @@ -0,0 +1,3 @@ +1 4 3 4 5 +2 4 11 4 5 +X 8 19 8 9 diff --git a/tests/data/demo_fasta/demo4.fa b/tests/data/demo_fasta/demo4.fa new file mode 100644 index 00000000..ade4e7a8 --- /dev/null +++ b/tests/data/demo_fasta/demo4.fa @@ -0,0 +1,2 @@ +>chrX +TTCCGGAA diff --git a/tests/data/demo_fasta/demo4.fa.fai b/tests/data/demo_fasta/demo4.fa.fai new file mode 100644 index 00000000..0d84e86b --- /dev/null +++ b/tests/data/demo_fasta/demo4.fa.fai @@ -0,0 +1 @@ +chrX 8 6 8 9 diff --git a/tests/data/demo_fasta/demo5.fa.fai b/tests/data/demo_fasta/demo5.fa.fai new file mode 100644 index 00000000..02f100b9 --- /dev/null +++ b/tests/data/demo_fasta/demo5.fa.fai @@ -0,0 +1,3 @@ +chr1 4 6 4 5 +chr2 4 17 4 5 +chrX 8 28 8 9 diff --git a/tests/data/demo_fasta/demo5.fa.gz b/tests/data/demo_fasta/demo5.fa.gz new file mode 100644 index 00000000..051dd921 Binary files /dev/null and b/tests/data/demo_fasta/demo5.fa.gz differ diff --git a/tests/data/genomes.yaml b/tests/data/genomes.yaml index d27e59e9..99a9cc41 100644 --- a/tests/data/genomes.yaml +++ b/tests/data/genomes.yaml @@ -1,50 +1,5 @@ -config_version: 0.3 +config_version: 0.4 genome_folder: /tmp -genome_servers: ['http://refgenomes.databio.org'] -genomes: - mouse_chrM2x: - assets: - bwa_index: - tags: - default: - seek_keys: - bwa_index: mouse_chrM2x.fa - asset_parents: [] - asset_path: bwa_index - asset_digest: 914dec83dcfab73e056717d33ecfd465 - default_tag: default - rCRSd: - assets: - bowtie2_index: - tags: - default: - seek_keys: - bowtie2_index: rCRSd - asset_parents: [] - asset_path: bowtie2_index - asset_digest: 1262e30d4a87db9365d501de8559b3b4 - default_tag: default - fasta: - tags: - default: - seek_keys: - fasta: rCRSd.fa - fai: rCRSd.fa.fai - chrom_sizes: rCRSd.chrom.sizes - asset_parents: [] - asset_path: fasta - asset_digest: 4eb430296bc02ed7e4006624f1d5ac53 - default_tag: default - human_repeats: - assets: - fasta: - tags: - default: - seek_keys: - fasta: human_repeats.fa - fai: human_repeats.fa.fai - chrom_sizes: 
human_repeats.chrom.sizes - asset_parents: [] - asset_path: fasta - asset_digest: 4a749d4e74b057d0efa0c8398ebcb871 - default_tag: default +genome_servers: + - http://rg.databio.org +genomes: null \ No newline at end of file diff --git a/tests/data/genomes_v2.yaml b/tests/data/genomes_v2.yaml deleted file mode 100644 index 29deb065..00000000 --- a/tests/data/genomes_v2.yaml +++ /dev/null @@ -1,8 +0,0 @@ -config_version: 0.2 -genome_folder: /tmp -genome_server: https://refgenomes.databio.org/ -genomes: - rCRSd: - assets: - bowtie2_index: - path: bowtie2_index \ No newline at end of file diff --git a/tests/data/genomes_v3.yaml b/tests/data/genomes_v3.yaml new file mode 100644 index 00000000..6d765f31 --- /dev/null +++ b/tests/data/genomes_v3.yaml @@ -0,0 +1,33 @@ +config_version: 0.3 +genome_folder: /tmp/old +genome_servers: + - http://rg.databio.org +genomes: + human_repeats: + assets: + fasta: + tags: + default: + seek_keys: + fasta: human_repeats.fa + fai: human_repeats.fa.fai + chrom_sizes: human_repeats.chrom.sizes + asset_parents: [] + asset_path: fasta + asset_digest: 4a749d4e74b057d0efa0c8398ebcb871 + default_tag: default + genome_description: Manually curated collection of human repeat sequences from GenBank + rCRSd: + assets: + fasta: + tags: + default: + seek_keys: + fasta: rCRSd.fa + fai: rCRSd.fa.fai + chrom_sizes: rCRSd.chrom.sizes + asset_parents: [] + asset_path: fasta + asset_digest: 4eb430296bc02ed7e4006624f1d5ac53 + default_tag: default + genome_description: The revised cambridge reference sequence. This is the human mitochondrial reference genome diff --git a/tests/test_1pull_asset.py b/tests/test_1pull_asset.py index e9522f23..c9489d1b 100644 --- a/tests/test_1pull_asset.py +++ b/tests/test_1pull_asset.py @@ -4,6 +4,7 @@ import mock import os import sys + if sys.version_info.major < 3: ConnectionRefusedError = Exception else: @@ -23,27 +24,34 @@ DOWNLOAD_FUNCTION = "refgenconf.refgenconf.{}".format(_download_url_progress.__name__) -@pytest.mark.parametrize(["gname", "aname"], [("human_repeats", 1), ("mouse_chrM2x", None)]) -def test_pull_asset_illegal_asset_name(rgc, gname, aname): +@pytest.mark.parametrize( + ["gname", "aname"], [("human_repeats", 1), ("mouse_chrM2x", None)] +) +def test_pull_asset_illegal_asset_name(my_rgc, gname, aname): """ TypeError occurs if asset argument is not iterable. """ with pytest.raises(TypeError): - rgc.pull(gname, aname) + my_rgc.pull(gname, aname) -@pytest.mark.parametrize(["gname", "aname", "tname"], - [("human_repeats", "bowtie2_index", "default"), ("mouse_chrM2x", "bwa_index", "default")]) +@pytest.mark.parametrize( + ["gname", "aname", "tname"], + [ + ("human_repeats", "bwa_index", "default"), + ("mouse_chrM2x", "bwa_index", "default"), + ], +) def test_download_interruption(my_rgc, gname, aname, tname, caplog): """ Download interruption provides appropriate warning message and halts. 
""" import signal - print("filepath: " + my_rgc._file_path) + + print("filepath: " + my_rgc.__internal.file_path) def kill_download(*args, **kwargs): os.kill(os.getpid(), signal.SIGINT) - with mock.patch(DOWNLOAD_FUNCTION, side_effect=kill_download), \ - mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True), \ - caplog.at_level(logging.WARNING), \ - pytest.raises(SystemExit): + with mock.patch(DOWNLOAD_FUNCTION, side_effect=kill_download), mock.patch( + "refgenconf.refgenconf.query_yes_no", return_value=True + ), caplog.at_level(logging.WARNING), pytest.raises(SystemExit): my_rgc.pull(gname, aname, tname) records = caplog.records assert 1 == len(records) @@ -52,33 +60,46 @@ def kill_download(*args, **kwargs): assert "The download was interrupted" in r.msg -@pytest.mark.parametrize(["gname", "aname", "tname"], [("human_repeats", "fasta", "default"), ("mouse_chrM2x", "fasta", "default")]) +@pytest.mark.parametrize( + ["gname", "aname", "tname"], + [("human_repeats", "fasta", "default"), ("mouse_chrM2x", "fasta", "default")], +) def test_pull_asset(my_rgc, gname, aname, tname): with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True): print("\nPulling; genome: {}, asset: {}, tag: {}\n".format(gname, aname, tname)) my_rgc.pull(gname, aname, tname) -@pytest.mark.parametrize(["gname", "aname", "tname"], - [("rCRSd", "bowtie2_index", "default"), ("mouse_chrM2x", "bwa_index", "default")]) +@pytest.mark.parametrize( + ["gname", "aname", "tname"], + [("rCRSd", "bowtie2_index", "default"), ("mouse_chrM2x", "bwa_index", "default")], +) def test_parent_asset_mismatch(my_rgc, gname, aname, tname): """ Test that an exception is raised when remote and local parent checksums do not match on pull""" with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True): my_rgc.pull(gname, "fasta", tname) my_rgc.make_writable() my_rgc.write() - ori = my_rgc[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY]["fasta"][CFG_ASSET_TAGS_KEY][tname][CFG_ASSET_CHECKSUM_KEY] - my_rgc[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY]["fasta"][CFG_ASSET_TAGS_KEY][tname][CFG_ASSET_CHECKSUM_KEY] = "wrong" + ori = my_rgc[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY]["fasta"][CFG_ASSET_TAGS_KEY][ + tname + ][CFG_ASSET_CHECKSUM_KEY] + my_rgc[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY]["fasta"][CFG_ASSET_TAGS_KEY][tname][ + CFG_ASSET_CHECKSUM_KEY + ] = "wrong" with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True): with pytest.raises(RemoteDigestMismatchError): my_rgc.pull(gname, aname, tname) with my_rgc as r: - r[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY]["fasta"][CFG_ASSET_TAGS_KEY][tname][CFG_ASSET_CHECKSUM_KEY] = ori + r[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY]["fasta"][CFG_ASSET_TAGS_KEY][tname][ + CFG_ASSET_CHECKSUM_KEY + ] = ori my_rgc.make_readonly() -@pytest.mark.parametrize(["gname", "aname", "tname"], [("rCRSd", "bowtie2_index", "default"), - ("mouse_chrM2x", "bwa_index", "default")]) +@pytest.mark.parametrize( + ["gname", "aname", "tname"], + [("rCRSd", "bowtie2_index", "default"), ("mouse_chrM2x", "bwa_index", "default")], +) def test_pull_asset_updates_genome_config(cfg_file, gname, aname, tname): """ Test that the object that was identical prior to the asset pull differs afterwards @@ -98,11 +119,17 @@ def test_pull_asset_updates_genome_config(cfg_file, gname, aname, tname): assert isinstance(post_rgc.seek(gname, aname, tname), str) -@pytest.mark.parametrize(["gname", "aname", "tname", "state"], - [("rCRSd", "fasta", "default", True), - ("human_repeats", "fasta", "default", True), - 
("mouse_chrM2x", "fasta", "default", False)]) -def test_pull_asset_works_with_nonwritable_and_writable_rgc(cfg_file, gname, aname, tname, state): +@pytest.mark.parametrize( + ["gname", "aname", "tname", "state"], + [ + ("rCRSd", "fasta", "default", True), + ("human_repeats", "fasta", "default", True), + ("mouse_chrM2x", "fasta", "default", False), + ], +) +def test_pull_asset_works_with_nonwritable_and_writable_rgc( + cfg_file, gname, aname, tname, state +): rgc = RefGenConf(filepath=cfg_file, writable=state) remove_asset_and_file(rgc, gname, aname, tname) print("\nPulling; genome: {}, asset: {}, tag: {}\n".format(gname, aname, tname)) diff --git a/tests/test_add.py b/tests/test_add.py index 316f6505..256f3b6a 100644 --- a/tests/test_add.py +++ b/tests/test_add.py @@ -1,4 +1,3 @@ - """ Tests for RefGenConf.add. These tests depend on successful completion of tests is test_1pull_asset.py """ import pytest @@ -7,38 +6,68 @@ class TestAdd: - @pytest.mark.parametrize(["pth", "gname", "aname", "tname"], - [("bogus/path/file.txt", "rCRSd", "fasta", "default"), - ("bogus/path/file.txt", "rCRSd", "fasta", "default")]) + @pytest.mark.parametrize( + ["pth", "gname", "aname", "tname"], + [ + ("bogus/path/file.txt", "rCRSd", "fasta", "default"), + ("bogus/path/file.txt", "human_repeats", "fasta", "default"), + ], + ) def test_nonexistent_file(self, cfg_file, pth, gname, aname, tname): rgc = RefGenConf(filepath=cfg_file) with pytest.raises(OSError): rgc.add(pth, gname, aname, tname) - @pytest.mark.parametrize(["gname", "aname", "tname"], - [("human_repeats", "fasta", "default"), - ("rCRSd", "fasta", "default")]) + @pytest.mark.parametrize( + ["gname", "aname", "tname"], + [("human_repeats", "fasta", "default"), ("rCRSd", "fasta", "default")], + ) def test_preexisting_asset_prompt(self, cfg_file, gname, aname, tname): rgc = RefGenConf(filepath=cfg_file) path = rgc.seek(genome_name=gname, asset_name=aname, tag_name=tname) with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=False): assert not rgc.add(path, gname, aname, tname) - @pytest.mark.parametrize(["gname", "aname", "tname"], - [("human_repeats", "fasta", "default"), - ("rCRSd", "fasta", "default")]) - def test_force_overwrite_asset(self, cfg_file, gname, aname, tname): + @pytest.mark.parametrize( + ["gname", "aname", "tname"], + [ + ("human_repeats", "test_asset", "default"), + ("rCRSd", "test_asset", "default"), + ], + ) + def test_cant_add_without_digest_set_first(self, cfg_file, gname, aname, tname): rgc = RefGenConf(filepath=cfg_file) - path = rgc.seek(genome_name=gname, asset_name=aname, tag_name=tname, enclosing_dir=True) + path = rgc.seek( + genome_name=gname, asset_name="fasta", tag_name=tname, enclosing_dir=True + ) gname = gname + "_new" - assert rgc.add(path, gname, aname, tname) + assert not rgc.add(path, gname, aname, tname) + + @pytest.mark.parametrize( + ["gname", "aname", "tname"], + [ + ("human_repeats", "test_asset", "default"), + ("rCRSd", "test_asset", "default"), + ], + ) + def test_force_overwrite_asset(self, cfg_file, gname, aname, tname): + rgc = RefGenConf(filepath=cfg_file) + path = rgc.seek( + genome_name=gname, asset_name="fasta", tag_name=tname, enclosing_dir=True + ) + assert rgc.add(path, gname, aname, tname, force=True) assert rgc.add(path, gname, aname, tname, force=True) - @pytest.mark.parametrize(["gname", "aname", "tname"], - [("human_repeats", "fasta", "default"), - ("rCRSd", "fasta", "default")]) + @pytest.mark.parametrize( + ["gname", "aname", "tname"], + [ + ("human_repeats", "test_asset1", 
"default"), + ("rCRSd", "test_asset1", "default"), + ], + ) def test_nofile(self, cfg_file, gname, aname, tname): rgc = RefGenConf(filepath=cfg_file) - pth = rgc.seek(gname, aname, tname, enclosing_dir=True) - rgc_new = RefGenConf() - assert rgc_new.add(pth, gname, aname, tname, seek_keys={"file": "b"}) \ No newline at end of file + path = rgc.seek( + genome_name=gname, asset_name="fasta", tag_name=tname, enclosing_dir=True + ) + assert rgc.add(path, gname, aname, tname, seek_keys={"file": "b"}, force=True) diff --git a/tests/test_alias.py b/tests/test_alias.py new file mode 100644 index 00000000..02635575 --- /dev/null +++ b/tests/test_alias.py @@ -0,0 +1,182 @@ +import pytest +import os +from yacman import UndefinedAliasError +from refgenconf.const import CFG_GENOMES_KEY, CFG_ALIASES_KEY +from shutil import rmtree + +DEMO_FILES = ["demo.fa.gz", "demo2.fa", "demo3.fa", "demo4.fa", "demo5.fa.gz"] + + +class TestAliasSetting: + @pytest.mark.parametrize(["alias", "digest"], [(["human_repeats", "rCRSd"], None)]) + def test_set_genome_alias_server_more_than_1(self, my_rgc, alias, digest): + """ Multi digest lookup is not implemented """ + with pytest.raises(NotImplementedError): + my_rgc.set_genome_alias(genome=alias, digest=digest) + + @pytest.mark.parametrize(["alias", "digest"], [("human_repeats", None)]) + @pytest.mark.xfail + def test_set_genome_alias_server(self, my_rgc, alias, digest): + """ Lookup aliases for a single digest """ + my_rgc.set_genome_alias(genome=alias, digest=digest) + assert alias in my_rgc.get_genome_alias(digest=digest, all_aliases=True) + + @pytest.mark.parametrize( + ["alias", "digest"], + [ + (["hr"], "7319f9237651755047bc40d7f7a9770d42a537e840f4e105"), + (["hr", "h_r"], "7319f9237651755047bc40d7f7a9770d42a537e840f4e105"), + ], + ) + def test_set_genome_alias(self, my_rgc, alias, digest): + """ + Set aliases, check whether all exist in the object and as + directories on disk and remove + """ + my_rgc.set_genome_alias(genome=alias, digest=digest) + assert all( + [ + a in my_rgc.get_genome_alias(digest=digest, all_aliases=True) + for a in alias + ] + ) + assert all([os.path.exists(os.path.join(my_rgc.alias_dir, a)) for a in alias]) + my_rgc.remove_genome_aliases(digest=digest, aliases=alias) + + @pytest.mark.parametrize( + ["alias", "digest"], + [ + (["hr"], "7319f9237651755047bc40d7f7a9770d42a537e840f4e105"), + (["hr", "h_r"], "7319f9237651755047bc40d7f7a9770d42a537e840f4e105"), + ], + ) + def test_set_genome_alias_reset(self, my_rgc, alias, digest): + """ + Get original aliases, wipe out all current aliases and set new ones, + check whether all exist in the object and as + directories on disk and remove and bring the original state back + """ + ori_state = my_rgc.get_genome_alias(digest=digest, all_aliases=True) + my_rgc.set_genome_alias(genome=alias, digest=digest, reset_digest=True) + assert all( + [ + a in my_rgc.get_genome_alias(digest=digest, all_aliases=True) + for a in alias + ] + ) + assert all([os.path.exists(os.path.join(my_rgc.alias_dir, a)) for a in alias]) + assert len(my_rgc.get_genome_alias(digest=digest, all_aliases=True)) == len( + alias + ) + my_rgc.set_genome_alias(genome=ori_state, digest=digest, reset_digest=True) + + +class TestAliasGetting: + @pytest.mark.parametrize( + "digest", ["7319f9237651755047bc40d7f7a9770d42a537e840f4e105"] + ) + def test_get_genome_alias_basic(self, my_rgc, digest): + """ + Get a single alias, first from the list, if multiple and then use + the result to get the digest back + """ + alias = 
my_rgc.get_genome_alias(digest=digest)
+        assert isinstance(alias, str)
+        assert my_rgc.get_genome_alias_digest(alias=alias) == digest
+        # test fallback
+        assert my_rgc.get_genome_alias_digest(alias=digest, fallback=True) == digest
+
+    @pytest.mark.parametrize(
+        "digest", ["7319f9237651755047bc40d7f7a9770d42a537e840f4e105"]
+    )
+    def test_get_genome_alias_multi(self, my_rgc, digest):
+        """ Get multiple aliases; the result is always a list """
+        assert isinstance(
+            my_rgc.get_genome_alias(digest=digest, all_aliases=True), list
+        )
+
+    @pytest.mark.parametrize("digest", ["human_repeats"])
+    def test_get_genome_alias_no_fallback(self, my_rgc, digest):
+        """
+        If an alias is provided instead of a digest, an appropriate
+        exception is raised
+        """
+        with pytest.raises(UndefinedAliasError):
+            my_rgc.get_genome_alias(digest=digest)
+
+    @pytest.mark.parametrize("digest", ["human_repeats", "rCRSd", "mouse_chrM2x"])
+    def test_get_genome_alias_fallback(self, my_rgc, digest):
+        """
+        If an alias is provided instead of a digest and fallback is enabled,
+        the alias is resolved and an alias is still returned
+        """
+        assert isinstance(my_rgc.get_genome_alias(digest=digest, fallback=True), str)
+
+    @pytest.mark.parametrize("digest", ["human_repeats_bogus", "nonexistent"])
+    def test_get_genome_alias_fallback_nomatch(self, my_rgc, digest):
+        """
+        If a nonexistent alias is provided, an appropriate exception is
+        raised even with fallback enabled
+        """
+        with pytest.raises(UndefinedAliasError):
+            my_rgc.get_genome_alias(digest=digest, fallback=True)
+
+
+class TestAliasRemoval:
+    @pytest.mark.parametrize(
+        "digest", ["7319f9237651755047bc40d7f7a9770d42a537e840f4e105"]
+    )
+    def test_remove_genome_alias_all(self, my_rgc, digest):
+        """
+        Save the original alias state, remove all aliases, check that they
+        have been removed from the object and from disk, then restore the
+        original state
+        """
+        # grab all aliases (a list), so the existence checks below iterate
+        # over aliases rather than over the characters of a single alias
+        ori_state = my_rgc.get_genome_alias(digest=digest, all_aliases=True)
+        my_rgc.set_genome_alias(digest=digest, genome=ori_state)
+        my_rgc.remove_genome_aliases(digest=digest)
+        with pytest.raises(UndefinedAliasError):
+            my_rgc.get_genome_alias(digest=digest)
+        assert all(
+            [not os.path.exists(os.path.join(my_rgc.alias_dir, a)) for a in ori_state]
+        )
+        my_rgc.set_genome_alias(digest=digest, genome=ori_state)
+        assert isinstance(
+            my_rgc.get_genome_alias(digest=digest, all_aliases=True), list
+        )
+
+    @pytest.mark.parametrize(
+        ["alias", "digest"],
+        [
+            (["hr"], "7319f9237651755047bc40d7f7a9770d42a537e840f4e105"),
+            (["hr", "h_r"], "7319f9237651755047bc40d7f7a9770d42a537e840f4e105"),
+        ],
+    )
+    def test_remove_genome_alias_specific(self, my_rgc, digest, alias):
+        """
+        Set the selected aliases plus an additional one, remove the selected
+        ones, and verify that the additional one still exists
+        """
+        my_rgc.set_genome_alias(digest=digest, genome=alias + ["human_repeats"])
+        my_rgc.remove_genome_aliases(digest=digest, aliases=alias)
+        assert "human_repeats" in my_rgc.get_genome_alias(
+            digest=digest, all_aliases=True
+        )
+
+
+class TestInitializeGenome:
+    @pytest.mark.parametrize("fasta_name", DEMO_FILES)
+    def test_initialize_genome(self, my_rgc, fasta_name, fasta_path):
+        """
+        Initialize a genome from a FASTA file and verify that the resulting
+        digest and alias are recorded in the configuration, then clean up
+        """
+        d, asds = my_rgc.initialize_genome(
+            fasta_path=os.path.join(fasta_path, fasta_name),
+            alias=fasta_name,
+            fasta_unzipped=not fasta_name.endswith(".gz"),
+        )
+        assert d in my_rgc[CFG_GENOMES_KEY]
+        assert fasta_name in my_rgc[CFG_GENOMES_KEY][d][CFG_ALIASES_KEY]
+        with my_rgc as r:
+            del r[CFG_GENOMES_KEY][d]
rmtree(os.path.join(my_rgc.alias_dir, fasta_name)) diff --git a/tests/test_assets_basic.py b/tests/test_assets_basic.py deleted file mode 100644 index 7041f343..00000000 --- a/tests/test_assets_basic.py +++ /dev/null @@ -1,38 +0,0 @@ -""" Basic RGC asset tests """ - -from collections import OrderedDict -import pytest -__author__ = "Michal Stolarczyk" -__email__ = "michal@virginia.edu" - - -class AssetDictTest: - @pytest.mark.parametrize("gname", ["nonexistent", None]) - def test_with_nonexistent_genome(self, ro_rgc, gname): - """ Verify asset dict is always returned, even if the requested genome does not exist """ - assert isinstance(ro_rgc.list(genome=gname), OrderedDict) - - @pytest.mark.parametrize("gname", ["nonexistent", None]) - def test_length(self, ro_rgc, all_genomes, gname): - """ Verify asset dict is larger if nonexistent or no genome specified than ones that are - returned for a specific genome""" - for g in all_genomes: - assert len(ro_rgc.list(genome=gname)) > len(ro_rgc.list(genome=g)) - - def test_multiple_genomes(self, ro_rgc, all_genomes): - """ Verify asset dict works with multiple genomes and returns all of them """ - assert sorted(ro_rgc.list(genome=all_genomes).keys()) == sorted(ro_rgc.list().keys()) - - -class ListAssetsByGenomeTest: - def test_returns_entire_mapping_when_no_genonome_specified(self, ro_rgc): - assert ro_rgc.list_assets_by_genome() == ro_rgc.list() - - def test_returns_list(self, ro_rgc, all_genomes): - for g in all_genomes: - assert isinstance(ro_rgc.list_assets_by_genome(g), list) - - @pytest.mark.parametrize("gname", ["nonexistent", "genome"]) - def test_exception_on_nonexistent_genome(self, ro_rgc, gname): - with pytest.raises(KeyError): - ro_rgc.list_assets_by_genome(genome=gname) diff --git a/tests/test_config_constructor.py b/tests/test_config_constructor.py index f321e60f..d227f1dd 100644 --- a/tests/test_config_constructor.py +++ b/tests/test_config_constructor.py @@ -3,9 +3,15 @@ import os import pytest from attmap import PathExAttMap +from yacman import AliasedYacAttMap from refgenconf import RefGenConf, ConfigNotCompliantError -from refgenconf.const import CFG_FOLDER_KEY, CFG_GENOMES_KEY, CFG_SERVERS_KEY, \ - DEFAULT_SERVER, RGC_REQ_KEYS +from refgenconf.const import ( + CFG_FOLDER_KEY, + CFG_GENOMES_KEY, + CFG_SERVERS_KEY, + DEFAULT_SERVER, + RGC_REQ_KEYS, +) __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" @@ -23,33 +29,22 @@ def test_genome_folder_is_pwd_if_no_folder_key_and_raw_entries_passed(self, ro_r new_rgc = RefGenConf(entries=data) assert os.getcwd() == new_rgc[CFG_FOLDER_KEY] - def test_genome_folder_is_value_from_config_file_if_key_present(self, tmpdir_factory, tmpdir, made_genome_config_file): - conf_file = tmpdir_factory.mktemp("data2").join("refgenie.yaml").strpath - expected = tmpdir.strpath - with open(made_genome_config_file, 'r') as fin, open(conf_file, 'w') as fout: - found = False - for l in fin: - if l.startswith(CFG_FOLDER_KEY): - fout.write("{}: {}\n".format(CFG_FOLDER_KEY, expected)) - else: - fout.write(l) - if l.startswith(CFG_SERVERS_KEY): - found = True - if not found: - fout.write("{}: {}".format(CFG_SERVERS_KEY, DEFAULT_SERVER)) - rgc = RefGenConf(filepath=conf_file) - assert expected != os.path.dirname(conf_file) - assert expected == rgc[CFG_FOLDER_KEY] - - @pytest.mark.parametrize("genomes", [None, "genomes", 10] + [dt(["mm10", "hg38"]) for dt in [list, set, tuple]]) - def test_illegal_genomes_mapping_type_gets_converted_to_empty_mapping(self, genomes, tmpdir): - rgc = RefGenConf(entries={ - 
CFG_FOLDER_KEY: tmpdir.strpath, - CFG_GENOMES_KEY: genomes, - CFG_SERVERS_KEY: DEFAULT_SERVER - }) + @pytest.mark.parametrize( + "genomes", + [None, "genomes", 10] + [dt(["mm10", "hg38"]) for dt in [list, set, tuple]], + ) + def test_illegal_genomes_mapping_type_gets_converted_to_empty_mapping( + self, genomes, tmpdir + ): + rgc = RefGenConf( + entries={ + CFG_FOLDER_KEY: tmpdir.strpath, + CFG_GENOMES_KEY: genomes, + CFG_SERVERS_KEY: [DEFAULT_SERVER], + } + ) res = rgc[CFG_GENOMES_KEY] - assert isinstance(res, PathExAttMap) + assert isinstance(res, AliasedYacAttMap) assert 0 == len(res) def test_errors_on_old_cfg(self, cfg_file_old): diff --git a/tests/test_config_unbound_env_vars.py b/tests/test_config_unbound_env_vars.py deleted file mode 100644 index e276bb5a..00000000 --- a/tests/test_config_unbound_env_vars.py +++ /dev/null @@ -1,32 +0,0 @@ -""" Tests regarding unboudn environment variables in a genome config file. """ - -import os -import pytest -from refgenconf import CFG_FOLDER_KEY, UnboundEnvironmentVariablesError as UEVErr - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - -@pytest.mark.parametrize(["genome", "asset", "tag"], [("rCRSd", "fasta", "default"), ("rCRSd", "fasta", "default")]) -@pytest.mark.parametrize("evs", [["NOT_A_VAR"], ["NOT_A_VAR", "RANDNAME"]]) -def test_missing_env_vars_in_genome_config_path_raises_exception(rgc, tmpdir, evs, genome, asset, tag, - remove_genome_folder, cfg_file_copy): - """ Unbound env var(s) in genome folder path cause error. """ - assert all(_is_unbound(v) for v in evs) - path_parts = ["$" + v for v in [tmpdir.strpath] + evs] - path = os.path.join(*path_parts) - print("Genome folder path: {}".format(path)) - rgc[CFG_FOLDER_KEY] = path - assert path == rgc[CFG_FOLDER_KEY] - assert not os.path.exists(path) - with pytest.raises(UEVErr) as err_ctx: - rgc.pull(genome=genome, asset=asset, tag=tag) - err_msg = str(err_ctx.value) - print("Observed error message: {}".format(err_msg)) - missing = [v for v in evs if v not in err_msg] - assert [] == missing - - -def _is_unbound(ev): - return os.getenv(ev) is None and ev not in os.environ diff --git a/tests/test_genome_config_format_error.py b/tests/test_genome_config_format_error.py deleted file mode 100644 index f34ec34c..00000000 --- a/tests/test_genome_config_format_error.py +++ /dev/null @@ -1,37 +0,0 @@ -""" Tests for genome config format exception """ - -import pytest -from refgenconf import * -from refgenconf.const import CFG_ASSETS_KEY -from refgenconf.exceptions import DOC_URL -from tests.conftest import bind_to_assets -from ubiquerg import powerset - -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - - -FIXED_KV_PAIRS = [ - (CFG_ASSET_SIZE_KEY, "1G"), (CFG_ARCHIVE_SIZE_KEY, "2G"), - (CFG_ARCHIVE_CHECKSUM_KEY, "dummy-checksum")] - - -@pytest.fixture -def base_rgc_data(tmpdir): - return {CFG_FOLDER_KEY: tmpdir.strpath, CFG_SERVER_KEY: DEFAULT_SERVER} - - -@pytest.fixture -def rgc(base_rgc_data): - return RefGenConf(base_rgc_data) - - -@pytest.mark.parametrize( - ["msg", "exp"], [(".", ". "), ("?", "? "), ("a", "a; ")]) -@pytest.mark.parametrize( - "check", [lambda m, e: m.startswith(e), lambda m, _: m.endswith(DOC_URL)]) -def test_config_format_error_message_formatting(msg, exp, check): - """ Check config format error message formatting and docs URL inclusion. 
""" - msg = str(GenomeConfigFormatError(msg)) - assert check(msg, exp) - diff --git a/tests/test_genomes.py b/tests/test_genomes.py index ccde7f20..d06cfc13 100644 --- a/tests/test_genomes.py +++ b/tests/test_genomes.py @@ -1,16 +1,23 @@ """ Tests for querying available reference genome assembly names """ from tests.conftest import get_conf_genomes +from refgenconf.const import CFG_GENOMES_KEY __author__ = "Vince Reuter" __email__ = "vreuter@virginia.edu" -def test_genomes_list(rgc): +def test_genomes_list(my_rgc): """ List of available genomes is as expected. """ - assert get_conf_genomes() == rgc.genomes_list() + listed_aliases = my_rgc.genomes_list() + digests = my_rgc[CFG_GENOMES_KEY].keys() + aliases = [my_rgc.get_genome_alias(digest=d) for d in digests] + assert aliases == listed_aliases -def test_genomes_str(rgc): +def test_genomes_str(my_rgc): """ Text of available genomes is as expected. """ - assert ", ".join(get_conf_genomes()) == rgc.genomes_str() + listed_aliases = my_rgc.genomes_str() + digests = my_rgc[CFG_GENOMES_KEY].keys() + aliases = [my_rgc.get_genome_alias(digest=d) for d in digests] + assert ", ".join(aliases) == listed_aliases diff --git a/tests/test_get_asset.py b/tests/test_get_asset.py index 236a087f..c60eb6eb 100644 --- a/tests/test_get_asset.py +++ b/tests/test_get_asset.py @@ -1,49 +1,48 @@ - """ Tests for RefGenConf.get_asset. These tests depend on successful completion of tests is test_1pull_asset.py """ import os import pytest from refgenconf.exceptions import * +from yacman.exceptions import UndefinedAliasError from refgenconf.const import * from tests.conftest import CONF_DATA from shutil import rmtree class TestGetAsset: - @pytest.mark.parametrize(["gname", "aname", "tname", "seek_key"], - [("rCRSd", "fasta", "default", "fasta"), ("rCRSd", "fasta", "default", None)]) + @pytest.mark.parametrize( + ["gname", "aname", "tname", "seek_key"], + [("rCRSd", "fasta", "default", "fasta"), ("rCRSd", "fasta", "default", None)], + ) def test_result(self, ro_rgc, gname, aname, tname, seek_key): assert isinstance(ro_rgc.seek(gname, aname, tname, seek_key), str) - @pytest.mark.parametrize(["gname", "aname", "tname", "seek_key", "etype"], - [("rCRSd", "missing", "default", None, MissingAssetError), - ("missing", "bowtie2_index", "default", None, MissingGenomeError), - ("rCRSd", "bowtie2_index", "missing", None, MissingTagError), - ("rCRSd", "bowtie2_index", "default", "missing", MissingSeekKeyError)]) + @pytest.mark.parametrize( + ["gname", "aname", "tname", "seek_key", "etype"], + [ + ("rCRSd", "missing", "default", None, MissingAssetError), + ("missing", "bowtie2_index", "default", None, UndefinedAliasError), + ("rCRSd", "bowtie2_index", "missing", None, MissingTagError), + ("rCRSd", "bowtie2_index", "default", "missing", MissingSeekKeyError), + ], + ) def test_all_exceptions(self, ro_rgc, gname, aname, tname, seek_key, etype): with pytest.raises(etype): ro_rgc.seek(gname, aname, tname, seek_key) @pytest.mark.parametrize("check_exist", [lambda: True, lambda _1, _2: True]) - @pytest.mark.parametrize( - ["gname", "aname"], [(g, a) for g, data in CONF_DATA for a in data]) + @pytest.mark.parametrize("gname,aname", [("human_repeats", "fasta")]) def test_check_exist_param_type(self, ro_rgc, check_exist, gname, aname): """ The asset existence check must be a one-arg function. 
""" with pytest.raises(TypeError): ro_rgc.seek(gname, aname, check_exist=check_exist) @pytest.mark.parametrize( - ["gname", "aname", "tname"], [("rCRSd", "fasta", "default"), ("mouse_chrM2x", "fasta", "default")]) + ["gname", "aname", "tname"], + [("rCRSd", "fasta", "default"), ("mouse_chrM2x", "fasta", "default")], + ) def test_result_correctness(self, ro_rgc, gname, aname, tname): """ The FASTA file asset is returned when fasta asset is requested, not the entire dir """ - assert os.path.join(ro_rgc[CFG_FOLDER_KEY], gname, aname, tname) != ro_rgc.seek(gname, aname, tname) - - @pytest.mark.parametrize( - ["gname", "aname", "tname", "seek_key"], - [("rCRSd", "fasta", "default", "fai"), - ("mouse_chrM2x", "fasta", "default", "fai")]) - def test_result_correctness_seek_keys(self, ro_rgc, gname, aname, tname, seek_key): - tag_data = ro_rgc[CFG_GENOMES_KEY][gname][CFG_ASSETS_KEY][aname][CFG_ASSET_TAGS_KEY][tname] - seek_key_value = tag_data[CFG_SEEK_KEYS_KEY][seek_key] - pth = os.path.join(ro_rgc[CFG_FOLDER_KEY], gname, aname, tname, seek_key_value) - assert pth == ro_rgc.seek(gname, aname, tname, seek_key) \ No newline at end of file + assert os.path.join(ro_rgc[CFG_FOLDER_KEY], gname, aname, tname) != ro_rgc.seek( + gname, aname, tname + ) diff --git a/tests/test_getseq.py b/tests/test_getseq.py index f65c92b2..0cb7f737 100644 --- a/tests/test_getseq.py +++ b/tests/test_getseq.py @@ -1,4 +1,3 @@ - """ Tests for RefGenConf.getseq. These tests depend on successful completion of tests is test_1pull_asset.py """ import pytest @@ -6,15 +5,17 @@ class TestGetSeq: - @pytest.mark.parametrize(["gname", "chr"], [("rCRSd", "rCRSd"), ("human_repeats", "U14567.1")]) + @pytest.mark.parametrize( + ["gname", "chr"], [("rCRSd", "rCRSd"), ("human_repeats", "U14567.1")] + ) def test_qetseq_just_chr(self, ro_rgc, gname, chr): assert isinstance(ro_rgc.getseq(genome=gname, locus=chr), FastaRecord) - @pytest.mark.parametrize(["gname", "chr"], - [("rCRSd", "rCRSd"), ("human_repeats", "U14567.1")]) - @pytest.mark.parametrize(["start", "end"], - [(1, 20), (2, 30), (1, 2), (2, 100)]) + @pytest.mark.parametrize( + ["gname", "chr"], [("rCRSd", "rCRSd"), ("human_repeats", "U14567.1")] + ) + @pytest.mark.parametrize(["start", "end"], [(1, 20), (2, 30), (1, 2), (2, 100)]) def test_qetseq_interval(self, ro_rgc, gname, chr, start, end): seq = ro_rgc.getseq(genome=gname, locus="{}:{}-{}".format(chr, start, end)) assert isinstance(seq, Sequence) - assert len(seq) == end-start + assert len(seq) == end - start diff --git a/tests/test_init.py b/tests/test_init.py index cda13ef8..fd49841a 100644 --- a/tests/test_init.py +++ b/tests/test_init.py @@ -7,7 +7,7 @@ import shutil -class TestAdd: +class TestInitialize: def test_init_exists(self): rgc = RefGenConf() tf = tempfile.NamedTemporaryFile(prefix="/tmp/", suffix=".yaml") @@ -31,4 +31,4 @@ def test_init_success(self): def test_invalid_path(self, pth): rgc = RefGenConf() with pytest.raises(TypeError): - rgc.initialize_config_file(filepath=pth) \ No newline at end of file + rgc.initialize_config_file(filepath=pth) diff --git a/tests/test_list_remote.py b/tests/test_list_remote.py index 89810d1f..18709adc 100644 --- a/tests/test_list_remote.py +++ b/tests/test_list_remote.py @@ -1,18 +1,43 @@ """ Tests for listing remotely available genomes and assets. 
""" from collections import OrderedDict -from refgenconf import RefGenConf, CFG_FOLDER_KEY, CFG_GENOMES_KEY, \ - CFG_SERVERS_KEY, DEFAULT_SERVER -from refgenconf.refgenconf import _download_json +from refgenconf import ( + RefGenConf, + CFG_FOLDER_KEY, + CFG_GENOMES_KEY, + CFG_SERVERS_KEY, + API_VERSION, +) +from refgenconf.helpers import download_json +import pytest -def test_list_remote(rgc, tmpdir): +@pytest.mark.parametrize( + "genome", [["human_repeats"], ["human_repeats", "rCRSd"], None] +) +def test_list_remote(my_rgc, genome): """ Verify expected behavior of remote genome/asset listing. """ - new_rgc = RefGenConf(entries={CFG_FOLDER_KEY: tmpdir.strpath, - CFG_SERVERS_KEY: [DEFAULT_SERVER], - CFG_GENOMES_KEY: rgc[CFG_GENOMES_KEY]}) - result = new_rgc.listr() - assert list(result.keys())[0].startswith(DEFAULT_SERVER) - for server_url, asset_dict in result.items(): - assert isinstance(asset_dict, OrderedDict) - assert len(asset_dict) == len(_download_json(DEFAULT_SERVER + "/genomes")) + assert len(my_rgc[CFG_SERVERS_KEY]) == 1, "Expected only one test server" + server = my_rgc[CFG_SERVERS_KEY][0] + result = my_rgc.listr(genome=genome, as_digests=True) + assert ( + len(result.keys()) == 1 + ), "More servers in list remote result than subscribed to" + server_key = list(result.keys())[0] + assert server_key.startswith(server) + json_genomes = download_json(server_key, params={"includeSeekKeys": True}) + if not genome: + assert len(json_genomes) == len(result[server_key]) + for g, assets in json_genomes.items(): + assert len(assets) == len(result[server_key][g]) + else: + assert len(genome) == len(result[server_key]) + + +def test_list_remote_faulty(my_rgc): + my_rgc[CFG_SERVERS_KEY].append("www.google.com") + assert len(my_rgc[CFG_SERVERS_KEY]) == 2, "Expected two test servers" + result = my_rgc.listr() + assert ( + len(result.keys()) == 1 + ), "More servers in list remote result than subscribed to" diff --git a/tests/test_listing.py b/tests/test_listing.py new file mode 100644 index 00000000..9c7e6b54 --- /dev/null +++ b/tests/test_listing.py @@ -0,0 +1,40 @@ +""" Basic RGC asset tests """ + +from collections import OrderedDict +import pytest +from refgenconf.const import CFG_GENOMES_KEY +from yacman.exceptions import UndefinedAliasError + +__author__ = "Michal Stolarczyk" +__email__ = "michal@virginia.edu" + + +class ListTest: + @pytest.mark.parametrize("gname", [None]) + def test_length(self, ro_rgc, all_genomes, gname): + """ + Verify asset dict is larger if no genome specified than ones that + are returned for a specific genome + """ + for g in all_genomes: + assert len(ro_rgc.list(genome=gname)) > len(ro_rgc.list(genome=g)) + + def test_multiple_genomes(self, ro_rgc, all_genomes): + """ Verify asset dict works with multiple genomes and returns all of them """ + assert sorted(ro_rgc.list(genome=all_genomes).keys()) == sorted( + ro_rgc.list().keys() + ) + + +class ListByGenomeTest: + def test_returns_entire_mapping_when_no_genonome_specified(self, my_rgc): + assert my_rgc.list_assets_by_genome() == my_rgc.list() + + def test_returns_list(self, my_rgc): + for g in my_rgc[CFG_GENOMES_KEY].keys(): + assert isinstance(my_rgc.list_assets_by_genome(genome=g), list) + + @pytest.mark.parametrize("gname", ["nonexistent", "genome"]) + def test_exception_on_nonexistent_genome(self, ro_rgc, gname): + with pytest.raises(UndefinedAliasError): + ro_rgc.list_assets_by_genome(genome=gname) diff --git a/tests/test_packaging.py b/tests/test_packaging.py index 8174d7ff..9ee20c50 100644 --- 
a/tests/test_packaging.py +++ b/tests/test_packaging.py @@ -14,19 +14,24 @@ def _is_custom_error(obj): @pytest.mark.parametrize( ["obj_name", "typecheck"], - [("RefGenConf", isclass), ("select_genome_config", isfunction), - ("DownloadJsonError", _is_custom_error), - ("GenomeConfigFormatError", _is_custom_error), - ("MissingAssetError", _is_custom_error), - ("MissingConfigDataError", _is_custom_error), - ("MissingGenomeError", _is_custom_error), - ("MissingSeekKeyError", _is_custom_error), - ("MissingTagError", _is_custom_error), - ("ConfigNotCompliantError", _is_custom_error), - ("UnboundEnvironmentVariablesError", _is_custom_error)]) + [ + ("RefGenConf", isclass), + ("select_genome_config", isfunction), + ("DownloadJsonError", _is_custom_error), + ("GenomeConfigFormatError", _is_custom_error), + ("MissingAssetError", _is_custom_error), + ("MissingConfigDataError", _is_custom_error), + ("MissingGenomeError", _is_custom_error), + ("MissingSeekKeyError", _is_custom_error), + ("MissingTagError", _is_custom_error), + ("ConfigNotCompliantError", _is_custom_error), + ("UnboundEnvironmentVariablesError", _is_custom_error), + ], +) def test_top_level_exports(obj_name, typecheck): """ At package level, validate object availability and type. """ import refgenconf + try: obj = getattr(refgenconf, obj_name) except AttributeError: diff --git a/tests/test_plugins.py b/tests/test_plugins.py index 88e10542..0aeb96df 100644 --- a/tests/test_plugins.py +++ b/tests/test_plugins.py @@ -30,15 +30,23 @@ def set_flag(rgc): raise FileExistsError("Flag file already exists: {}".format(pth)) -PLUGINS_DICT = {'pre_list': {'my_func': set_flag}, 'pre_pull': {}, - 'pre_tag': {}, 'pre_update': {}, 'post_tag': {}, - 'post_list': {}, 'post_pull': {}, 'post_update': {}} +PLUGINS_DICT = { + "pre_list": {"my_func": set_flag}, + "pre_pull": {}, + "pre_tag": {}, + "pre_update": {}, + "post_tag": {}, + "post_list": {}, + "post_pull": {}, + "post_update": {}, +} class TestPlugins: def test_prelist_plugins_called(self, cfg_file): - with mock.patch("refgenconf.refgenconf.RefGenConf.plugins", - new_callable=mock.PropertyMock) as mock_plugins: + with mock.patch( + "refgenconf.refgenconf.RefGenConf.plugins", new_callable=mock.PropertyMock + ) as mock_plugins: mock_plugins.return_value = PLUGINS_DICT rgc = RefGenConf(cfg_file, writable=False) rgc.list() @@ -52,4 +60,4 @@ def test_plugin_entrypoints_scanning(self, ro_rgc): in current Python environment. 
Properly defined ones are included in the plugins property return value """ - assert any([len(fun) > 0 for plugin, fun in ro_rgc.plugins.items()]) \ No newline at end of file + assert any([len(fun) > 0 for plugin, fun in ro_rgc.plugins.items()]) diff --git a/tests/test_removal.py b/tests/test_removal.py index 6fb25ff3..9a746d87 100644 --- a/tests/test_removal.py +++ b/tests/test_removal.py @@ -7,7 +7,10 @@ class TestRemoveAssets: - @pytest.mark.parametrize(["gname", "aname", "tname"], [("rCRSd", "fasta", None), ("mouse_chrM2x", "fasta", None)]) + @pytest.mark.parametrize( + ["gname", "aname", "tname"], + [("rCRSd", "fasta", None), ("mouse_chrM2x", "fasta", None)], + ) def test_default_tag_removal(self, my_rgc, gname, aname, tname): """ The default asset is removed if a specific tag is not provided """ with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True): @@ -16,14 +19,15 @@ def test_default_tag_removal(self, my_rgc, gname, aname, tname): with pytest.raises(MissingAssetError): my_rgc.seek(gname, aname, tname) - @pytest.mark.parametrize(["gname", "aname"], [("rCRSd", "fasta"), ("mouse_chrM2x", "fasta")]) + @pytest.mark.parametrize( + ["gname", "aname"], [("rCRSd", "fasta"), ("mouse_chrM2x", "fasta")] + ) def test_asset_removal_after_last_tag_removed(self, my_rgc, gname, aname): """ The asset is removed when the last tag is removed """ my_rgc.pull(gname, aname, "default") asset = my_rgc.genomes[gname].assets[aname] for t in asset[CFG_ASSET_TAGS_KEY]: - with mock.patch("refgenconf.refgenconf.query_yes_no", - return_value=True): + with mock.patch("refgenconf.refgenconf.query_yes_no", return_value=True): my_rgc.remove(gname, aname, t) with pytest.raises(MissingAssetError): my_rgc.seek(gname, aname, t) diff --git a/tests/test_select_genome_config.py b/tests/test_select_genome_config.py index 6667db12..f81a3f1b 100644 --- a/tests/test_select_genome_config.py +++ b/tests/test_select_genome_config.py @@ -14,7 +14,7 @@ def _touch(p): """ Ensure path existence, whether file or folder. """ if os.path.splitext(p)[1]: - with open(p, 'w'): + with open(p, "w"): pass else: os.makedirs(p) @@ -33,11 +33,14 @@ def test_select_null(): assert select_genome_config(None) is None -@pytest.mark.parametrize(["setup", "expect"], [ - (lambda d: d.join("test-conf.yaml").strpath, lambda _: Exception), - (lambda d: _touch(os.path.join(d.strpath, "test-conf")), lambda _: Exception), - (lambda d: _touch(d.join("test-conf.yaml").strpath), lambda fp: fp) -]) +@pytest.mark.parametrize( + ["setup", "expect"], + [ + (lambda d: d.join("test-conf.yaml").strpath, lambda _: Exception), + (lambda d: _touch(os.path.join(d.strpath, "test-conf")), lambda _: Exception), + (lambda d: _touch(d.join("test-conf.yaml").strpath), lambda fp: fp), + ], +) def test_select_local_config_file(tmpdir, setup, expect): """ Selection of local filepath hinges on its existence as a file """ with TmpEnv(overwrite=True, **{ev: "" for ev in CFG_ENV_VARS}): @@ -53,7 +56,7 @@ def test_select_via_env_var_implicit(env_var, tmpdir): """ Config file selection can leverage default environment variables. 
""" conf_file = tmpdir.join("test-refgenconf-conf.yaml").strpath assert not os.path.exists(conf_file) - with open(conf_file, 'w'): + with open(conf_file, "w"): pass assert os.path.isfile(conf_file) with TmpEnv(overwrite=True, **{env_var: conf_file}): diff --git a/tests/test_seqcol.py b/tests/test_seqcol.py new file mode 100644 index 00000000..ad738ba2 --- /dev/null +++ b/tests/test_seqcol.py @@ -0,0 +1,97 @@ +import pytest +from refgenconf.seqcol import * + +DEMO_FILES = ["demo.fa.gz", "demo2.fa", "demo3.fa", "demo4.fa", "demo5.fa.gz"] + +CMP_SETUP = [ + ( + ( + CONTENT_ALL_A_IN_B + + CONTENT_ALL_B_IN_A + + LENGTHS_ALL_A_IN_B + + LENGTHS_ALL_B_IN_A + + NAMES_ALL_A_IN_B + + NAMES_ALL_B_IN_A + + CONTENT_A_ORDER + + CONTENT_B_ORDER + + CONTENT_ANY_SHARED + + NAMES_ANY_SHARED + + LENGTHS_ANY_SHARED + ), + DEMO_FILES[1], + DEMO_FILES[1], + ), + ( + ( + CONTENT_ALL_A_IN_B + + LENGTHS_ALL_A_IN_B + + NAMES_ALL_A_IN_B + + CONTENT_A_ORDER + + CONTENT_B_ORDER + + CONTENT_ANY_SHARED + + LENGTHS_ANY_SHARED + + NAMES_ANY_SHARED + ), + DEMO_FILES[0], + DEMO_FILES[1], + ), + ( + ( + LENGTHS_ALL_B_IN_A + + CONTENT_ALL_B_IN_A + + CONTENT_ANY_SHARED + + LENGTHS_ANY_SHARED + + CONTENT_A_ORDER + + CONTENT_B_ORDER + ), + DEMO_FILES[2], + DEMO_FILES[4], + ), +] + + +class TestSCCGeneral: + def test_no_schemas_required(self): + """ + In contrast to the generic Henge object, SeqColClient does not + require schemas as input, they are predefined in the constructor + """ + assert isinstance(SeqColClient({}), SeqColClient) + + +class TestSCCFastaInserting: + @pytest.mark.parametrize("fasta_name", DEMO_FILES) + def test_fasta_loading_works(self, fasta_name, fasta_path): + scc = SeqColClient({}) + f = os.path.join(fasta_path, fasta_name) + print("Fasta file to be loaded: {}".format(f)) + res = scc.load_fasta(f, gzipped=fasta_name.endswith(".gz")) + assert len(res) == 2 # returns digest and list of AnnotatedSequencesList + + +class TestSCCRetrieval: + @pytest.mark.parametrize("fasta_name", DEMO_FILES) + def test_retrieval_works(self, fasta_name, fasta_path): + scc = SeqColClient({}) + f = os.path.join(fasta_path, fasta_name) + print("Fasta file to be loaded: {}".format(f)) + d, asds = scc.load_fasta(f, gzipped=fasta_name.endswith(".gz")) + # convert integers in the dicts to strings + lst = [ + {k: str(v) if isinstance(v, int) else v for k, v in asd.items()} + for asd in asds + ] + assert scc.retrieve(d) == lst + + +class TestSCCCompare: + @pytest.mark.parametrize(["code", "fasta1", "fasta2"], CMP_SETUP) + def test_fasta_compare(self, code, fasta1, fasta2, fasta_path): + scc = SeqColClient({}) + d, _ = scc.load_fasta( + os.path.join(fasta_path, fasta1), gzipped=fasta1.endswith(".gz") + ) + d2, _ = scc.load_fasta( + os.path.join(fasta_path, fasta2), gzipped=fasta2.endswith(".gz") + ) + assert scc.compare(d, d2) == code diff --git a/tests/test_table.py b/tests/test_table.py new file mode 100644 index 00000000..d2655800 --- /dev/null +++ b/tests/test_table.py @@ -0,0 +1,15 @@ +import pytest +import os + + +class TestAliasTable: + def test_alias_table_dimensions(self, my_rgc): + assert len(my_rgc.genomes_list()) == my_rgc.genome_aliases_table.row_count + assert len(my_rgc.genome_aliases_table.columns) == 2 + + +class TestAssetTable: + def test_asset_table_dimensions(self, my_rgc): + assert my_rgc.genome_aliases_table.row_count == len( + my_rgc.list_assets_by_genome() + ) diff --git a/tests/test_update_genomes.py b/tests/test_update_genomes.py index c8383a96..6b4faacb 100644 --- a/tests/test_update_genomes.py +++ 
b/tests/test_update_genomes.py @@ -2,8 +2,13 @@ import pytest from attmap import PathExAttMap -from refgenconf import CFG_FOLDER_KEY, CFG_GENOMES_KEY, CFG_SERVER_KEY, \ - DEFAULT_SERVER, RefGenConf as RGC +from refgenconf import ( + CFG_FOLDER_KEY, + CFG_GENOMES_KEY, + CFG_SERVER_KEY, + DEFAULT_SERVER, + RefGenConf as RGC, +) from refgenconf.const import CFG_ASSETS_KEY from tests.conftest import bind_to_assets, get_conf_genomes, CONF_DATA @@ -15,81 +20,53 @@ def _asset_data_is_pxam(a, g, c): return isinstance(c[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY][a], PathExAttMap) -@pytest.fixture(scope="function") -def rgc(tmpdir): - """ Provide an RGC instance; avoid disk read/write and stay in memory. """ - return RGC(entries={CFG_GENOMES_KEY: dict(CONF_DATA), - CFG_FOLDER_KEY: tmpdir.strpath, - CFG_SERVER_KEY: "http://staging.refgenomes.databio.org/"}) - - -@pytest.mark.parametrize("assembly", ["dm3"]) -@pytest.mark.parametrize("validate", [ - lambda g, c: g in c[CFG_GENOMES_KEY], - lambda g, c: isinstance(c[CFG_GENOMES_KEY], PathExAttMap)]) -def test_new_genome(rgc, assembly, validate): - """ update_genomes can insert new assembly. """ - assert assembly not in rgc[CFG_GENOMES_KEY] - rgc.update_assets(assembly) - assert validate(assembly, rgc) - - -@pytest.mark.parametrize("assembly", get_conf_genomes()) +@pytest.mark.parametrize("assembly", ["human_repeats", "rCRSd"]) @pytest.mark.parametrize("asset", ["brand_new_asset", "align_index"]) -@pytest.mark.parametrize("validate", [ - lambda a, g, c: a in c[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY], - _asset_data_is_pxam]) -def test_new_asset(rgc, assembly, asset, validate): +@pytest.mark.parametrize( + "validate", + [lambda a, g, c: a in c[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY], _asset_data_is_pxam], +) +def test_new_asset(my_rgc, assembly, asset, validate): """ update_genomes can insert new asset for existing assembly. """ - assert assembly in rgc[CFG_GENOMES_KEY] - assert asset not in rgc[CFG_GENOMES_KEY][assembly][CFG_ASSETS_KEY] - rgc.update_assets(assembly, asset) - assert validate(asset, assembly, rgc) - - -@pytest.mark.parametrize("assembly", ["dm3"]) -@pytest.mark.parametrize("asset", ["brand_new_asset", "align_index"]) -@pytest.mark.parametrize("validate", [ - lambda _, g, c: g in c[CFG_GENOMES_KEY], - lambda a, g, c: a in c[CFG_GENOMES_KEY][g][CFG_ASSETS_KEY], - lambda a, g, c: isinstance(c[CFG_GENOMES_KEY][g], PathExAttMap), - _asset_data_is_pxam -]) -def test_new_genome_and_asset(rgc, assembly, asset, validate): - """ update_genomes can insert assembly and asset. """ - assert assembly not in rgc[CFG_GENOMES_KEY] - rgc.update_assets(assembly, asset) - assert validate(asset, assembly, rgc) - - -@pytest.mark.parametrize(["old_data", "new_data", "expected"], [ - ({"size": "4G"}, {"path": "/home/res/gen/bt2.hg38"}, - {"size": "4G", "path": "/home/res/gen/bt2.hg38"}), - ({}, {"size": "4G"}, {"size": "4G"}), - ({}, {"path": "/home/res/gen/bt2.hg38"}, {"path": "/home/res/gen/bt2.hg38"}), - ({}, {"size": "4G", "path": "/home/res/gen/bt2.hg38"}, - {"size": "4G", "path": "/home/res/gen/bt2.hg38"}), - ({"size": "4G"}, {"size": "2G"}, {"size": "2G"}) -]) -def test_update_asset_data(tmpdir, old_data, new_data, expected): - """ update_genomes can modify data for existing assembly and asset. 
""" - assembly = "hg38" - asset = "idx_bt2" - c = RGC(entries={CFG_GENOMES_KEY: {assembly: bind_to_assets({asset: old_data})}, - CFG_FOLDER_KEY: tmpdir.strpath, - CFG_SERVER_KEY: DEFAULT_SERVER}) - - def get_asset_data(refgencfg, a_name): - return refgencfg[CFG_GENOMES_KEY][assembly][CFG_ASSETS_KEY][a_name].to_dict() - assert expected != get_asset_data(c, asset) - c.update_assets(assembly, asset, new_data) - assert expected == get_asset_data(c, asset) - - -@pytest.mark.parametrize("args", [ - ("hg38", ["a1", "a2"]), (["g1", "g2"], "new_tool_index"), - ("mm10", "align_index", "not_a_map")]) -def test_illegal_argtype(rgc, args): + assert assembly in my_rgc[CFG_GENOMES_KEY] + assert asset not in my_rgc[CFG_GENOMES_KEY][assembly][CFG_ASSETS_KEY] + my_rgc.update_assets(assembly, asset) + assert validate(asset, assembly, my_rgc) + + +# @pytest.mark.parametrize(["old_data", "new_data", "expected"], [ +# ({"size": "4G"}, {"path": "/home/res/gen/bt2.hg38"}, +# {"size": "4G", "path": "/home/res/gen/bt2.hg38"}), +# ({}, {"size": "4G"}, {"size": "4G"}), +# ({}, {"path": "/home/res/gen/bt2.hg38"}, {"path": "/home/res/gen/bt2.hg38"}), +# ({}, {"size": "4G", "path": "/home/res/gen/bt2.hg38"}, +# {"size": "4G", "path": "/home/res/gen/bt2.hg38"}), +# ({"size": "4G"}, {"size": "2G"}, {"size": "2G"}) +# ]) +# def test_update_asset_data(tmpdir, old_data, new_data, expected): +# """ update_genomes can modify data for existing assembly and asset. """ +# assembly = "hg38" +# asset = "idx_bt2" +# c = RGC(entries={CFG_GENOMES_KEY: {assembly: bind_to_assets({asset: old_data})}, +# CFG_FOLDER_KEY: tmpdir.strpath, +# CFG_SERVER_KEY: DEFAULT_SERVER}) +# +# def get_asset_data(refgencfg, a_name): +# return refgencfg[CFG_GENOMES_KEY][assembly][CFG_ASSETS_KEY][a_name].to_dict() +# assert expected != get_asset_data(c, asset) +# c.update_assets(assembly, asset, new_data) +# assert expected == get_asset_data(c, asset) + + +@pytest.mark.parametrize( + "args", + [ + ("human_repeats", ["a1", "a2"]), + (["g1", "g2"], "new_tool_index"), + ("rCRSd", "align_index", "not_a_map"), + ], +) +def test_illegal_argtype(my_rgc, args): """ update_genomes accurately restricts argument types. 
""" with pytest.raises(TypeError): - rgc.update_assets(*args) + my_rgc.update_assets(*args) diff --git a/tests/test_update_servers.py b/tests/test_update_servers.py index 464ad06a..81af9c6b 100644 --- a/tests/test_update_servers.py +++ b/tests/test_update_servers.py @@ -2,6 +2,7 @@ import pytest from refgenconf.const import * +from .conftest import TEST_SERVER class TestUpdateServers: @@ -18,20 +19,19 @@ def test_faulty_url(self, my_rgc, urls): @pytest.mark.parametrize("urls", [["www.new_url.com", "www.url.pl"]]) def test_multiple_urls(self, my_rgc, urls): my_rgc.subscribe(urls=urls) - assert urls[0] in my_rgc[CFG_SERVERS_KEY] and \ - urls[1] in my_rgc[CFG_SERVERS_KEY] + assert urls[0] in my_rgc[CFG_SERVERS_KEY] and urls[1] in my_rgc[CFG_SERVERS_KEY] @pytest.mark.parametrize("urls", [["www.new_url.com", "www.new_url.com"]]) def test_reset(self, my_rgc, urls): my_rgc.subscribe(urls=urls, reset=True) assert len(my_rgc[CFG_SERVERS_KEY]) == 1 - @pytest.mark.parametrize("urls", [["http://refgenomes.databio.org"]]) + @pytest.mark.parametrize("urls", [[TEST_SERVER]]) def test_reset(self, my_rgc, urls): my_rgc.subscribe(urls=urls, reset=True) assert len(my_rgc[CFG_SERVERS_KEY]) == 1 - @pytest.mark.parametrize("urls", [["http://refgenomes.databio.org"]]) + @pytest.mark.parametrize("urls", [[TEST_SERVER]]) def test_unsubscribe(self, my_rgc, urls): my_rgc.subscribe(urls=urls) my_rgc.unsubscribe(urls=urls) @@ -39,7 +39,7 @@ def test_unsubscribe(self, my_rgc, urls): @pytest.mark.parametrize("urls", [["http://refge"], ["what"]]) def test_unsubscribe_invalid(self, my_rgc, urls): - my_rgc.subscribe(urls=["http://refgenomes.databio.org"]) + my_rgc.subscribe(urls=[TEST_SERVER]) servers = my_rgc[CFG_SERVERS_KEY] my_rgc.unsubscribe(urls=urls) assert my_rgc[CFG_SERVERS_KEY] == servers diff --git a/tests/test_upgrade.py b/tests/test_upgrade.py new file mode 100644 index 00000000..f036a3ce --- /dev/null +++ b/tests/test_upgrade.py @@ -0,0 +1,76 @@ +import mock +import os +import pytest +from refgenconf import upgrade_config +from refgenconf.refgenconf_v03 import _RefGenConfV03 +from refgenconf.const import * +from refgenconf.exceptions import * +from refgenconf.refgenconf import _download_url_progress +from refgenconf import RefGenConf + +import urllib.request + +__author__ = "Michal Stolarczyk" +__email__ = "michal@virginia.edu" + + +DOWNLOAD_FUNCTION = f"refgenconf.refgenconf.{_download_url_progress.__name__}" + + +class TestUpgradeExceptions: + def test_cfg_v03_errors_with_new_constructor(self, cfg_file_old): + with pytest.raises(ConfigNotCompliantError): + RefGenConf(filepath=cfg_file_old) + + @pytest.mark.parametrize("target_version", ["0.5", 0.1, "IDK", [1, 2, 3]]) + def test_unavailable_conversions(self, target_version, cfg_file_old): + with pytest.raises(NotImplementedError): + upgrade_config(filepath=cfg_file_old, target_version=target_version) + + +class TestUpgrade03to04: + @pytest.mark.parametrize("genome", ["human_repeats", "rCRSd"]) + def test_get_old_data(self, cfg_file_old, genome): + old_rgc = _RefGenConfV03(cfg_file_old) + # get some old asset data on disk + with mock.patch("refgenconf.refgenconf_v03.query_yes_no", return_value=True): + print(f"\nPulling: {genome}/fasta:default\n") + old_rgc.pull(genome=genome, asset="fasta", tag="default") + + def test_all_server_local_mix(self, cfg_file_old): + """ + Test config upgrade from v0.3 to v0.4 when a mix of genomes in terms of + remote digest availability is in defined the old config + """ + old_rgc = _RefGenConfV03(cfg_file_old) + # get some old 
asset data on disk + g, a, t = "human_alu", "fasta", "default" + try: + pth = old_rgc.seek(g, "fasta", "default", strict_exists=True) + except MissingGenomeError: + src_url = f"http://big.databio.org/refgenie_raw/files.{g}.{a}.{a}" + target_archive = f"/tmp/old/{g}.fa.gz" + target_file = f"/tmp/old/{g}.fa" + target_dir = f"/tmp/old/{g}/{a}/{t}" + os.makedirs(target_dir, exist_ok=True) + urllib.request.urlretrieve(src_url, target_archive) + from subprocess import run + + run( + f"gunzip {target_archive}; " f"mv {target_file} {target_dir}", + shell=True, + ) + old_rgc.add( + path=target_dir, + genome=g, + asset=a, + tag="default", + seek_keys={a: f"{g}.fa"}, + force=True, + ) + else: + print(f"{pth} exists") + finally: + upgrade_config(filepath=cfg_file_old, target_version="0.4", force=True) + rgc = RefGenConf(cfg_file_old) + assert rgc[CFG_VERSION_KEY] == REQ_CFG_VERSION diff --git a/tests/test_utils.py b/tests/test_utils.py index 356c1423..6ed16f48 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -3,11 +3,11 @@ @pytest.mark.parametrize(["genome", "asset", "tag"], [("rCRSd", "fasta", "default")]) -def test_is_asset_complete_returns_correct_result(genome, asset, tag, ro_rgc): - ro_rgc.pull(genome, asset, tag) - assert ro_rgc.is_asset_complete(genome, asset, tag) +def test_is_asset_complete_returns_correct_result(genome, asset, tag, my_rgc): + my_rgc.pull(genome, asset, tag) + assert my_rgc.is_asset_complete(genome, asset, tag) @pytest.mark.parametrize("genome", ["rCRSd"]) -def test_get_genome_attributes(genome, ro_rgc): - assert isinstance(ro_rgc.get_genome_attributes(genome), Mapping) +def test_get_genome_attributes(genome, my_rgc): + assert isinstance(my_rgc.get_genome_attributes(genome), Mapping)
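The test changes above exercise most of the new v0.10.0 surface; the short sketches below illustrate the usage patterns they encode. They are illustrative only: each assumes an initialized RefGenConf with the relevant assets pulled, and any path or name not present in the diff is hypothetical. The updated test_genomes.py shows the new alias layer, resolving each sequence-derived digest in the config back to its human-readable alias:

from refgenconf import RefGenConf
from refgenconf.const import CFG_GENOMES_KEY

rgc = RefGenConf(filepath="genome_config.yaml")  # hypothetical config path
digests = list(rgc[CFG_GENOMES_KEY].keys())  # top-level keys are now digests
aliases = [rgc.get_genome_alias(digest=d) for d in digests]
assert aliases == rgc.genomes_list()  # genomes_list() reports aliases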
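The reworked TestGetAsset expectations show that an unresolvable genome name now surfaces as yacman's UndefinedAliasError rather than MissingGenomeError. A hedged sketch of the resulting error handling around seek(), assuming an rgc with the rCRSd fasta asset pulled:

from refgenconf.exceptions import (
    MissingAssetError,
    MissingSeekKeyError,
    MissingTagError,
)
from yacman.exceptions import UndefinedAliasError

try:
    path = rgc.seek("rCRSd", "fasta", "default", "fasta")
except UndefinedAliasError:
    print("genome name could not be resolved to a digest")
except MissingAssetError:
    print("genome is known, but the asset is not")
except (MissingTagError, MissingSeekKeyError) as err:
    print(f"tag or seek key not found: {err}")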
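test_list_remote also documents the shape of the new listr return value: a mapping keyed by server URL, with unreachable servers dropped from the result (per test_list_remote_faulty). A sketch, again assuming a configured rgc:

# genomes may be requested by alias or, with as_digests=True, listed by digest
result = rgc.listr(genome=["rCRSd", "human_repeats"], as_digests=True)
for server_url, assets_by_genome in result.items():  # responsive servers only
    for genome_digest, assets in assets_by_genome.items():
        print(server_url, genome_digest, assets)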
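The PLUGINS_DICT mock in test_plugins.py doubles as a spec for the plugins property: eight hook points, each mapping plugin names to callables that receive the RefGenConf instance as their only argument. A self-contained sketch of building such a mapping with one pre_list hook (the hook name and function are illustrative):

def announce(rgc):
    # runs before rgc.list(); receives the RefGenConf instance
    print("about to list assets")

hooks = {
    h: {}
    for h in (
        "pre_list", "post_list", "pre_pull", "post_pull",
        "pre_tag", "post_tag", "pre_update", "post_update",
    )
}
hooks["pre_list"]["announce"] = announce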
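CMP_SETUP in test_seqcol.py builds its expected comparison codes by summing flag constants, which suggests (though the diff does not state it) that SeqColClient.compare returns a bitwise combination of disjoint power-of-two flags. Under that assumption, a sketch of comparing two of the demo FASTA files; the bare file names stand in for paths from the tests' fasta_path fixture:

from refgenconf.seqcol import CONTENT_ALL_A_IN_B, SeqColClient

scc = SeqColClient({})  # no schemas needed; they are predefined
d1, _ = scc.load_fasta("demo.fa.gz", gzipped=True)
d2, _ = scc.load_fasta("demo2.fa", gzipped=False)
code = scc.compare(d1, d2)
if code & CONTENT_ALL_A_IN_B:  # valid only if flags are disjoint bit flags
    print("all sequences in collection A are contained in B")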
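Finally, test_upgrade.py pins down the migration flow: a v0.3 config makes the new constructor raise ConfigNotCompliantError, and upgrade_config rewrites the file in place to v0.4. A sketch with a hypothetical config path:

from refgenconf import RefGenConf, upgrade_config
from refgenconf.exceptions import ConfigNotCompliantError

old_cfg = "/tmp/old/genome_config.yaml"  # hypothetical v0.3 config file
try:
    rgc = RefGenConf(filepath=old_cfg)
except ConfigNotCompliantError:
    upgrade_config(filepath=old_cfg, target_version="0.4", force=True)
    rgc = RefGenConf(filepath=old_cfg)  # loads cleanly after migration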