From bae82d20cce34045c32936972ba68a385b6f8209 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix=20Raimundo?= Date: Thu, 4 Apr 2024 11:35:08 -0400 Subject: [PATCH] Some cleanup on the ensembldb code (#67) * Some cleanup on the ensembldb code * fix __repr__ * fix comments * Update src/genomic_features/ensembl/ensembldb.py --------- Co-authored-by: Felix Raimundo Co-authored-by: Isaac Virshup --- src/genomic_features/ensembl/ensembldb.py | 19 +++++++++++-------- tests/test_basic.py | 2 +- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index 5d96d29..93bc8ae 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -20,10 +20,7 @@ PKG_CACHE_DIR = "genomic-features" BIOC_ANNOTATION_HUB_URL = ( - "https://bioconductorhubs.blob.core.windows.net/annotationhub/" -) -ENSEMBL_URL_TEMPLATE = ( - BIOC_ANNOTATION_HUB_URL + "AHEnsDbs/v{version}/EnsDb.{species}.v{version}.sqlite" + "https://bioconductorhubs.blob.core.windows.net/annotationhub" ) ANNOTATION_HUB_URL = ( "https://annotationhub.bioconductor.org/metadata/annotationhub.sqlite3" @@ -56,7 +53,7 @@ def annotation( """ try: sqlite_file_path = retrieve_annotation( - ENSEMBL_URL_TEMPLATE.format(species=species, version=version) + f'{BIOC_ANNOTATION_HUB_URL}/AHEnsDbs/v{version}/EnsDb.{species}.v{version}.sqlite' ) if backend == "sqlite": @@ -74,7 +71,8 @@ def annotation( except HTTPError as err: if err.response.status_code == 404: raise ValueError( - f"No Ensembl database found for {species} v{version}. Check available versions with `genomic_features.ensembl.list_versions`." + f"No Ensembl database found for {species} v{version}. Check " + f"available versions with `genomic_features.ensembl.list_ensdb_annotations `." ) from err else: raise HTTPError from err @@ -125,7 +123,8 @@ def list_ensdb_annotations(species: None | str | list[str] = None) -> DataFrame: # check that species exist if version_table.shape[0] == 0: raise ValueError( - f"No Ensembl database found for {species}. Check species name." + f"No Ensembl database found for {species}. Available species can " + f"be found via: `list_ensdb_annotations()['Species'].unique()`." ) version_table["Ensembl_version"] = version_table["rdatapath"].str.split( @@ -153,7 +152,11 @@ def metadata(self) -> dict: def __repr__(self) -> str: d = self.metadata - return f"EnsemblDB(organism='{d['Organism']}', ensembl_release='{d['ensembl_version']}')" + return ( + f"EnsemblDB(organism='{d['Organism']}', " + f"ensembl_release='{d['ensembl_version']}', " + f"genome_build='{d['genome_build']}')" + ) def genes( self, diff --git a/tests/test_basic.py b/tests/test_basic.py index 3e90722..c127400 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -20,7 +20,7 @@ def test_missing_version(): def test_repr(): result = repr(gf.ensembl.annotation("Hsapiens", 108)) - expected = "EnsemblDB(organism='Homo sapiens', ensembl_release='108')" + expected = "EnsemblDB(organism='Homo sapiens', ensembl_release='108', genome_build='GRCh38')" assert result == expected