From 5d8b6f739a4be2652562ec6f8c6c644a1b74b6d0 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 8 Apr 2024 14:31:38 +0000 Subject: [PATCH] No more non-unique column names, or non-queryable columns in list_columns --- src/genomic_features/ensembl/ensembldb.py | 17 +++++++++++------ tests/test_columns.py | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/genomic_features/ensembl/ensembldb.py b/src/genomic_features/ensembl/ensembldb.py index 93bc8ae..ffa9dc5 100644 --- a/src/genomic_features/ensembl/ensembldb.py +++ b/src/genomic_features/ensembl/ensembldb.py @@ -19,9 +19,7 @@ PKG_CACHE_DIR = "genomic-features" -BIOC_ANNOTATION_HUB_URL = ( - "https://bioconductorhubs.blob.core.windows.net/annotationhub" -) +BIOC_ANNOTATION_HUB_URL = "https://bioconductorhubs.blob.core.windows.net/annotationhub" ANNOTATION_HUB_URL = ( "https://annotationhub.bioconductor.org/metadata/annotationhub.sqlite3" ) @@ -53,7 +51,7 @@ def annotation( """ try: sqlite_file_path = retrieve_annotation( - f'{BIOC_ANNOTATION_HUB_URL}/AHEnsDbs/v{version}/EnsDb.{species}.v{version}.sqlite' + f"{BIOC_ANNOTATION_HUB_URL}/AHEnsDbs/v{version}/EnsDb.{species}.v{version}.sqlite" ) if backend == "sqlite": @@ -440,12 +438,19 @@ def _get_required_tables(self, tab) -> list: return self._tables_by_degree(tab) def list_columns(self, tables: str | list[str] | None = None) -> list[str]: - """List all columns available in the genomic features table.""" + """List queryable columns available in these tables.""" if tables is None: tables = self.db.list_tables() # list of table names + if "metadata" in tables: + tables.remove("metadata") elif isinstance(tables, str): tables = [tables] # list of tables names (only one) - columns = [c for t in tables for c in self.db.table(t).columns] + + columns = [] + for t in tables: + for c in self.db.table(t).columns: + if c not in columns: + columns.append(c) return columns def _clean_columns(self, columns: list[str]) -> list[str]: diff --git a/tests/test_columns.py b/tests/test_columns.py index 29bb2e4..49ea628 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -125,3 +125,20 @@ def test_chromosome_columns(hsapiens108): .reset_index(drop=True) ) pd.testing.assert_series_equal(result["seq_length"], expected_lengths) + + +def test_list_columns_uniqueness(hsapiens108): + # https://github.com/scverse/genomic-features/issues/42 + cols = hsapiens108.list_columns() + assert len(cols) == len(set(cols)) + + cols = hsapiens108.list_columns(["gene", "tx"]) + assert len(cols) == len(set(cols)) + + +def test_list_columns_include_unqueryable_cols(hsapiens108): + # https://github.com/scverse/genomic-features/issues/42 + cols = hsapiens108.list_columns() + # From metadata + assert "value" not in cols + assert "name" not in cols