NickAkhmetov/10X Builder, adjust visium spot opacity (#85)

hubmapconsortium · Apr 17, 2024 · 7e4d8ee · 7e4d8ee
1 parent 085911d
commit 7e4d8ee
Show file tree

Hide file tree

Showing 14 changed files with 2,239 additions and 31 deletions.
diff --git a/VERSION.txt b/VERSION.txt
@@ -1 +1 @@
-0.2.0
+0.2.1
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,5 +1,5 @@
 pytest==5.2.1
-flake8==3.7.8
+flake8==7.0.0
 autopep8==1.4.4
 pytest-mock==3.7.0
 coverage==6.3.1

diff --git a/setup.cfg b/setup.cfg
@@ -20,18 +20,19 @@ package_dir =
 packages = find:
 python_requires = >=3.7
 install_requires =
-    vitessce==3.2.3
+    vitessce==3.2.5
     hubmap-commons>=2.0.12
     requests>=2.27.1
     nbformat==5.1.3
-    zarr>=2.8.0
+    zarr>=2.17.2
     aiohttp>=3.8.1
     fsspec>=2022.1.0
 
 [options.packages.find]
 where = src
 
 [flake8]
-max-line-length = 99
-ignore =
-  W503 # "line break before binary operator": Prefer operator at start of line so the context is clear.
+max-line-length = 120
+ignore = W503 
+
+# W503: "line break before binary operator": Prefer operator at start of line so the context is clear.
diff --git a/src/portal_visualization/builder_factory.py b/src/portal_visualization/builder_factory.py
@@ -10,6 +10,7 @@
     NanoDESIViewConfBuilder
 )
 from .builders.anndata_builders import (
+    MultiomicAnndataZarrViewConfBuilder,
     SpatialRNASeqAnnDataZarrViewConfBuilder,
     RNASeqAnnDataZarrViewConfBuilder,
     SpatialMultiomicAnnDataZarrViewConfBuilder
@@ -36,8 +37,9 @@ def process_hints(hints):
     is_codex = "codex" in hints
     is_anndata = "anndata" in hints
     is_json = "json_based" in hints
+    is_spatial = "spatial" in hints
 
-    return is_image, is_rna, is_atac, is_sprm, is_codex, is_anndata, is_json
+    return is_image, is_rna, is_atac, is_sprm, is_codex, is_anndata, is_json, is_spatial
 
 
 # This function is the main entrypoint for the builder factory.
@@ -52,7 +54,8 @@ def get_view_config_builder(entity, get_assaytype, parent=None):
     assay = get_assaytype(entity)
     assay_name = assay.get('assaytype')
     hints = assay.get('vitessce-hints', [])
-    is_image, is_rna, is_atac, is_sprm, is_codex, is_anndata, is_json = process_hints(hints)
+    is_image, is_rna, is_atac, is_sprm, is_codex, is_anndata, is_json, is_spatial = process_hints(
+        hints)
     if is_image:
         if is_rna:
             # e.g. Visium (no probes) [Salmon + Scanpy]
@@ -91,6 +94,10 @@ def get_view_config_builder(entity, get_assaytype, parent=None):
                 return ImagePyramidViewConfBuilder
 
     if is_rna:
+        # multiomic mudata, e.g. 10x Multiome, SNARE-Seq, etc.
+        # e.g. 272789a950b2b5d4b9387a1cf66ad487 on dev
+        if is_atac:
+            return MultiomicAnndataZarrViewConfBuilder
         if is_json:
             # e.g. c019a1cd35aab4d2b4a6ff221e92aaab
             return RNASeqViewConfBuilder

diff --git a/src/portal_visualization/builders/anndata_builders.py b/src/portal_visualization/builders/anndata_builders.py
@@ -3,10 +3,12 @@
 from vitessce import (
     VitessceConfig,
     AnnDataWrapper,
+    MultivecZarrWrapper,
     Component as cm,
     CoordinationType as ct,
     ImageOmeTiffWrapper,
     CoordinationLevel as CL,
+    ViewType as vt,
     get_initial_coordination_scope_prefix
 )
 
@@ -64,6 +66,12 @@ def is_annotated(self):
         else:
             return False
 
+    @cached_property
+    def has_marker_genes(self):
+        """Whether the zarr store contains an 'obs/marker_gene_0' entry."""
+        # Membership test yields an explicit bool instead of falling through to None.
+        return 'obs/marker_gene_0' in self.zarr_store
+
     def get_conf_cells(self, marker=None):
         zarr_path = 'hubmap_ui/anndata-zarr/secondary_analysis.zarr'
         file_paths_found = [file["rel_path"] for file in self._entity["files"]]
@@ -151,7 +159,15 @@ def _set_up_dataset(self, vc):
         ))
         return dataset
 
-    def _set_up_obs_labels(self):
+    def _set_up_obs_labels(self,
+                           additional_obs_labels_paths=[],
+                           additional_obs_labels_names=[],
+                           additional_obs_set_paths=[],
+                           additional_obs_set_names=[],
+                           # Optionally skip default obs paths and labels
+                           skip_default_paths=False,
+                           # Support multiomic datasets
+                           modality_prefix=None):
         # Some of the keys (like marker_genes_for_heatmap) here are from our pipeline
         # https://github.com/hubmapconsortium/portal-containers/blob/master/containers/anndata-to-ui
         # while others come from Matt's standard scanpy pipeline
@@ -163,26 +179,33 @@ def _set_up_obs_labels(self):
         # or help map predicted cell labels to their IDs
         obs_label_paths = []
         obs_label_names = []
-        dags = [
-            dag for dag in self._entity['metadata']['dag_provenance_list']
-            if 'name' in dag]
+
+        # Add additional obs labels and sets if provided
+        obs_set_paths.extend(additional_obs_set_paths)
+        obs_set_names.extend(additional_obs_set_names)
+        obs_label_paths.extend(additional_obs_labels_paths)
+        obs_label_names.extend(additional_obs_labels_names)
+
         z = self.zarr_store
-        if (any(['azimuth-annotate' in dag['origin'] for dag in dags])):
+        obs = z['obs'] if modality_prefix is None else z[f'{modality_prefix}/obs']
+
+        if not skip_default_paths:
             if self.is_annotated:
-                if 'predicted.ASCT.celltype' in z['obs']:
+                if 'predicted.ASCT.celltype' in obs:
                     obs_set_paths.append("obs/predicted.ASCT.celltype")
                     obs_set_names.append("Predicted ASCT Cell Type")
-                if 'predicted_label' in z['obs']:
+                if 'predicted_label' in obs:
                     obs_set_paths.append("obs/predicted_label")
                     obs_set_names.append("Cell Ontology Annotation")
-                if 'predicted_CLID' in z['obs']:
+                if 'predicted_CLID' in obs:
                     obs_label_paths.append("obs/predicted_CLID")
                     obs_label_names.append("Predicted CL ID")
+            obs_set_paths.append("obs/leiden")
+            obs_set_names.append("Leiden")
+        if self.has_marker_genes:
+            obs_label_paths.extend(RNA_SEQ_ANNDATA_FACTOR_PATHS)
+            obs_label_names.extend(RNA_SEQ_FACTOR_LABEL_NAMES)
 
-        obs_set_paths.append("obs/leiden")
-        obs_set_names.append("Leiden")
-        obs_label_paths.extend(RNA_SEQ_ANNDATA_FACTOR_PATHS)
-        obs_label_names.extend(RNA_SEQ_FACTOR_LABEL_NAMES)
         self._obs_set_paths = obs_set_paths
         self._obs_set_names = obs_set_names
         self._obs_labels_paths = obs_label_paths
@@ -387,8 +410,210 @@ def _setup_anndata_view_config(self, vc, dataset):
         }, scope_prefix=get_initial_coordination_scope_prefix(self._uuid, 'image'))
         vc.link_views_by_dict(spatial_views, {
             "spotLayer": CL([{
-                "spatialLayerOpacity": 0.5,
+                "spatialLayerOpacity": 1,
                 "spatialSpotRadius": self._get_scale_factor(),
             }]),
         }, scope_prefix=get_initial_coordination_scope_prefix(self._uuid, 'obsSpots'))
         return vc
+
+
+class MultiomicAnndataZarrViewConfBuilder(RNASeqAnnDataZarrViewConfBuilder):
+    """Wrapper class for creating a AnnData-backed view configuration
+    for multiomic data from mudata-to-ui.cwl like 10X Multiome
+    TODO: Provide specific link to example dataset
+    """
+
+    def __init__(self, entity, groups_token, assets_endpoint, **kwargs):
+        super().__init__(entity, groups_token, assets_endpoint, **kwargs)
+        self._scatterplot_w = 3
+
+    @cached_property
+    def zarr_store(self):
+        zarr_path = 'hubmap_ui/mudata-zarr/secondary_analysis.zarr'
+        request_init = self._get_request_init() or {}
+        adata_url = self._build_assets_url(zarr_path, use_token=False)
+        return zarr.open(adata_url, mode='r', storage_options={'client_kwargs': request_init})
+
+    @cached_property
+    def is_annotated(self):
+        z = self.zarr_store
+        if 'mod/rna/uns/annotation_metadata/is_annotated' in z:
+            return z['mod/rna/uns/annotation_metadata/is_annotated'][()]
+        else:
+            return False
+
+    @cached_property
+    def has_marker_genes(self):
+        z = self.zarr_store
+        return 'mod/rna/var/marker_genes_for_heatmap' in z
+
+    @cached_property
+    def has_cbb(self):
+        z = self.zarr_store
+        return 'mod/atac_cbb' in z
+
+    def get_conf_cells(self, marker=None):
+
+        # TODO: The files array is empty for this entity, so we can't check for the zarr store
+
+        # zarr_path = 'hubmap_ui/mudata-zarr/secondary_analysis.zarr'
+        # file_paths_found = [file["rel_path"] for file in self._entity["files"]]
+        # # Use .zgroup file as proxy for whether or not the zarr store is present.
+        # if f'{zarr_path}/.zgroup' not in file_paths_found:
+        #     message = f'Multiomic assay with uuid {self._uuid} has no .zarr store at {zarr_path}'
+        #     raise FileNotFoundError(message)
+
+        # Each clustering has its own genomic profile; since we can't currently toggle between
+        # selected genomic profiles, each clustering needs its own view config.
+        confs = []
+        cluster_columns = [
+            ["leiden_wnn", "Leiden (Weighted Nearest Neighbor)", "wnn"],
+            ["cluster_cbg", "Cluster (ATAC Cell x Gene)", "cbg"],
+            ["leiden_rna", "Leiden (RNA)", "rna"],
+            ["cluster_cbb", "Cluster (ATAC Cell x Bin)", "cbb"] if self.has_cbb else None,
+            ["predicted_label", "Cell Ontology Annotation", "label"] if self.is_annotated else None,
+        ]
+        # Filter out None values
+        cluster_columns = [col for col in cluster_columns if col is not None]
+
+        column_names, column_labels = [f'obs/{col[0]}' for col in cluster_columns], [
+            col[1] for col in cluster_columns]
+
+        self._set_up_marker_gene(marker)
+        self._set_up_obs_labels(additional_obs_set_names=column_labels,
+                                additional_obs_set_paths=column_names,
+                                skip_default_paths=True,
+                                modality_prefix='mod/rna')
+
+        for column_name, column_label, multivec_label in cluster_columns:
+            vc = VitessceConfig(name=f'{column_label}',
+                                schema_version=self._schema_version)
+            dataset = self._set_up_dataset(vc, multivec_label)
+            vc = self._setup_anndata_view_config(vc, dataset, column_name, column_label)
+            vc = self._link_marker_gene(vc)
+            confs.append(vc.to_dict())
+        return get_conf_cells(confs)
+
+    def _set_up_dataset(self, vc, multivec_label):
+        zarr_base = 'hubmap_ui/mudata-zarr'
+        zarr_path = f'{zarr_base}/secondary_analysis.zarr'
+        h5mu_zarr = self._build_assets_url(zarr_path, use_token=False)
+        rna_zarr = self._build_assets_url(f'{zarr_path}/mod/rna', use_token=False)
+        atac_cbg_zarr = self._build_assets_url(f'{zarr_path}/mod/atac_cbg', use_token=False)
+        multivec_zarr = self._build_assets_url(
+            f'{zarr_base}/{multivec_label}.multivec.zarr', use_token=False)
+        dataset = vc.add_dataset(name=multivec_label).add_object(MultivecZarrWrapper(
+            zarr_url=multivec_zarr,
+            request_init=self._get_request_init(),
+        )).add_object(AnnDataWrapper(
+            # We run add_object with adata_url=rna_zarr first to add the cell-by-gene
+            # matrix and associated metadata.
+            adata_url=rna_zarr,
+            obs_embedding_paths=["obsm/X_umap"],
+            obs_embedding_names=["UMAP - RNA"],
+            obs_set_paths=self._obs_set_paths,
+            obs_set_names=self._obs_set_names,
+            obs_feature_matrix_path="X",
+            initial_feature_filter_path="var/highly_variable",
+            feature_labels_path="var/hugo_symbol",
+            request_init=self._get_request_init(),
+            # To be explicit that the features represent genes and gene expression, we
+            # specify that here.
+            coordination_values={
+                "featureType": "gene",
+                "featureValueType": "expression",
+                "featureLabelsType": "gene",
+            }
+        )).add_object(AnnDataWrapper(
+            adata_url=atac_cbg_zarr,
+            obs_feature_matrix_path="X",
+            initial_feature_filter_path="var/highly_variable",
+            obs_embedding_paths=["obsm/X_umap"],
+            obs_embedding_names=["UMAP - ATAC"],
+            feature_labels_path="var/hugo_symbol",
+            request_init=self._get_request_init(),
+            # To be explicit that the features represent peaks and peak counts, we
+            # specify that here.
+            coordination_values={
+                "featureType": "peak",
+                "featureValueType": "count",
+            }
+        )).add_object(AnnDataWrapper(
+            adata_url=h5mu_zarr,
+            obs_feature_matrix_path="X",
+            obs_embedding_paths=["obsm/X_umap"],
+            obs_embedding_names=["UMAP - WNN"],
+            request_init=self._get_request_init(),
+            coordination_values={
+                "featureType": "other"
+            }
+        ))
+        return dataset
+
+    def _setup_anndata_view_config(self, vc, dataset, column_name, column_label):
+        umap_scatterplot_by_rna = vc.add_view(
+            vt.SCATTERPLOT, dataset=dataset, mapping="UMAP - RNA"
+        ).set_props(embeddingCellSetLabelsVisible=False)
+        umap_scatterplot_by_atac = vc.add_view(
+            vt.SCATTERPLOT, dataset=dataset, mapping="UMAP - ATAC"
+        ).set_props(embeddingCellSetLabelsVisible=False)
+        umap_scatterplot_by_wnn = vc.add_view(
+            vt.SCATTERPLOT, dataset=dataset, mapping="UMAP - WNN"
+        ).set_props(embeddingCellSetLabelsVisible=False)
+
+        gene_list = vc.add_view(vt.FEATURE_LIST, dataset=dataset)
+        peak_list = vc.add_view(vt.FEATURE_LIST, dataset=dataset)
+
+        # rna_heatmap = vc.add_view(vt.HEATMAP, dataset=dataset).set_props(transpose=False)
+        # atac_heatmap = vc.add_view(vt.HEATMAP, dataset=dataset).set_props(transpose=False)
+        genomic_profiles = vc.add_view(vt.GENOMIC_PROFILES, dataset=dataset)
+
+        cell_sets = vc.add_view(vt.OBS_SETS, dataset=dataset)
+
+        # Specify which of the two feature types (i.e., genes or peaks) each view corresponds to.
+        # We also need to make sure the selection of genes and peaks is scoped only to
+        # the corresponding view,
+        # and we want to make sure the color mappings are independent for each modality.
+        coordination_types = [ct.FEATURE_TYPE, ct.FEATURE_VALUE_TYPE]
+        vc.link_views([umap_scatterplot_by_rna, gene_list],
+                      coordination_types, ["gene", "expression"])
+        vc.link_views([umap_scatterplot_by_atac, peak_list],
+                      coordination_types, ["peak", "count"])
+
+        # Coordinate the selection of cell sets between the scatterplots and lists
+        # of features/observations.
+        coordination_types = [ct.FEATURE_SELECTION,
+                              ct.OBS_COLOR_ENCODING,
+                              ct.FEATURE_VALUE_COLORMAP_RANGE]
+        vc.link_views([umap_scatterplot_by_rna,
+                       umap_scatterplot_by_atac,
+                       umap_scatterplot_by_wnn,
+                       gene_list, peak_list, cell_sets],
+                      coordination_types, [None, 'cellSetSelection', [0.0, 1.0]])
+
+        # Indicate genomic profiles' clusters, based on the display name of the given clustering column.
+        obs_set_coordination, obs_color_coordination = vc.add_coordination(
+            ct.OBS_SET_SELECTION, ct.OBS_COLOR_ENCODING)
+        genomic_profiles.use_coordination(obs_set_coordination, obs_color_coordination)
+
+        # Dynamically determine the number of clusters in the given clustering column
+        label_names = self._get_obs_set_members(column_name)
+        obs_set_coordinations = [[column_label, str(i)] for i in label_names]
+        obs_set_coordination.set_value(obs_set_coordinations)
+        obs_color_coordination.set_value('cellSetSelection')
+
+        # Hide numeric cluster labels
+        vc.link_views([umap_scatterplot_by_rna, umap_scatterplot_by_atac, umap_scatterplot_by_wnn], [
+            ct.EMBEDDING_OBS_SET_LABELS_VISIBLE], [False])
+
+        vc.layout(((umap_scatterplot_by_rna | umap_scatterplot_by_atac) | (
+            umap_scatterplot_by_wnn | cell_sets)) / (genomic_profiles | (peak_list | gene_list)))
+
+        self._views = [umap_scatterplot_by_rna, umap_scatterplot_by_atac, umap_scatterplot_by_wnn,
+                       gene_list, peak_list, genomic_profiles, cell_sets]
+        return vc
+
+    def _get_obs_set_members(self, column_name):
+        z = self.zarr_store
+        members = z[f'mod/rna/obs/{column_name}'].categories
+        return members
diff --git a/src/portal_visualization/builders/scatterplot_builders.py b/src/portal_visualization/builders/scatterplot_builders.py
@@ -42,8 +42,8 @@ def _setup_scatterplot_view_config(self, vc, dataset):
 
 
 class RNASeqViewConfBuilder(AbstractScatterplotViewConfBuilder):
-    """Wrapper class for creating a JSON-backed scatterplot for "first generation" RNA-seq data like
-    https://portal.hubmapconsortium.org/browse/dataset/c019a1cd35aab4d2b4a6ff221e92aaab
+    """Wrapper class for creating a JSON-backed scatterplot for "first generation" RNA-seq data
+    like https://portal.hubmapconsortium.org/browse/dataset/c019a1cd35aab4d2b4a6ff221e92aaab
     from h5ad-to-arrow.cwl (August 2020 release).
     """
 
@@ -75,8 +75,8 @@ def __init__(self, entity, groups_token, assets_endpoint, **kwargs):
 
 
 class ATACSeqViewConfBuilder(AbstractScatterplotViewConfBuilder):
-    """Wrapper class for creating a JSON-backed scatterplot for "first generation" ATAC-seq data like
-    https://portal.hubmapconsortium.org/browse/dataset/d4493657cde29702c5ed73932da5317c
+    """Wrapper class for creating a JSON-backed scatterplot for "first generation" ATAC-seq data
+    like https://portal.hubmapconsortium.org/browse/dataset/d4493657cde29702c5ed73932da5317c
     from h5ad-to-arrow.cwl.
     """