From 02fce69f4eb4a60b50455f0b61a55e0f1d877280 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:31:13 -0400 Subject: [PATCH 01/33] chore: document --- NAMESPACE | 282 ++++++++++++++++++++++++++++ man/dot-igraph_vertex_membership.Rd | 6 +- man/identifyTMAcores.Rd | 42 +++++ man/importVisiumHD.Rd | 4 +- man/spatialSplitCluster.Rd | 18 +- 5 files changed, 345 insertions(+), 7 deletions(-) create mode 100644 man/identifyTMAcores.Rd diff --git a/NAMESPACE b/NAMESPACE index 1e78b2e0b..f770bc45f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -261,6 +261,7 @@ export(heatmSpatialCorGenes) export(hexVertices) export(hist) export(hyperGeometricEnrich) +export(identifyTMAcores) export(importCosMx) export(importVisiumHD) export(initHMRF_V2) @@ -508,6 +509,287 @@ import(methods) import(stats, except = density) import(utils) importClassesFrom(data.table,data.table) +importFrom(GiottoClass,"activeFeatType<-") +importFrom(GiottoClass,"activeSpatUnit<-") +importFrom(GiottoClass,"ext<-") +importFrom(GiottoClass,"featType<-") +importFrom(GiottoClass,"instructions<-") +importFrom(GiottoClass,"objName<-") +importFrom(GiottoClass,"prov<-") +importFrom(GiottoClass,"spatUnit<-") +importFrom(GiottoClass,activeFeatType) +importFrom(GiottoClass,activeSpatUnit) +importFrom(GiottoClass,addCellMetadata) +importFrom(GiottoClass,addFeatMetadata) +importFrom(GiottoClass,addGiottoImage) +importFrom(GiottoClass,addGiottoImageMG) +importFrom(GiottoClass,addGiottoLargeImage) +importFrom(GiottoClass,addGiottoPoints) +importFrom(GiottoClass,addGiottoPoints3D) +importFrom(GiottoClass,addGiottoPolygons) +importFrom(GiottoClass,addNetworkLayout) +importFrom(GiottoClass,addSpatialCentroidLocations) +importFrom(GiottoClass,addSpatialCentroidLocationsLayer) +importFrom(GiottoClass,aggregateStacks) +importFrom(GiottoClass,aggregateStacksExpression) +importFrom(GiottoClass,aggregateStacksLocations) +importFrom(GiottoClass,aggregateStacksPolygonOverlaps) +importFrom(GiottoClass,aggregateStacksPolygons) +importFrom(GiottoClass,anndataToGiotto) +importFrom(GiottoClass,annotateGiotto) +importFrom(GiottoClass,annotateSpatialGrid) +importFrom(GiottoClass,annotateSpatialNetwork) +importFrom(GiottoClass,as.points) +importFrom(GiottoClass,as.polygons) +importFrom(GiottoClass,as.sf) +importFrom(GiottoClass,as.sp) +importFrom(GiottoClass,as.stars) +importFrom(GiottoClass,as.terra) +importFrom(GiottoClass,calculateMetaTable) +importFrom(GiottoClass,calculateMetaTableCells) +importFrom(GiottoClass,calculateOverlap) +importFrom(GiottoClass,calculateOverlapParallel) +importFrom(GiottoClass,calculateOverlapPolygonImages) +importFrom(GiottoClass,calculateOverlapRaster) +importFrom(GiottoClass,calculateOverlapSerial) +importFrom(GiottoClass,calculateSpatCellMetadataProportions) +importFrom(GiottoClass,centroids) +importFrom(GiottoClass,changeGiottoInstructions) +importFrom(GiottoClass,changeImageBg) +importFrom(GiottoClass,checkGiottoEnvironment) +importFrom(GiottoClass,circleVertices) +importFrom(GiottoClass,combineCellData) +importFrom(GiottoClass,combineFeatureData) +importFrom(GiottoClass,combineFeatureOverlapData) +importFrom(GiottoClass,combineMetadata) +importFrom(GiottoClass,combineSpatialCellFeatureInfo) +importFrom(GiottoClass,combineSpatialCellMetadataInfo) +importFrom(GiottoClass,combineToMultiPolygon) +importFrom(GiottoClass,convertGiottoLargeImageToMG) +importFrom(GiottoClass,copy) +importFrom(GiottoClass,createBentoAdata) +importFrom(GiottoClass,createCellMetaObj) +importFrom(GiottoClass,createDimObj) +importFrom(GiottoClass,createExprObj) +importFrom(GiottoClass,createFeatMetaObj) +importFrom(GiottoClass,createGiottoImage) +importFrom(GiottoClass,createGiottoInstructions) +importFrom(GiottoClass,createGiottoLargeImage) +importFrom(GiottoClass,createGiottoLargeImageList) +importFrom(GiottoClass,createGiottoObject) +importFrom(GiottoClass,createGiottoObjectSubcellular) +importFrom(GiottoClass,createGiottoPoints) +importFrom(GiottoClass,createGiottoPolygon) +importFrom(GiottoClass,createGiottoPolygonsFromDfr) +importFrom(GiottoClass,createGiottoPolygonsFromGeoJSON) +importFrom(GiottoClass,createGiottoPolygonsFromMask) +importFrom(GiottoClass,createMetafeats) +importFrom(GiottoClass,createNearestNetObj) +importFrom(GiottoClass,createNearestNetwork) +importFrom(GiottoClass,createSpatEnrObj) +importFrom(GiottoClass,createSpatLocsObj) +importFrom(GiottoClass,createSpatNetObj) +importFrom(GiottoClass,createSpatialDefaultGrid) +importFrom(GiottoClass,createSpatialDelaunayNetwork) +importFrom(GiottoClass,createSpatialFeaturesKNNnetwork) +importFrom(GiottoClass,createSpatialGrid) +importFrom(GiottoClass,createSpatialKNNnetwork) +importFrom(GiottoClass,createSpatialNetwork) +importFrom(GiottoClass,createSpatialWeightMatrix) +importFrom(GiottoClass,crop) +importFrom(GiottoClass,cropGiottoLargeImage) +importFrom(GiottoClass,density) +importFrom(GiottoClass,distGiottoImage) +importFrom(GiottoClass,estimateImageBg) +importFrom(GiottoClass,ext) +importFrom(GiottoClass,fDataDT) +importFrom(GiottoClass,featIDs) +importFrom(GiottoClass,featType) +importFrom(GiottoClass,featureNetwork) +importFrom(GiottoClass,flip) +importFrom(GiottoClass,gefToGiotto) +importFrom(GiottoClass,getCellMetadata) +importFrom(GiottoClass,getDimReduction) +importFrom(GiottoClass,getExpression) +importFrom(GiottoClass,getFeatureInfo) +importFrom(GiottoClass,getFeatureMetadata) +importFrom(GiottoClass,getGiottoImage) +importFrom(GiottoClass,getMultiomics) +importFrom(GiottoClass,getNearestNetwork) +importFrom(GiottoClass,getPolygonInfo) +importFrom(GiottoClass,getSpatialEnrichment) +importFrom(GiottoClass,getSpatialGrid) +importFrom(GiottoClass,getSpatialLocations) +importFrom(GiottoClass,getSpatialNetwork) +importFrom(GiottoClass,giotto) +importFrom(GiottoClass,giottoImage) +importFrom(GiottoClass,giottoLargeImage) +importFrom(GiottoClass,giottoMasterToSuite) +importFrom(GiottoClass,giottoPoints) +importFrom(GiottoClass,giottoPolygon) +importFrom(GiottoClass,giottoToAnnData) +importFrom(GiottoClass,giottoToSeurat) +importFrom(GiottoClass,giottoToSeuratV4) +importFrom(GiottoClass,giottoToSeuratV5) +importFrom(GiottoClass,giottoToSpatialExperiment) +importFrom(GiottoClass,hexVertices) +importFrom(GiottoClass,hist) +importFrom(GiottoClass,installGiottoEnvironment) +importFrom(GiottoClass,instructions) +importFrom(GiottoClass,joinGiottoObjects) +importFrom(GiottoClass,loadGiotto) +importFrom(GiottoClass,makePseudoVisium) +importFrom(GiottoClass,objHistory) +importFrom(GiottoClass,objName) +importFrom(GiottoClass,orthoGrid) +importFrom(GiottoClass,overlapImagesToMatrix) +importFrom(GiottoClass,overlapToMatrix) +importFrom(GiottoClass,overlapToMatrixMultiPoly) +importFrom(GiottoClass,overlaps) +importFrom(GiottoClass,pDataDT) +importFrom(GiottoClass,plotGiottoImage) +importFrom(GiottoClass,polyStamp) +importFrom(GiottoClass,prov) +importFrom(GiottoClass,readCellMetadata) +importFrom(GiottoClass,readDimReducData) +importFrom(GiottoClass,readExprData) +importFrom(GiottoClass,readExprMatrix) +importFrom(GiottoClass,readFeatData) +importFrom(GiottoClass,readFeatMetadata) +importFrom(GiottoClass,readGiottoInstructions) +importFrom(GiottoClass,readNearestNetData) +importFrom(GiottoClass,readPolygonData) +importFrom(GiottoClass,readSpatEnrichData) +importFrom(GiottoClass,readSpatLocsData) +importFrom(GiottoClass,readSpatNetData) +importFrom(GiottoClass,reconnectGiottoImage) +importFrom(GiottoClass,rectVertices) +importFrom(GiottoClass,removeCellAnnotation) +importFrom(GiottoClass,removeFeatAnnotation) +importFrom(GiottoClass,removeGiottoEnvironment) +importFrom(GiottoClass,replaceGiottoInstructions) +importFrom(GiottoClass,rescale) +importFrom(GiottoClass,rescalePolygons) +importFrom(GiottoClass,saveGiotto) +importFrom(GiottoClass,setCellMetadata) +importFrom(GiottoClass,setDimReduction) +importFrom(GiottoClass,setExpression) +importFrom(GiottoClass,setFeatureInfo) +importFrom(GiottoClass,setFeatureMetadata) +importFrom(GiottoClass,setGiotto) +importFrom(GiottoClass,setGiottoImage) +importFrom(GiottoClass,setMultiomics) +importFrom(GiottoClass,setNearestNetwork) +importFrom(GiottoClass,setPolygonInfo) +importFrom(GiottoClass,setSpatialEnrichment) +importFrom(GiottoClass,setSpatialGrid) +importFrom(GiottoClass,setSpatialLocations) +importFrom(GiottoClass,setSpatialNetwork) +importFrom(GiottoClass,seuratToGiotto) +importFrom(GiottoClass,seuratToGiottoV4) +importFrom(GiottoClass,seuratToGiottoV5) +importFrom(GiottoClass,showGiottoCellMetadata) +importFrom(GiottoClass,showGiottoDimRed) +importFrom(GiottoClass,showGiottoExpression) +importFrom(GiottoClass,showGiottoFeatInfo) +importFrom(GiottoClass,showGiottoFeatMetadata) +importFrom(GiottoClass,showGiottoImageNames) +importFrom(GiottoClass,showGiottoInstructions) +importFrom(GiottoClass,showGiottoNearestNetworks) +importFrom(GiottoClass,showGiottoSpatEnrichments) +importFrom(GiottoClass,showGiottoSpatGrids) +importFrom(GiottoClass,showGiottoSpatLocs) +importFrom(GiottoClass,showGiottoSpatNetworks) +importFrom(GiottoClass,showGiottoSpatialInfo) +importFrom(GiottoClass,showProcessingSteps) +importFrom(GiottoClass,smoothGiottoPolygons) +importFrom(GiottoClass,spatIDs) +importFrom(GiottoClass,spatQueryGiottoPolygons) +importFrom(GiottoClass,spatShift) +importFrom(GiottoClass,spatUnit) +importFrom(GiottoClass,spatialExperimentToGiotto) +importFrom(GiottoClass,spin) +importFrom(GiottoClass,stitchFieldCoordinates) +importFrom(GiottoClass,stitchGiottoLargeImage) +importFrom(GiottoClass,subsetGiotto) +importFrom(GiottoClass,subsetGiottoLocs) +importFrom(GiottoClass,subsetGiottoLocsMulti) +importFrom(GiottoClass,subsetGiottoLocsSubcellular) +importFrom(GiottoClass,tessellate) +importFrom(GiottoClass,triGrid) +importFrom(GiottoClass,updateGiottoImage) +importFrom(GiottoClass,updateGiottoImageMG) +importFrom(GiottoClass,updateGiottoLargeImage) +importFrom(GiottoClass,updateGiottoObject) +importFrom(GiottoClass,updateGiottoPointsObject) +importFrom(GiottoClass,updateGiottoPolygonObject) +importFrom(GiottoClass,vect) +importFrom(GiottoClass,wrap) +importFrom(GiottoClass,writeGiottoLargeImage) +importFrom(GiottoUtils,"%>%") +importFrom(GiottoUtils,getDistinctColors) +importFrom(GiottoUtils,getRainbowColors) +importFrom(GiottoVisuals,"sankeyLabel<-") +importFrom(GiottoVisuals,"sankeyRelate<-") +importFrom(GiottoVisuals,addGiottoImageToSpatPlot) +importFrom(GiottoVisuals,dimCellPlot) +importFrom(GiottoVisuals,dimCellPlot2D) +importFrom(GiottoVisuals,dimFeatPlot2D) +importFrom(GiottoVisuals,dimFeatPlot3D) +importFrom(GiottoVisuals,dimGenePlot3D) +importFrom(GiottoVisuals,dimPlot) +importFrom(GiottoVisuals,dimPlot2D) +importFrom(GiottoVisuals,dimPlot3D) +importFrom(GiottoVisuals,getColors) +importFrom(GiottoVisuals,giottoSankeyPlan) +importFrom(GiottoVisuals,plotHeatmap) +importFrom(GiottoVisuals,plotMetaDataCellsHeatmap) +importFrom(GiottoVisuals,plotMetaDataHeatmap) +importFrom(GiottoVisuals,plotPCA) +importFrom(GiottoVisuals,plotPCA_2D) +importFrom(GiottoVisuals,plotPCA_3D) +importFrom(GiottoVisuals,plotStatDelaunayNetwork) +importFrom(GiottoVisuals,plotTSNE) +importFrom(GiottoVisuals,plotTSNE_2D) +importFrom(GiottoVisuals,plotTSNE_3D) +importFrom(GiottoVisuals,plotUMAP) +importFrom(GiottoVisuals,plotUMAP_2D) +importFrom(GiottoVisuals,plotUMAP_3D) +importFrom(GiottoVisuals,sankeyLabel) +importFrom(GiottoVisuals,sankeyPlot) +importFrom(GiottoVisuals,sankeyRelate) +importFrom(GiottoVisuals,sankeySet) +importFrom(GiottoVisuals,sankeySetAddresses) +importFrom(GiottoVisuals,showClusterDendrogram) +importFrom(GiottoVisuals,showClusterHeatmap) +importFrom(GiottoVisuals,showColorInstructions) +importFrom(GiottoVisuals,showSaveParameters) +importFrom(GiottoVisuals,spatCellPlot) +importFrom(GiottoVisuals,spatCellPlot2D) +importFrom(GiottoVisuals,spatDeconvPlot) +importFrom(GiottoVisuals,spatDimCellPlot) +importFrom(GiottoVisuals,spatDimCellPlot2D) +importFrom(GiottoVisuals,spatDimFeatPlot2D) +importFrom(GiottoVisuals,spatDimFeatPlot3D) +importFrom(GiottoVisuals,spatDimGenePlot3D) +importFrom(GiottoVisuals,spatDimPlot) +importFrom(GiottoVisuals,spatDimPlot2D) +importFrom(GiottoVisuals,spatDimPlot3D) +importFrom(GiottoVisuals,spatFeatPlot2D) +importFrom(GiottoVisuals,spatFeatPlot2D_single) +importFrom(GiottoVisuals,spatFeatPlot3D) +importFrom(GiottoVisuals,spatGenePlot3D) +importFrom(GiottoVisuals,spatInSituPlotDensity) +importFrom(GiottoVisuals,spatInSituPlotHex) +importFrom(GiottoVisuals,spatInSituPlotPoints) +importFrom(GiottoVisuals,spatNetwDistributions) +importFrom(GiottoVisuals,spatNetwDistributionsDistance) +importFrom(GiottoVisuals,spatNetwDistributionsKneighbors) +importFrom(GiottoVisuals,spatPlot) +importFrom(GiottoVisuals,spatPlot2D) +importFrom(GiottoVisuals,spatPlot3D) +importFrom(GiottoVisuals,subsetSankeySet) +importFrom(GiottoVisuals,violinPlot) importFrom(data.table,data.table) importFrom(data.table,frank) importFrom(data.table,fread) diff --git a/man/dot-igraph_vertex_membership.Rd b/man/dot-igraph_vertex_membership.Rd index 937905ad9..f4362ce83 100644 --- a/man/dot-igraph_vertex_membership.Rd +++ b/man/dot-igraph_vertex_membership.Rd @@ -4,12 +4,16 @@ \alias{.igraph_vertex_membership} \title{igraph vertex membership} \usage{ -.igraph_vertex_membership(g, clus_name) +.igraph_vertex_membership(g, clus_name, all_ids = NULL, missing_id_name) } \arguments{ \item{g}{igraph} \item{clus_name}{character. name to assign column of clustering info} + +\item{all_ids}{(optional) character vector with all ids} + +\item{missing_id_name}{character and name for vertices that were missing from g} } \value{ data.table diff --git a/man/identifyTMAcores.Rd b/man/identifyTMAcores.Rd new file mode 100644 index 000000000..010f05d50 --- /dev/null +++ b/man/identifyTMAcores.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/spatial_clusters.R +\name{identifyTMAcores} +\alias{identifyTMAcores} +\title{Split cluster annotations based on a spatial network} +\usage{ +identifyTMAcores( + gobject, + spat_unit = NULL, + feat_type = NULL, + spatial_network_name = "Delaunay_network", + core_id_name = "core_id", + include_all_ids = TRUE, + missing_id_name = "not_connected", + return_gobject = TRUE +) +} +\arguments{ +\item{gobject}{giotto object} + +\item{spat_unit}{spatial unit (e.g. "cell")} + +\item{feat_type}{feature type (e.g. "rna", "dna", "protein")} + +\item{spatial_network_name}{character. Name of spatial network to use} + +\item{core_id_name}{metadata column name for the core information} + +\item{include_all_ids}{Boolean. Include all ids, including vertex ids not found +in the spatial network} + +\item{missing_id_name}{Character. Name for vertices that were missing from +spatial network} + +\item{return_gobject}{Boolean. Return giotto object} +} +\value{ +cluster annotations +} +\description{ +Split cluster annotations based on a spatial network +} diff --git a/man/importVisiumHD.Rd b/man/importVisiumHD.Rd index 1b584aad3..08fb3801b 100644 --- a/man/importVisiumHD.Rd +++ b/man/importVisiumHD.Rd @@ -20,7 +20,7 @@ importVisiumHD( \item{expression_source}{character. Raw or filter expression data. Defaults to 'raw'} -\item{gene_column_index}{numeric. Expression column to use for gene names +\item{gene_column_index}{numeric. Expression column to use for gene names 1 = Ensembl and 2 = gene symbols} \item{barcodes}{character vector. (optional) Use if you only want to load @@ -74,7 +74,7 @@ expression_obj = readerHD$load_expression() Load transcript data (cell metadata, expression object, and transcripts per pixel) my_transcripts = readerHD$load_transcripts(array_subset_row = c(500, 1000), array_subset_col = c(500, 1000)) - + # Create a `giotto` object and add the loaded data TODO } diff --git a/man/spatialSplitCluster.Rd b/man/spatialSplitCluster.Rd index b0c03729b..f6d503a68 100644 --- a/man/spatialSplitCluster.Rd +++ b/man/spatialSplitCluster.Rd @@ -10,7 +10,10 @@ spatialSplitCluster( feat_type = NULL, spatial_network_name = "Delaunay_network", cluster_col, - split_clus_name = paste0(cluster_col, "_split") + split_clus_name = paste0(cluster_col, "_split"), + include_all_ids = TRUE, + missing_id_name = "not_connected", + return_gobject = TRUE ) } \arguments{ @@ -25,11 +28,18 @@ spatialSplitCluster( \item{cluster_col}{character. Column in metadata containing original clustering} -\item{split_clus_name}{character. Name to assign the split cluster results -information to split} +\item{split_clus_name}{character. Name to assign the split cluster results} + +\item{include_all_ids}{Boolean. Include all ids, including vertex ids not found +in the spatial network} + +\item{missing_id_name}{Character. Name for vertices that were missing from +spatial network} + +\item{return_gobject}{Boolean. Return giotto object} } \value{ -cluster annotations +giotto object with cluster annotations } \description{ Split cluster annotations based on a spatial network From d7e83b43407519c0163f890a5d3a3e6e81594f23 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 06:50:34 -0400 Subject: [PATCH 02/33] wip --- R/convenience_xenium.R | 1436 ++++++++++++++++++++++------------------ 1 file changed, 797 insertions(+), 639 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index e68a897b7..cc61d87e4 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -9,6 +9,7 @@ setClass( xenium_dir = "character", filetype = "list", qv = "ANY", + micron = "numeric", calls = "list" ), prototype = list( @@ -62,31 +63,35 @@ setMethod( .Object, xenium_dir, filetype, - qv_cutoff + qv_cutoff, + micron ) { - .Object <- callNextMethod(.Object) + obj <- callNextMethod(.Object) # provided params (if any) if (!missing(xenium_dir)) { checkmate::assert_directory_exists(xenium_dir) - .Object@xenium_dir <- xenium_dir + obj@xenium_dir <- xenium_dir } if (!missing(filetype)) { - .Object@filetype <- filetype + obj@filetype <- filetype } if (!missing(qv_cutoff)) { - .Object@qv <- qv_cutoff + obj@qv <- qv_cutoff + } + if (!missing(micron)) { + obj@micron <- micron } # check filetype ftype_data <- c("transcripts", "boundaries", "expression", "cell_meta") - if (!all(ftype_data %in% names(.Object@filetype))) { + if (!all(ftype_data %in% names(obj@filetype))) { stop(wrap_txt("`$filetype` must have entries for each of:\n", paste(ftype_data, collapse = ", "))) } - ftype <- .Object@filetype + ftype <- obj@filetype ft_tab <- c("csv", "parquet") ft_exp <- c("h5", "mtx", "zarr") if (!ftype$transcripts %in% ft_tab) { @@ -112,7 +117,7 @@ setMethod( # detect paths and subdirs - p <- .Object@xenium_dir + p <- obj@xenium_dir .xenium_detect <- function(pattern, ...) { .detect_in_dir( pattern = pattern, ..., @@ -152,6 +157,17 @@ setMethod( expr_path <- .xenium_ftype(expr_path, ftype$expression) cell_meta_path <- .xenium_ftype(cell_meta_path, ftype$cell_meta) + # decide micron scaling + if (length(obj@micron) == 0) { # if no value already set + if (!is.null(experiment_info_path)) { + obj@micron <- jsonlite::fromJSON( + manifest$experiment.xenium)$pixel_size + } else { + warning(wrap_txt("No .xenium file found. + Guessing 0.2125 as micron scaling")) + obj@micron <- 0.2125 # default + } + } # transcripts load call tx_fun <- function( @@ -168,7 +184,7 @@ setMethod( "NegControlCodeword" ), dropcols = c(), - qv_threshold = .Object@qv, + qv_threshold = obj@qv, cores = determine_cores(), verbose = NULL ) { @@ -182,15 +198,15 @@ setMethod( verbose = verbose ) } - .Object@calls$load_transcripts <- tx_fun + obj@calls$load_transcripts <- tx_fun # load polys call poly_fun <- function( - path = cell_bound_path, - name = "cell", - calc_centroids = TRUE, - cores = determine_cores(), - verbose = NULL + path = cell_bound_path, + name = "cell", + calc_centroids = TRUE, + cores = determine_cores(), + verbose = NULL ) { .xenium_poly( path = path, @@ -200,14 +216,14 @@ setMethod( verbose = verbose ) } - .Object@calls$load_polys <- poly_fun + obj@calls$load_polys <- poly_fun # load cellmeta cmeta_fun <- function( - path = cell_meta_path, - dropcols = c(), - cores = determine_cores(), - verbose = NULL + path = cell_meta_path, + dropcols = c(), + cores = determine_cores(), + verbose = NULL ) { .xenium_cellmeta( path = path, @@ -216,14 +232,14 @@ setMethod( verbose = verbose ) } - .Object@calls$load_cellmeta <- cmeta_fun + obj@calls$load_cellmeta <- cmeta_fun # load featmeta fmeta_fun <- function( - path = panel_meta_path, - dropcols = c(), - cores = determine_cores(), - verbose = NULL + path = panel_meta_path, + dropcols = c(), + cores = determine_cores(), + verbose = NULL ) { .xenium_featmeta( path = path, @@ -232,15 +248,15 @@ setMethod( verbose = verbose ) } - .Object@calls$load_featmeta <- fmeta_fun + obj@calls$load_featmeta <- fmeta_fun # load expression call expr_fun <- function( - path, - gene_ids = "symbols", - remove_zero_rows = TRUE, - split_by_type = TRUE, - verbose = NULL + path = expr_path, + gene_ids = "symbols", + remove_zero_rows = TRUE, + split_by_type = TRUE, + verbose = NULL ) { .xenium_expression( path = path, @@ -250,11 +266,43 @@ setMethod( verbose = verbose ) } - .Object@calls$load_expression <- expr_fun + obj@calls$load_expression <- expr_fun # load image call + img_fun <- function( + path, + name = "image", + micron = obj@micron, + negative_y = TRUE, + flip_vertical = FALSE, + flip_horizontal = FALSE, + verbose = NULL + ) { + .xenium_image( + path = path, + name = name, + micron = micron, + negative_y = negative_y, + flip_vertical = flip_vertical, + flip_horizontal = flip_horizontal, + verbose = verbose + ) + } + obj@calls$load_image <- img_fun - + # load aligned image call + img_aff_fun <- function( + path = path, + micron = obj@micron, + imagealignment_path + ) { + read10xAffineImage( + file = path, + imagealignment_path = imagealignment_path, + micron = micron + ) + } + obj@calls$load_aligned_image <- img_aff_fun # create giotto object call @@ -301,7 +349,7 @@ setMethod( - funs <- .Object@calls + funs <- obj@calls # init gobject g <- giotto() @@ -367,10 +415,10 @@ setMethod( } - .Object@calls$create_gobject <- gobject_fun + obj@calls$create_gobject <- gobject_fun - return(.Object) + return(obj) } ) @@ -414,7 +462,7 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { # MODULAR #### - +## transcript #### .xenium_transcript <- function( path, @@ -452,6 +500,7 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { verbose = verbose ) vmsg("Loading transcript level info...", .v = verbose) + # pass to specific reader fun based on filetype tx <- switch(e, "csv" = do.call(.xenium_transcript_csv, args = c(a, list(cores = cores))), @@ -558,6 +607,9 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { return(tx_dt) } + +## polygon #### + .xenium_poly <- function( path, name = "cell", @@ -572,6 +624,7 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { a <- list(path = path) vmsg("Loading boundary info...", .v = verbose) + # pass to specific load function based on file extension polys <- switch(e, "csv" = do.call(.xenium_poly_csv, args = c(a, list(cores = cores))), "parquet" = do.call(.xenium_poly_parquet, args = a), @@ -608,6 +661,9 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { data.table::setDT() } + +## cellmeta #### + .xenium_cellmeta <- function( path, dropcols = c(), @@ -651,10 +707,12 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { arrow::read_parquet(file = path, as_data_frame = FALSE) %>% dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% dplyr::select(-dplyr::any_of(dropcols)) %>% - as.data.frame() %>% - data.table::setDT() + data.table::as.data.table() } + +## featmeta #### + .xenium_featmeta <- function( path, gene_ids = "symbols", @@ -694,6 +752,42 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { return(fx) } + +.load_xenium_panel_json <- function(path, gene_ids = "symbols") { + gene_ids <- match.arg(gene_ids, c("symbols", "ensembl")) + + # tested on v1.6 + j <- jsonlite::fromJSON(path) + # j$metadata # dataset meta + # j$payload # main content + # j$payload$chemistry # panel chemistry used + # j$payload$customer # panel customer + # j$payload$designer # panel designer + # j$payload$spec_version # versioning + # j$payload$panel # dataset panel stats + + panel_info <- j$payload$targets$type %>% + data.table::as.data.table() + + switch(gene_ids, + "symbols" = data.table::setnames( + panel_info, + old = c("data.id", "data.name", "descriptor"), + new = c("ensembl", "feat_ID", "type") + ), + "ensembl" = data.table::setnames( + panel_info, + old = c("data.id", "data.name", "descriptor"), + new = c("feat_ID", "symbol", "type") + ) + ) + return(panel_info) +} + + + +## expression #### + .xenium_expression <- function( path, gene_ids = "symbols", @@ -772,22 +866,101 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { ) } + + +## image #### + .xenium_image <- function( path, - name = "image", + name, + output_dir, + micron, negative_y = TRUE, flip_vertical = FALSE, flip_horizontal = FALSE, - affine = NULL, - verbose = NULL + verbose = NULL, + ... ) { if (missing(path)) { stop(wrap_txt( - "No path to image file to load provided or auto-detected" + "No path to image file or dir to load provided or auto-detected" ), call. = FALSE) } - checkmate::assert_file_exists(path) + # [directory input] -> load as individual .ome paths with defined names + # intended for usage with single channel stain focus images + if (checkmate::check_directory_exists(path)) { + if (missing(output_dir)) output_dir <- file.path(path, "tif_exports") + # find actual image paths in directory + ome_paths <- list.files(path, full.names = TRUE, pattern = ".ome") + # parse ome metadata for images names + ome_xml <- ometif_metadata( + ome_paths[[1]], node = "Channel", output = "data.frame" + ) + # update names with the channel names + name <- ome_xml$Name + + # do conversion if file does not already exist in output_dir + vmsg(.v = verbose, "> ometif to tif conversion") + lapply(ome_paths, function(ome) { + try(silent = TRUE, { # ignore fail when already written + ometif_to_tif( + # can pass overwrite = TRUE via ... if needed + ome, output_dir = output_dir, ... + ) + }) + }) + # update path param + path <- list.files(output_dir, pattern = ".tif", full.names = TRUE) + } + + # set default if still missing + if (missing(name)) name <- "image" + + # [paths] + # check files exist + vapply(path, checkmate::assert_file_exists, FUN.VALUE = character(1L)) + # names + if (length(name) != length(path) && + length(name) != 1) { + stop("length of `name` should be same as length of `path`") + } + if (length(name) == 1 && + length(path) > 1) { + name <- sprintf("%s_%d", name, seq_along(path)) + } + # micron + checkmate::assert_numeric(micron) + + progressr::with_progress({ + p <- progressr::progressor(along = path) + + gimg_list <- lapply(seq_along(path), function(img_i) { + gimg <- .xenium_image_single( + path = path[[img_i]], + name = name[[img_i]], + micron = micron, + negative_y = negative_y, + flip_vertical = flip_vertical, + flip_horizontal = flip_horizontal, + verbose = verbose + ) + p() + return(gimg) + }) + }) + return(gimg_list) +} + +.xenium_image_single <- function( + path, + name = "image", + micron, + negative_y = TRUE, + flip_vertical = FALSE, + flip_horizontal = FALSE, + verbose = NULL +) { vmsg(.v = verbose, sprintf("loading image as '%s'", name)) vmsg(.v = verbose, .is_debug = TRUE, path) vmsg( @@ -797,670 +970,379 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { .prefix = "" ) - verbose <- verbose %null% TRUE + # warning to for single channel .ome.tif images that terra::rast() and + # gdal still have difficulties with. May be related to JP2OpenJPEG driver + # but even loading this does not seem to fix it. + if (file_extension(path) %in% "ome") { + warning(wrap_txt( + ".ome.tif images not fully supported. + If reading fails, try converting to a basic tif `ometif_to_tif()`") + ) + } - # TODO + img <- createGiottoLargeImage(path, + name = name, + flip_vertical = flip_vertical, + flip_horizontal = flip_horizontal, + negative_y = negative_y, + verbose = verbose + ) + img <- rescale(img, micron, x0 = 0, y0 = 0) + return(img) } +# for affine, see the init method -#' @title Load xenium data from folder -#' @name load_xenium_folder -#' @param path_list list of full filepaths from .read_xenium_folder -#' @inheritParams createGiottoXeniumObject -#' @returns list of loaded in xenium data -NULL -#' @rdname load_xenium_folder -#' @keywords internal -.load_xenium_folder <- function( - path_list, + + + +# OLD #### + + + + +#' @title Create 10x Xenium Giotto Object +#' @name createGiottoXeniumObject +#' @description Given the path to a Xenium experiment output folder, creates a +#' Giotto object +#' @param xenium_dir full path to the exported xenium directory +#' @param data_to_use which type(s) of expression data to build the gobject with +#' (e.g. default: \strong{'subcellular'}, 'aggregate', or 'all') +#' @param load_format files formats from which to load the data. Either `csv` or +#' `parquet` currently supported. +#' @param h5_expression (boolean) whether to load cell_feature_matrix from .h5 +#' file. Default is \code{TRUE} +#' @param h5_gene_ids use gene symbols (default) or ensembl ids for the .h5 gene +#' expression matrix +#' @param bounds_to_load vector of boundary information to load +#' (e.g. \code{'cell'} +#' or \code{'nucleus'} by themselves or \code{c('cell', 'nucleus')} to load both +#' at the same time.) +#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included +#' as a subcellular transcript detection (default = 20) +#' @param key_list (advanced) list of grep-based keywords to split the +#' subcellular feature detections by feature type. See details +#' @inheritParams get10Xmatrix +#' @inheritParams GiottoClass::createGiottoObjectSubcellular +#' @returns giotto object +#' @details +#' +#' [\strong{QC feature types}] +#' Xenium provides info on feature detections that include more than only the +#' Gene Expression specific probes. Additional probes for QC are included: +#' \emph{blank codeword}, \emph{negative control codeword}, and +#' \emph{negative control probe}. These additional QC probes each occupy and +#' are treated as their own feature types so that they can largely remain +#' independent of the gene expression information. +#' +#' [\strong{key_list}] +#' Related to \code{data_to_use = 'subcellular'} workflow only: +#' Additional QC probe information is in the subcellular feature detections +#' information and must be separated from the gene expression information +#' during processing. +#' The QC probes have prefixes that allow them to be selected from the rest of +#' the feature IDs. +#' Giotto uses a named list of keywords (\code{key_list}) to select these QC +#' probes, with the list names being the names that will be assigned as the +#' feature type of these feature detections. The default list is used when +#' \code{key_list} = NULL. +#' +#' Default list: +#' \preformatted{ +#' list(blank_code = 'BLANK_', +#' neg_code = 'NegControlCodeword_', +#' neg_probe = c('NegControlProbe_|antisense_')) +#' } +#' +#' The Gene expression subset is accepted as the subset of feat_IDs that do not +#' map to any of the keys. +#' +#' @export +createGiottoXeniumObject <- function( + xenium_dir, + data_to_use = c("subcellular", "aggregate"), load_format = "csv", - data_to_use = "subcellular", - h5_expression = "FALSE", - h5_gene_ids = "symbols", + h5_expression = TRUE, + h5_gene_ids = c("symbols", "ensembl"), gene_column_index = 1, - cores, + bounds_to_load = c("cell"), + qv_threshold = 20, + key_list = NULL, + instructions = NULL, + cores = NA, verbose = TRUE ) { - data_list <- switch(load_format, - "csv" = .load_xenium_folder_csv( - path_list = path_list, - data_to_use = data_to_use, - h5_expression = h5_expression, - h5_gene_ids = h5_gene_ids, - gene_column_index = gene_column_index, - cores = cores, - verbose = verbose - ), - "parquet" = .load_xenium_folder_parquet( - path_list = path_list, - data_to_use = data_to_use, - h5_expression = h5_expression, - h5_gene_ids = h5_gene_ids, - gene_column_index = gene_column_index, - cores = cores, - verbose = verbose - ), - "zarr" = stop("load_format zarr:\n Not yet implemented", call. = FALSE) - ) + # 0. setup + xenium_dir <- path.expand(xenium_dir) - return(data_list) -} + # Determine data to load + data_to_use <- match.arg( + arg = data_to_use, choices = c("subcellular", "aggregate")) + # Determine load formats + load_format <- "csv" # TODO Remove this and add as param once other options + # are available + load_format <- match.arg( + arg = load_format, choices = c("csv", "parquet", "zarr")) -#' @describeIn load_xenium_folder Load from csv files -#' @keywords internal -.load_xenium_folder_csv <- function( - path_list, - cores, - data_to_use = "subcellular", - h5_expression = FALSE, - h5_gene_ids = "symbols", - gene_column_index = 1, - verbose = TRUE -) { - # initialize return vars - feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL + # set number of cores automatically, but with limit of 10 + cores <- determine_cores(cores) + data.table::setDTthreads(threads = cores) - vmsg("Loading feature metadata...", .v = verbose) - # updated for pipeline v1.6 json format - fdata_path <- path_list$panel_meta_path[[1]] - fdata_ext <- GiottoUtils::file_extension(fdata_path) - if ("json" %in% fdata_ext) { - feat_meta <- .load_xenium_panel_json(path = fdata_path, - gene_ids = h5_gene_ids) - } else { - feat_meta <- data.table::fread(fdata_path, nThread = cores) - colnames(feat_meta)[[1]] <- "feat_ID" - } + # 1. detect xenium folder and find filepaths to load - # **** subcellular info **** - if (data_to_use == "subcellular") { - # append missing QC probe info to feat_meta - if (isTRUE(h5_expression)) { - h5 <- hdf5r::H5File$new(path_list$agg_expr_path) - tryCatch({ - root <- names(h5) - feature_id <- h5[[paste0(root, "/features/id")]][] - feature_info <- h5[[paste0(root, "/features/feature_type")]][] - feature_names <- h5[[paste0(root, "/features/name")]][] - features_dt <- data.table::data.table( - "id" = feature_id, - "name" = feature_names, - "feature_type" = feature_info - ) - }, finally = { - h5$close_all() - }) - } else { - features_dt <- data.table::fread( - paste0(path_list$agg_expr_path, "/features.tsv.gz"), - header = FALSE - ) - } - colnames(features_dt) <- c("id", "feat_ID", "feat_class") - feat_meta <- merge( - features_dt[, c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") + # path_list contents: + # tx_path + # bound_paths + # cell_meta_path + # agg_expr_path + # panel_meta_path + path_list <- .read_xenium_folder( + xenium_dir = xenium_dir, + data_to_use = data_to_use, + bounds_to_load = bounds_to_load, + load_format = load_format, + h5_expression = h5_expression, + verbose = verbose + ) - GiottoUtils::vmsg("Loading transcript level info...", .v = verbose) - tx_dt <- data.table::fread(path_list$tx_path[[1]], nThread = cores) - data.table::setnames( - x = tx_dt, - old = c("feature_name", "x_location", "y_location"), - new = c("feat_ID", "x", "y") - ) - GiottoUtils::vmsg("Loading boundary info...", .v = verbose) - bound_dt_list <- lapply( - path_list$bound_paths, - function(x) data.table::fread(x[[1]], nThread = cores) - ) - } + # 2. load in data - # **** aggregate info **** - GiottoUtils::vmsg("loading cell metadata...", .v = verbose) - cell_meta <- data.table::fread( - path_list$cell_meta_path[[1]], nThread = cores) + # data_list contents: + # feat_meta + # tx_dt + # bound_dt_list + # cell_meta + # agg_expr + data_list <- .load_xenium_folder( + path_list = path_list, + load_format = load_format, + data_to_use = data_to_use, + h5_expression = h5_expression, + h5_gene_ids = h5_gene_ids, + gene_column_index = gene_column_index, + cores = cores, + verbose = verbose + ) - if (data_to_use == "aggregate") { - GiottoUtils::vmsg("Loading aggregated expression...", .v = verbose) - if (isTRUE(h5_expression)) { - agg_expr <- get10Xmatrix_h5( - path_to_data = path_list$agg_expr_path, - gene_ids = h5_gene_ids, - remove_zero_rows = TRUE, - split_by_type = TRUE - ) - } else { - agg_expr <- get10Xmatrix( - path_to_data = path_list$agg_expr_path, - gene_column_index = gene_column_index, - remove_zero_rows = TRUE, - split_by_type = TRUE + + # TODO load images + + + # 3. Create giotto objects + + if (data_to_use == "subcellular") { + # ** feat type search keys ** + if (is.null(key_list)) { + key_list <- list( + blank_code = "BLANK_", + neg_code = "NegControlCodeword_", + neg_probe = c("NegControlProbe_|antisense_") ) } + + # needed: + # feat_meta + # tx_dt + # bound_dt_list + xenium_gobject <- .createGiottoXeniumObject_subcellular( + data_list = data_list, + qv_threshold = qv_threshold, + key_list = key_list, + instructions = instructions, + cores = cores, + verbose = verbose + ) } - data_list <- list( - "feat_meta" = feat_meta, - "tx_dt" = tx_dt, - "bound_dt_list" = bound_dt_list, - "cell_meta" = cell_meta, - "agg_expr" = agg_expr - ) + if (data_to_use == "aggregate") { + # needed: + # feat_meta + # cell_meta + # agg_expr + # optional? + # tx_dt + # bound_dt_list + xenium_gobject <- .createGiottoXeniumObject_aggregate( + data_list = data_list, + instructions = instructions, + cores = cores, + verbose = verbose + ) + } - return(data_list) + return(xenium_gobject) } -#' @describeIn load_xenium_folder Load from parquet files +#' @title Create a Xenium Giotto object from subcellular info +#' @name .createGiottoXeniumObject_subcellular +#' @description Subcellular workflow for createGiottoXeniumObject +#' @param data_list list of data loaded by \code{\link{.load_xenium_folder}} +#' @param key_list regex-based search keys for feature IDs to allow separation +#' into separate giottoPoints objects by feat_type +#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included +#' as a subcellular transcript detection (default = 20) +#' @inheritParams get10Xmatrix +#' @inheritParams GiottoClass::createGiottoObjectSubcellular +#' @returns giotto object +#' @seealso createGiottoXeniumObject .createGiottoXeniumObject_aggregate #' @keywords internal -.load_xenium_folder_parquet <- function( - path_list, - cores, - data_to_use = "subcellular", - h5_expression = FALSE, - h5_gene_ids = "symbols", - gene_column_index = 1, +.createGiottoXeniumObject_subcellular <- function( + data_list, + key_list = NULL, + qv_threshold = 20, + instructions = NULL, + cores = NA, verbose = TRUE ) { - # initialize return vars - feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL - # dplyr variable - cell_id <- NULL + # data.table vars + qv <- NULL - vmsg("Loading feature metadata...", .v = verbose) - # updated for pipeline v1.6 json format - fdata_path <- path_list$panel_meta_path[[1]] - fdata_ext <- GiottoUtils::file_extension(fdata_path) - if ("json" %in% fdata_ext) { - feat_meta <- .load_xenium_panel_json( - path = fdata_path, gene_ids = h5_gene_ids) - } else { - feat_meta <- data.table::fread(fdata_path, nThread = cores) - colnames(feat_meta)[[1]] <- "feat_ID" - } + # Unpack data_list info + feat_meta <- data_list$feat_meta + tx_dt <- data_list$tx_dt + bound_dt_list <- data_list$bound_dt_list - # **** subcellular info **** - if (data_to_use == "subcellular") { - # define for data.table - transcript_id <- feature_name <- NULL + # define for data.table + cell_id <- feat_ID <- feature_name <- NULL - # append missing QC probe info to feat_meta - if (isTRUE(h5_expression)) { - h5 <- hdf5r::H5File$new(path_list$agg_expr_path) - tryCatch({ - root <- names(h5) - feature_id <- h5[[paste0(root, "/features/id")]][] - feature_info <- h5[[paste0(root, "/features/feature_type")]][] - feature_names <- h5[[paste0(root, "/features/name")]][] - features_dt <- data.table::data.table( - "id" = feature_id, - "name" = feature_names, - "feature_type" = feature_info - ) - }, finally = { - h5$close_all() - }) - } else { - features_dt <- arrow::read_tsv_arrow(paste0( - path_list$agg_expr_path, "/features.tsv.gz"), - col_names = FALSE - ) %>% - data.table::setDT() - } - colnames(features_dt) <- c("id", "feat_ID", "feat_class") - feat_meta <- merge(features_dt[ - , c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") + vmsg("Building subcellular giotto object...", .v = verbose) + # Giotto points object + vmsg("> points data prep...", .v = verbose) - vmsg("Loading transcript level info...", .v = verbose) - tx_dt <- arrow::read_parquet( - file = path_list$tx_path[[1]], - as_data_frame = FALSE - ) %>% - dplyr::mutate( - transcript_id = cast(transcript_id, arrow::string())) %>% - dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% - dplyr::mutate( - feature_name = cast(feature_name, arrow::string())) %>% - as.data.frame() %>% - data.table::setDT() - data.table::setnames( - x = tx_dt, - old = c("feature_name", "x_location", "y_location"), - new = c("feat_ID", "x", "y") + # filter by qv_threshold + vmsg("> filtering feature detections for Phred score >= ", + qv_threshold, .v = verbose) + n_before <- tx_dt[, .N] + tx_dt_filtered <- tx_dt[qv >= qv_threshold] + n_after <- tx_dt_filtered[, .N] + + if (verbose) { + cat( + "Number of feature points removed: ", + n_before - n_after, + " out of ", n_before, "\n" ) - vmsg("Loading boundary info...", .v = verbose) - bound_dt_list <- lapply(path_list$bound_paths, function(x) { - arrow::read_parquet(file = x[[1]], as_data_frame = FALSE) %>% - dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% - as.data.frame() %>% - data.table::setDT() - }) } - # **** aggregate info **** - if (data_to_use == "aggregate") { - vmsg("Loading cell metadata...", .v = verbose) - cell_meta <- arrow::read_parquet( - file = path_list$cell_meta_path[[1]], - as_data_frame = FALSE - ) %>% - dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% - as.data.frame() %>% - data.table::setDT() - # NOTE: no parquet for agg_expr. - vmsg("Loading aggregated expression...", .v = verbose) - if (isTRUE(h5_expression)) { - agg_expr <- get10Xmatrix_h5( - path_to_data = path_list$agg_expr_path, - gene_ids = h5_gene_ids, - remove_zero_rows = TRUE, - split_by_type = TRUE - ) - } else { - agg_expr <- get10Xmatrix( - path_to_data = path_list$agg_expr_path, - gene_column_index = gene_column_index, - remove_zero_rows = TRUE, - split_by_type = TRUE - ) - } - } + vmsg("> splitting detections by feat_type", .v = verbose) + # discover feat_IDs for each feat_type + all_IDs <- tx_dt_filtered[, unique(feat_ID)] + feat_types_IDs <- lapply( + key_list, function(x) all_IDs[grepl(pattern = x, all_IDs)]) + rna <- list("rna" = all_IDs[!all_IDs %in% unlist(feat_types_IDs)]) + feat_types_IDs <- append(rna, feat_types_IDs) - data_list <- list( - "feat_meta" = feat_meta, - "tx_dt" = tx_dt, - "bound_dt_list" = bound_dt_list, - "cell_meta" = cell_meta, - "agg_expr" = agg_expr + # separate detections by feature type + points_list <- lapply( + feat_types_IDs, + function(types) { + tx_dt_filtered[feat_ID %in% types] + } ) - return(data_list) -} - - - -.load_xenium_panel_json <- function(path, gene_ids = "symbols") { - gene_ids <- match.arg(gene_ids, c("symbols", "ensembl")) - - # tested on v1.6 - j <- jsonlite::fromJSON(path) - # j$metadata # dataset meta - # j$payload # main content - # j$payload$chemistry # panel chemistry used - # j$payload$customer # panel customer - # j$payload$designer # panel designer - # j$payload$spec_version # versioning - # j$payload$panel # dataset panel stats + # Giotto polygons object + vmsg("> polygons data prep...", .v = verbose) + polys_list <- lapply( + bound_dt_list, + function(bound_type) { + bound_type[, cell_id := as.character(cell_id)] + } + ) - panel_info <- j$payload$targets$type %>% - data.table::as.data.table() + xenium_gobject <- createGiottoObjectSubcellular( + gpoints = points_list, + gpolygons = polys_list, + instructions = instructions, + cores = cores, + verbose = verbose + ) - switch(gene_ids, - "symbols" = data.table::setnames( - panel_info, - old = c("data.id", "data.name", "descriptor"), - new = c("ensembl", "feat_ID", "type") - ), - "ensembl" = data.table::setnames( - panel_info, - old = c("data.id", "data.name", "descriptor"), - new = c("feat_ID", "symbol", "type") - ) + # generate centroids + vmsg("Calculating polygon centroids...", .v = verbose) + xenium_gobject <- addSpatialCentroidLocations( + xenium_gobject, + poly_info = c(names(bound_dt_list)), + provenance = as.list(names(bound_dt_list)) ) - return(panel_info) -} + return(xenium_gobject) +} -# OLD #### -#' @title Create 10x Xenium Giotto Object -#' @name createGiottoXeniumObject -#' @description Given the path to a Xenium experiment output folder, creates a -#' Giotto object -#' @param xenium_dir full path to the exported xenium directory -#' @param data_to_use which type(s) of expression data to build the gobject with -#' (e.g. default: \strong{'subcellular'}, 'aggregate', or 'all') -#' @param load_format files formats from which to load the data. Either `csv` or -#' `parquet` currently supported. -#' @param h5_expression (boolean) whether to load cell_feature_matrix from .h5 -#' file. Default is \code{TRUE} -#' @param h5_gene_ids use gene symbols (default) or ensembl ids for the .h5 gene -#' expression matrix -#' @param bounds_to_load vector of boundary information to load -#' (e.g. \code{'cell'} -#' or \code{'nucleus'} by themselves or \code{c('cell', 'nucleus')} to load both -#' at the same time.) -#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included -#' as a subcellular transcript detection (default = 20) -#' @param key_list (advanced) list of grep-based keywords to split the -#' subcellular feature detections by feature type. See details +#' @title Create a Xenium Giotto object from aggregate info +#' @name .createGiottoXeniumObject_aggregate +#' @description Aggregate workflow for createGiottoXeniumObject +#' @param data_list list of data loaded by \code{.load_xenium_folder} #' @inheritParams get10Xmatrix #' @inheritParams GiottoClass::createGiottoObjectSubcellular #' @returns giotto object -#' @details -#' -#' [\strong{QC feature types}] -#' Xenium provides info on feature detections that include more than only the -#' Gene Expression specific probes. Additional probes for QC are included: -#' \emph{blank codeword}, \emph{negative control codeword}, and -#' \emph{negative control probe}. These additional QC probes each occupy and -#' are treated as their own feature types so that they can largely remain -#' independent of the gene expression information. -#' -#' [\strong{key_list}] -#' Related to \code{data_to_use = 'subcellular'} workflow only: -#' Additional QC probe information is in the subcellular feature detections -#' information and must be separated from the gene expression information -#' during processing. -#' The QC probes have prefixes that allow them to be selected from the rest of -#' the feature IDs. -#' Giotto uses a named list of keywords (\code{key_list}) to select these QC -#' probes, with the list names being the names that will be assigned as the -#' feature type of these feature detections. The default list is used when -#' \code{key_list} = NULL. -#' -#' Default list: -#' \preformatted{ -#' list(blank_code = 'BLANK_', -#' neg_code = 'NegControlCodeword_', -#' neg_probe = c('NegControlProbe_|antisense_')) -#' } -#' -#' The Gene expression subset is accepted as the subset of feat_IDs that do not -#' map to any of the keys. -#' -#' @export -createGiottoXeniumObject <- function( - xenium_dir, - data_to_use = c("subcellular", "aggregate"), - load_format = "csv", - h5_expression = TRUE, - h5_gene_ids = c("symbols", "ensembl"), - gene_column_index = 1, - bounds_to_load = c("cell"), - qv_threshold = 20, - key_list = NULL, +#' @seealso createGiottoXeniumObject .createGiottoXeniumObject_subcellular +#' @keywords internal +.createGiottoXeniumObject_aggregate <- function( + data_list, + # include_analysis = FALSE, instructions = NULL, cores = NA, verbose = TRUE ) { - # 0. setup - xenium_dir <- path.expand(xenium_dir) - - # Determine data to load - data_to_use <- match.arg( - arg = data_to_use, choices = c("subcellular", "aggregate")) - - # Determine load formats - load_format <- "csv" # TODO Remove this and add as param once other options - # are available - load_format <- match.arg( - arg = load_format, choices = c("csv", "parquet", "zarr")) + # Unpack data_list info + feat_meta <- data_list$feat_meta + cell_meta <- data_list$cell_meta + agg_expr <- data_list$agg_expr - # set number of cores automatically, but with limit of 10 - cores <- determine_cores(cores) - data.table::setDTthreads(threads = cores) + # define for data.table + cell_ID <- x_centroid <- y_centroid <- NULL - # 1. detect xenium folder and find filepaths to load + # clean up names for aggregate matrices + names(agg_expr) <- gsub(pattern = " ", replacement = "_", names(agg_expr)) + geneExpMat <- which(names(agg_expr) == "Gene_Expression") + names(agg_expr)[[geneExpMat]] <- "raw" - # path_list contents: - # tx_path - # bound_paths - # cell_meta_path - # agg_expr_path - # panel_meta_path - path_list <- .read_xenium_folder( - xenium_dir = xenium_dir, - data_to_use = data_to_use, - bounds_to_load = bounds_to_load, - load_format = load_format, - h5_expression = h5_expression, - verbose = verbose - ) + # set cell_id as character + cell_meta <- cell_meta[, data.table::setnames(.SD, "cell_id", "cell_ID")] + cell_meta <- cell_meta[, cell_ID := as.character(cell_ID)] + # set up spatial locations + agg_spatlocs <- cell_meta[, .(x_centroid, y_centroid, cell_ID)] - # 2. load in data + # set up metadata + agg_meta <- cell_meta[, !c("x_centroid", "y_centroid")] - # data_list contents: - # feat_meta - # tx_dt - # bound_dt_list - # cell_meta - # agg_expr - data_list <- .load_xenium_folder( - path_list = path_list, - load_format = load_format, - data_to_use = data_to_use, - h5_expression = h5_expression, - h5_gene_ids = h5_gene_ids, - gene_column_index = gene_column_index, + vmsg("Building aggregate giotto object...", .v = verbose) + xenium_gobject <- createGiottoObject( + expression = agg_expr, + spatial_locs = agg_spatlocs, + instructions = instructions, cores = cores, verbose = verbose ) + # append aggregate metadata + xenium_gobject <- addCellMetadata( + gobject = xenium_gobject, + new_metadata = agg_meta, + by_column = TRUE, + column_cell_ID = "cell_ID" + ) + xenium_gobject <- addFeatMetadata( + gobject = xenium_gobject, + new_metadata = feat_meta, + by_column = TRUE, + column_feat_ID = "feat_ID" + ) - # TODO load images - - - # 3. Create giotto objects - - if (data_to_use == "subcellular") { - # ** feat type search keys ** - if (is.null(key_list)) { - key_list <- list( - blank_code = "BLANK_", - neg_code = "NegControlCodeword_", - neg_probe = c("NegControlProbe_|antisense_") - ) - } - - # needed: - # feat_meta - # tx_dt - # bound_dt_list - xenium_gobject <- .createGiottoXeniumObject_subcellular( - data_list = data_list, - qv_threshold = qv_threshold, - key_list = key_list, - instructions = instructions, - cores = cores, - verbose = verbose - ) - } - - if (data_to_use == "aggregate") { - # needed: - # feat_meta - # cell_meta - # agg_expr - # optional? - # tx_dt - # bound_dt_list - xenium_gobject <- .createGiottoXeniumObject_aggregate( - data_list = data_list, - instructions = instructions, - cores = cores, - verbose = verbose - ) - } - - return(xenium_gobject) -} - - - - -#' @title Create a Xenium Giotto object from subcellular info -#' @name .createGiottoXeniumObject_subcellular -#' @description Subcellular workflow for createGiottoXeniumObject -#' @param data_list list of data loaded by \code{\link{.load_xenium_folder}} -#' @param key_list regex-based search keys for feature IDs to allow separation -#' into separate giottoPoints objects by feat_type -#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included -#' as a subcellular transcript detection (default = 20) -#' @inheritParams get10Xmatrix -#' @inheritParams GiottoClass::createGiottoObjectSubcellular -#' @returns giotto object -#' @seealso createGiottoXeniumObject .createGiottoXeniumObject_aggregate -#' @keywords internal -.createGiottoXeniumObject_subcellular <- function( - data_list, - key_list = NULL, - qv_threshold = 20, - instructions = NULL, - cores = NA, - verbose = TRUE -) { - # data.table vars - qv <- NULL - - # Unpack data_list info - feat_meta <- data_list$feat_meta - tx_dt <- data_list$tx_dt - bound_dt_list <- data_list$bound_dt_list - - # define for data.table - cell_id <- feat_ID <- feature_name <- NULL - - vmsg("Building subcellular giotto object...", .v = verbose) - # Giotto points object - vmsg("> points data prep...", .v = verbose) - - # filter by qv_threshold - vmsg("> filtering feature detections for Phred score >= ", - qv_threshold, .v = verbose) - n_before <- tx_dt[, .N] - tx_dt_filtered <- tx_dt[qv >= qv_threshold] - n_after <- tx_dt_filtered[, .N] - - if (verbose) { - cat( - "Number of feature points removed: ", - n_before - n_after, - " out of ", n_before, "\n" - ) - } - - vmsg("> splitting detections by feat_type", .v = verbose) - # discover feat_IDs for each feat_type - all_IDs <- tx_dt_filtered[, unique(feat_ID)] - feat_types_IDs <- lapply( - key_list, function(x) all_IDs[grepl(pattern = x, all_IDs)]) - rna <- list("rna" = all_IDs[!all_IDs %in% unlist(feat_types_IDs)]) - feat_types_IDs <- append(rna, feat_types_IDs) - - # separate detections by feature type - points_list <- lapply( - feat_types_IDs, - function(types) { - tx_dt_filtered[feat_ID %in% types] - } - ) - - # Giotto polygons object - vmsg("> polygons data prep...", .v = verbose) - polys_list <- lapply( - bound_dt_list, - function(bound_type) { - bound_type[, cell_id := as.character(cell_id)] - } - ) - - xenium_gobject <- createGiottoObjectSubcellular( - gpoints = points_list, - gpolygons = polys_list, - instructions = instructions, - cores = cores, - verbose = verbose - ) - - # generate centroids - vmsg("Calculating polygon centroids...", .v = verbose) - xenium_gobject <- addSpatialCentroidLocations( - xenium_gobject, - poly_info = c(names(bound_dt_list)), - provenance = as.list(names(bound_dt_list)) - ) - - return(xenium_gobject) -} - - - - - -#' @title Create a Xenium Giotto object from aggregate info -#' @name .createGiottoXeniumObject_aggregate -#' @description Aggregate workflow for createGiottoXeniumObject -#' @param data_list list of data loaded by \code{.load_xenium_folder} -#' @inheritParams get10Xmatrix -#' @inheritParams GiottoClass::createGiottoObjectSubcellular -#' @returns giotto object -#' @seealso createGiottoXeniumObject .createGiottoXeniumObject_subcellular -#' @keywords internal -.createGiottoXeniumObject_aggregate <- function( - data_list, - # include_analysis = FALSE, - instructions = NULL, - cores = NA, - verbose = TRUE -) { - # Unpack data_list info - feat_meta <- data_list$feat_meta - cell_meta <- data_list$cell_meta - agg_expr <- data_list$agg_expr - - # define for data.table - cell_ID <- x_centroid <- y_centroid <- NULL - - # clean up names for aggregate matrices - names(agg_expr) <- gsub(pattern = " ", replacement = "_", names(agg_expr)) - geneExpMat <- which(names(agg_expr) == "Gene_Expression") - names(agg_expr)[[geneExpMat]] <- "raw" - - # set cell_id as character - cell_meta <- cell_meta[, data.table::setnames(.SD, "cell_id", "cell_ID")] - cell_meta <- cell_meta[, cell_ID := as.character(cell_ID)] - - # set up spatial locations - agg_spatlocs <- cell_meta[, .(x_centroid, y_centroid, cell_ID)] - - # set up metadata - agg_meta <- cell_meta[, !c("x_centroid", "y_centroid")] - - vmsg("Building aggregate giotto object...", .v = verbose) - xenium_gobject <- createGiottoObject( - expression = agg_expr, - spatial_locs = agg_spatlocs, - instructions = instructions, - cores = cores, - verbose = verbose - ) - - # append aggregate metadata - xenium_gobject <- addCellMetadata( - gobject = xenium_gobject, - new_metadata = agg_meta, - by_column = TRUE, - column_cell_ID = "cell_ID" - ) - xenium_gobject <- addFeatMetadata( - gobject = xenium_gobject, - new_metadata = feat_meta, - by_column = TRUE, - column_feat_ID = "feat_ID" - ) - - return(xenium_gobject) -} + return(xenium_gobject) +} @@ -1636,4 +1518,280 @@ createGiottoXeniumObject <- function( return(path_list) } +#' @title Load xenium data from folder +#' @name load_xenium_folder +#' @param path_list list of full filepaths from .read_xenium_folder +#' @inheritParams createGiottoXeniumObject +#' @returns list of loaded in xenium data +NULL + +#' @rdname load_xenium_folder +#' @keywords internal +.load_xenium_folder <- function( + path_list, + load_format = "csv", + data_to_use = "subcellular", + h5_expression = "FALSE", + h5_gene_ids = "symbols", + gene_column_index = 1, + cores, + verbose = TRUE +) { + data_list <- switch(load_format, + "csv" = .load_xenium_folder_csv( + path_list = path_list, + data_to_use = data_to_use, + h5_expression = h5_expression, + h5_gene_ids = h5_gene_ids, + gene_column_index = gene_column_index, + cores = cores, + verbose = verbose + ), + "parquet" = .load_xenium_folder_parquet( + path_list = path_list, + data_to_use = data_to_use, + h5_expression = h5_expression, + h5_gene_ids = h5_gene_ids, + gene_column_index = gene_column_index, + cores = cores, + verbose = verbose + ), + "zarr" = stop("load_format zarr:\n Not yet implemented", call. = FALSE) + ) + + return(data_list) +} + +#' @describeIn load_xenium_folder Load from csv files +#' @keywords internal +.load_xenium_folder_csv <- function( + path_list, + cores, + data_to_use = "subcellular", + h5_expression = FALSE, + h5_gene_ids = "symbols", + gene_column_index = 1, + verbose = TRUE +) { + # initialize return vars + feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL + + vmsg("Loading feature metadata...", .v = verbose) + # updated for pipeline v1.6 json format + fdata_path <- path_list$panel_meta_path[[1]] + fdata_ext <- GiottoUtils::file_extension(fdata_path) + if ("json" %in% fdata_ext) { + feat_meta <- .load_xenium_panel_json(path = fdata_path, + gene_ids = h5_gene_ids) + } else { + feat_meta <- data.table::fread(fdata_path, nThread = cores) + colnames(feat_meta)[[1]] <- "feat_ID" + } + + # **** subcellular info **** + if (data_to_use == "subcellular") { + # append missing QC probe info to feat_meta + if (isTRUE(h5_expression)) { + h5 <- hdf5r::H5File$new(path_list$agg_expr_path) + tryCatch({ + root <- names(h5) + feature_id <- h5[[paste0(root, "/features/id")]][] + feature_info <- h5[[paste0(root, "/features/feature_type")]][] + feature_names <- h5[[paste0(root, "/features/name")]][] + features_dt <- data.table::data.table( + "id" = feature_id, + "name" = feature_names, + "feature_type" = feature_info + ) + }, finally = { + h5$close_all() + }) + } else { + features_dt <- data.table::fread( + paste0(path_list$agg_expr_path, "/features.tsv.gz"), + header = FALSE + ) + } + colnames(features_dt) <- c("id", "feat_ID", "feat_class") + feat_meta <- merge( + features_dt[, c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") + + GiottoUtils::vmsg("Loading transcript level info...", .v = verbose) + tx_dt <- data.table::fread(path_list$tx_path[[1]], nThread = cores) + data.table::setnames( + x = tx_dt, + old = c("feature_name", "x_location", "y_location"), + new = c("feat_ID", "x", "y") + ) + + GiottoUtils::vmsg("Loading boundary info...", .v = verbose) + bound_dt_list <- lapply( + path_list$bound_paths, + function(x) data.table::fread(x[[1]], nThread = cores) + ) + } + + # **** aggregate info **** + GiottoUtils::vmsg("loading cell metadata...", .v = verbose) + cell_meta <- data.table::fread( + path_list$cell_meta_path[[1]], nThread = cores) + + if (data_to_use == "aggregate") { + GiottoUtils::vmsg("Loading aggregated expression...", .v = verbose) + if (isTRUE(h5_expression)) { + agg_expr <- get10Xmatrix_h5( + path_to_data = path_list$agg_expr_path, + gene_ids = h5_gene_ids, + remove_zero_rows = TRUE, + split_by_type = TRUE + ) + } else { + agg_expr <- get10Xmatrix( + path_to_data = path_list$agg_expr_path, + gene_column_index = gene_column_index, + remove_zero_rows = TRUE, + split_by_type = TRUE + ) + } + } + + data_list <- list( + "feat_meta" = feat_meta, + "tx_dt" = tx_dt, + "bound_dt_list" = bound_dt_list, + "cell_meta" = cell_meta, + "agg_expr" = agg_expr + ) + + return(data_list) +} + + + + +#' @describeIn load_xenium_folder Load from parquet files +#' @keywords internal +.load_xenium_folder_parquet <- function( + path_list, + cores, + data_to_use = "subcellular", + h5_expression = FALSE, + h5_gene_ids = "symbols", + gene_column_index = 1, + verbose = TRUE +) { + # initialize return vars + feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL + # dplyr variable + cell_id <- NULL + + vmsg("Loading feature metadata...", .v = verbose) + # updated for pipeline v1.6 json format + fdata_path <- path_list$panel_meta_path[[1]] + fdata_ext <- GiottoUtils::file_extension(fdata_path) + if ("json" %in% fdata_ext) { + feat_meta <- .load_xenium_panel_json( + path = fdata_path, gene_ids = h5_gene_ids) + } else { + feat_meta <- data.table::fread(fdata_path, nThread = cores) + colnames(feat_meta)[[1]] <- "feat_ID" + } + + # **** subcellular info **** + if (data_to_use == "subcellular") { + # define for data.table + transcript_id <- feature_name <- NULL + + # append missing QC probe info to feat_meta + if (isTRUE(h5_expression)) { + h5 <- hdf5r::H5File$new(path_list$agg_expr_path) + tryCatch({ + root <- names(h5) + feature_id <- h5[[paste0(root, "/features/id")]][] + feature_info <- h5[[paste0(root, "/features/feature_type")]][] + feature_names <- h5[[paste0(root, "/features/name")]][] + features_dt <- data.table::data.table( + "id" = feature_id, + "name" = feature_names, + "feature_type" = feature_info + ) + }, finally = { + h5$close_all() + }) + } else { + features_dt <- arrow::read_tsv_arrow(paste0( + path_list$agg_expr_path, "/features.tsv.gz"), + col_names = FALSE + ) %>% + data.table::setDT() + } + colnames(features_dt) <- c("id", "feat_ID", "feat_class") + feat_meta <- merge(features_dt[ + , c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") + + vmsg("Loading transcript level info...", .v = verbose) + tx_dt <- arrow::read_parquet( + file = path_list$tx_path[[1]], + as_data_frame = FALSE + ) %>% + dplyr::mutate( + transcript_id = cast(transcript_id, arrow::string())) %>% + dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% + dplyr::mutate( + feature_name = cast(feature_name, arrow::string())) %>% + as.data.frame() %>% + data.table::setDT() + data.table::setnames( + x = tx_dt, + old = c("feature_name", "x_location", "y_location"), + new = c("feat_ID", "x", "y") + ) + vmsg("Loading boundary info...", .v = verbose) + bound_dt_list <- lapply(path_list$bound_paths, function(x) { + arrow::read_parquet(file = x[[1]], as_data_frame = FALSE) %>% + dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% + as.data.frame() %>% + data.table::setDT() + }) + } + # **** aggregate info **** + if (data_to_use == "aggregate") { + vmsg("Loading cell metadata...", .v = verbose) + cell_meta <- arrow::read_parquet( + file = path_list$cell_meta_path[[1]], + as_data_frame = FALSE + ) %>% + dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% + as.data.frame() %>% + data.table::setDT() + + # NOTE: no parquet for agg_expr. + vmsg("Loading aggregated expression...", .v = verbose) + if (isTRUE(h5_expression)) { + agg_expr <- get10Xmatrix_h5( + path_to_data = path_list$agg_expr_path, + gene_ids = h5_gene_ids, + remove_zero_rows = TRUE, + split_by_type = TRUE + ) + } else { + agg_expr <- get10Xmatrix( + path_to_data = path_list$agg_expr_path, + gene_column_index = gene_column_index, + remove_zero_rows = TRUE, + split_by_type = TRUE + ) + } + } + + data_list <- list( + "feat_meta" = feat_meta, + "tx_dt" = tx_dt, + "bound_dt_list" = bound_dt_list, + "cell_meta" = cell_meta, + "agg_expr" = agg_expr + ) + + return(data_list) +} From 856264374fe298dda49799f6cb65c09f4ba4d9c0 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 07:30:37 -0400 Subject: [PATCH 03/33] feat: add minimal `importXenium()` and fix bug --- R/convenience_xenium.R | 38 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index cc61d87e4..6a2953fee 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -459,6 +459,36 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { +# CREATE READER #### + +#' @title Import a 10X Xenium Assay +#' @name importXenium +#' @description +#' Giotto import functionalities for Xenium datasets. This function creates a +#' `XeniumReader` instance that has convenient reader functions for converting +#' individual pieces of Xenium data into Giotto-compatible representations. +#' +#' These functions should have all param values provided as defaults, but +#' can be flexibly modified to do things such as look in alternative +#' directories or paths +#' @param xenium_dir Xenium output directory +#' @returns `XeniumReader` object +#' @export +importXenium <- function( + xenium_dir = NULL, qv_threshold = 20, +) { + a <- list(Class = "XeniumReader") + if (!is.null(xenium_dir)) { + a$xenium_dir <- xenium_dir + } + + do.call(new, args = a) +} + + + + + # MODULAR #### @@ -683,9 +713,13 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { vmsg(.v = verbose, .is_debug = TRUE, path) verbose <- verbose %null% TRUE cx <- switch(e, - "csv" = do.call(.xenium_cellmeta_csv, args = c(a, list(cores = cores))), - "parquet" = do.call(.xenium_cellmeta_parquet, args = a) + "csv" = do.call( + .xenium_cellmeta_csv, + args = c(a, list(cores = cores)) + ), + "parquet" = do.call(.xenium_cellmeta_parquet, args = a) ) + data.table::setnames(cx, "cell_id", "cell_ID") cx <- createCellMetaObj( metadata = cx, From a26a7541e6818962558f91cf731b3e0fdc433fb7 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 07:38:00 -0400 Subject: [PATCH 04/33] fix bug in import xenium --- R/convenience_xenium.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 6a2953fee..cff626bff 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -475,12 +475,13 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { #' @returns `XeniumReader` object #' @export importXenium <- function( - xenium_dir = NULL, qv_threshold = 20, + xenium_dir = NULL, qv_threshold = 20 ) { a <- list(Class = "XeniumReader") if (!is.null(xenium_dir)) { a$xenium_dir <- xenium_dir } + a$qv_threshold <- qv_threshold do.call(new, args = a) } From 718fc013b34d2bb80bcf1f97246a32a63445c327 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 07:38:24 -0400 Subject: [PATCH 05/33] chore: document --- NAMESPACE | 1 + man/importXenium.Rd | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) create mode 100644 man/importXenium.Rd diff --git a/NAMESPACE b/NAMESPACE index f770bc45f..b9e278ca0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -264,6 +264,7 @@ export(hyperGeometricEnrich) export(identifyTMAcores) export(importCosMx) export(importVisiumHD) +export(importXenium) export(initHMRF_V2) export(insertCrossSectionFeatPlot3D) export(insertCrossSectionSpatPlot3D) diff --git a/man/importXenium.Rd b/man/importXenium.Rd new file mode 100644 index 000000000..190c48a35 --- /dev/null +++ b/man/importXenium.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/convenience_xenium.R +\name{importXenium} +\alias{importXenium} +\title{Import a 10X Xenium Assay} +\usage{ +importXenium(xenium_dir = NULL, qv_threshold = 20) +} +\arguments{ +\item{xenium_dir}{Xenium output directory} +} +\value{ +`XeniumReader` object +} +\description{ +Giotto import functionalities for Xenium datasets. This function creates a +`XeniumReader` instance that has convenient reader functions for converting +individual pieces of Xenium data into Giotto-compatible representations. + +These functions should have all param values provided as defaults, but +can be flexibly modified to do things such as look in alternative +directories or paths +} From 3d74951d3e3f1d6e93356da636bdd44819efcaa4 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 07:42:35 -0400 Subject: [PATCH 06/33] Update convenience_xenium.R --- R/convenience_xenium.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index cff626bff..3c03961a9 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -481,7 +481,7 @@ importXenium <- function( if (!is.null(xenium_dir)) { a$xenium_dir <- xenium_dir } - a$qv_threshold <- qv_threshold + a$qv <- qv_threshold do.call(new, args = a) } From 9aaf2b77f12d70ca2d4c07788537f463620f29b2 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 07:54:16 -0400 Subject: [PATCH 07/33] fix .xenium path --- R/convenience_xenium.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 3c03961a9..97a86ae96 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -161,7 +161,7 @@ setMethod( if (length(obj@micron) == 0) { # if no value already set if (!is.null(experiment_info_path)) { obj@micron <- jsonlite::fromJSON( - manifest$experiment.xenium)$pixel_size + experiment_info_path)$pixel_size } else { warning(wrap_txt("No .xenium file found. Guessing 0.2125 as micron scaling")) @@ -472,6 +472,8 @@ setMethod("$<-", signature("XeniumReader"), function(x, name, value) { #' can be flexibly modified to do things such as look in alternative #' directories or paths #' @param xenium_dir Xenium output directory +#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included +#' as a subcellular transcript detection (default = 20) #' @returns `XeniumReader` object #' @export importXenium <- function( From 4023e8b1a870d6a7b90573a594d27daa5cfe635d Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 07:54:32 -0400 Subject: [PATCH 08/33] document --- man/importXenium.Rd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/man/importXenium.Rd b/man/importXenium.Rd index 190c48a35..144db176d 100644 --- a/man/importXenium.Rd +++ b/man/importXenium.Rd @@ -8,6 +8,9 @@ importXenium(xenium_dir = NULL, qv_threshold = 20) } \arguments{ \item{xenium_dir}{Xenium output directory} + +\item{qv_threshold}{Minimum Phred-scaled quality score cutoff to be included +as a subcellular transcript detection (default = 20)} } \value{ `XeniumReader` object From 63c95920fc98ddc65ef73b6d6a35bb52b75ff399 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 08:03:57 -0400 Subject: [PATCH 09/33] fix feat reading --- R/convenience_xenium.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 97a86ae96..a305fbb95 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -237,13 +237,14 @@ setMethod( # load featmeta fmeta_fun <- function( path = panel_meta_path, + gene_ids = "symbols", dropcols = c(), cores = determine_cores(), verbose = NULL ) { .xenium_featmeta( path = path, - gene_ids, + gene_ids = gene_ids, dropcols = dropcols, verbose = verbose ) From 72f3e5d6612e2cd7d0ccf482f42c88eaed6c8d70 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 08:15:07 -0400 Subject: [PATCH 10/33] add debug messages --- R/convenience_xenium.R | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index a305fbb95..791d004f6 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -246,6 +246,7 @@ setMethod( path = path, gene_ids = gene_ids, dropcols = dropcols, + cores = cores, verbose = verbose ) } @@ -525,6 +526,7 @@ importXenium <- function( checkmate::assert_file_exists(path) e <- file_extension(path) %>% head(1L) %>% tolower() vmsg(.v = verbose, .is_debug = TRUE, "[TX_READ] FMT =", e) + vmsg(.v = verbose, .is_debug = TRUE, path) # read in as data.table a <- list( @@ -658,6 +660,8 @@ importXenium <- function( a <- list(path = path) vmsg("Loading boundary info...", .v = verbose) + vmsg(.v = verbose, .is_debug = TRUE, "[POLY_READ] FMT =", e) + vmsg(.v = verbose, .is_debug = TRUE, path) # pass to specific load function based on file extension polys <- switch(e, "csv" = do.call(.xenium_poly_csv, args = c(a, list(cores = cores))), @@ -714,6 +718,7 @@ importXenium <- function( e <- file_extension(path) %>% head(1L) %>% tolower() a <- list(path = path, dropcols = dropcols) vmsg('Loading cell metadata...', .v = verbose) + vmsg(.v = verbose, .is_debug = TRUE, "[CMETA_READ] FMT =", e) vmsg(.v = verbose, .is_debug = TRUE, path) verbose <- verbose %null% TRUE cx <- switch(e, @@ -765,6 +770,7 @@ importXenium <- function( } checkmate::assert_file_exists(path) vmsg("Loading feature metadata...", .v = verbose) + vmsg(.v = verbose, .is_debug = TRUE, path) # updated for pipeline v1.6 json format fdata_ext <- GiottoUtils::file_extension(path) if ("json" %in% fdata_ext) { @@ -855,6 +861,7 @@ importXenium <- function( } vmsg("Loading 10x pre-aggregated expression...", .v = verbose) + vmsg(.v = verbose, .is_debug = TRUE, "[EXPR_READ] FMT =", e) vmsg(.v = verbose, .is_debug = TRUE, path) verbose <- verbose %null% TRUE ex <- switch(e, From 6ec475e894b54778f3d675917b4c42fd94ca928e Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 08:24:39 -0400 Subject: [PATCH 11/33] fix: xenium fmeta dropcols --- R/convenience_xenium.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 791d004f6..7e8935698 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -783,7 +783,8 @@ importXenium <- function( } dropcols <- dropcols[dropcols %in% colnames(feat_meta)] - feat_meta[, (dropcols) := NULL] # remove dropcols + # remove dropcols + if (length(dropcols) > 0L) feat_meta[, (dropcols) := NULL] fx <- createFeatMetaObj( metadata = feat_meta, From a0e3ee2af2ed9a51737ff5563811e1783896f12a Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 08:52:30 -0400 Subject: [PATCH 12/33] add y flipping to xenium vector data --- R/convenience_xenium.R | 49 +++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 7e8935698..b17304e39 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -171,27 +171,29 @@ setMethod( # transcripts load call tx_fun <- function( - path = tx_path, - feat_type = c( - "rna", - "NegControlProbe", - "UnassignedCodeword", - "NegControlCodeword" - ), - split_keyword = list( - "NegControlProbe", - "UnassignedCodeword", - "NegControlCodeword" - ), - dropcols = c(), - qv_threshold = obj@qv, - cores = determine_cores(), - verbose = NULL + path = tx_path, + feat_type = c( + "rna", + "NegControlProbe", + "UnassignedCodeword", + "NegControlCodeword" + ), + split_keyword = list( + "NegControlProbe", + "UnassignedCodeword", + "NegControlCodeword" + ), + flip_vertical = TRUE, + dropcols = c(), + qv_threshold = obj@qv, + cores = determine_cores(), + verbose = NULL ) { .xenium_transcript( path = path, feat_type = feat_type, split_keyword = split_keyword, + flip_vertical = flip_vertical, dropcols = dropcols, qv_threshold = qv_threshold, cores = cores, @@ -204,6 +206,7 @@ setMethod( poly_fun <- function( path = cell_bound_path, name = "cell", + flip_vertical = TRUE, calc_centroids = TRUE, cores = determine_cores(), verbose = NULL @@ -211,6 +214,7 @@ setMethod( .xenium_poly( path = path, name = name, + flip_vertical = flip_vertical, calc_centroids = calc_centroids, cores = cores, verbose = verbose @@ -512,6 +516,7 @@ importXenium <- function( "UnassignedCodeword", "NegControlCodeword" ), + flip_vertical = TRUE, dropcols = c(), qv_threshold = 20, cores = determine_cores(), @@ -537,6 +542,7 @@ importXenium <- function( ) vmsg("Loading transcript level info...", .v = verbose) # pass to specific reader fun based on filetype + # return as data.table with colnames `feat_ID`, `x`, `y` tx <- switch(e, "csv" = do.call(.xenium_transcript_csv, args = c(a, list(cores = cores))), @@ -544,6 +550,10 @@ importXenium <- function( "zarr" = stop('zarr not yet supported') ) + # flip values vertically + y <- NULL # NSE var + if (flip_vertical) tx[, y := -y] + # create gpoints gpointslist <- createGiottoPoints( x = tx, @@ -649,6 +659,7 @@ importXenium <- function( .xenium_poly <- function( path, name = "cell", + flip_vertical = TRUE, calc_centroids = TRUE, cores = determine_cores(), verbose = NULL @@ -663,12 +674,16 @@ importXenium <- function( vmsg(.v = verbose, .is_debug = TRUE, "[POLY_READ] FMT =", e) vmsg(.v = verbose, .is_debug = TRUE, path) # pass to specific load function based on file extension + # returns as data.table with colnames `cell_id`, `vertex_x`, `vertex_y` polys <- switch(e, "csv" = do.call(.xenium_poly_csv, args = c(a, list(cores = cores))), "parquet" = do.call(.xenium_poly_parquet, args = a), "zarr" = stop("zarr not yet supported") ) + vertex_y <- NULL # NSE var + if (flip_vertical) polys[, vertex_y := -vertex_y] + # create gpolys verbose <- verbose %null% FALSE gpolys <- createGiottoPolygon( @@ -935,7 +950,7 @@ importXenium <- function( # [directory input] -> load as individual .ome paths with defined names # intended for usage with single channel stain focus images - if (checkmate::check_directory_exists(path)) { + if (checkmate::test_directory_exists(path)) { if (missing(output_dir)) output_dir <- file.path(path, "tif_exports") # find actual image paths in directory ome_paths <- list.files(path, full.names = TRUE, pattern = ".ome") From 6e2b01f14a672bbca2470066d27c9ddddc794d47 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 09:05:42 -0400 Subject: [PATCH 13/33] fix xenium expression param passing --- R/convenience_xenium.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index b17304e39..e76f13a20 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -881,8 +881,8 @@ importXenium <- function( vmsg(.v = verbose, .is_debug = TRUE, path) verbose <- verbose %null% TRUE ex <- switch(e, - "mtx" = do.call(.xenium_cellmeta_csv, args = a), - "h5" = do.call(.xenium_cellmeta_parquet, args = a) + "mtx" = do.call(.xenium_expression_mtx, args = a), + "h5" = do.call(.xenium_expression_h5, args = a) ) eo <- createExprObj( From 2017d644bbf4ff22515db45cec257437f1f5c905 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 09:17:18 -0400 Subject: [PATCH 14/33] fix exprobj generation for xenium --- R/convenience_xenium.R | 25 ++++++++++++++++--------- R/general_help.R | 34 +++++++++++++++++----------------- man/writeChatGPTqueryDEG.Rd | 6 +++--- 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index e76f13a20..5d8119fc6 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -880,19 +880,26 @@ importXenium <- function( vmsg(.v = verbose, .is_debug = TRUE, "[EXPR_READ] FMT =", e) vmsg(.v = verbose, .is_debug = TRUE, path) verbose <- verbose %null% TRUE - ex <- switch(e, + ex_list <- switch(e, "mtx" = do.call(.xenium_expression_mtx, args = a), "h5" = do.call(.xenium_expression_h5, args = a) ) - eo <- createExprObj( - expression_data = ex, - name = "raw", - spat_unit = "cell", - feat_type = "rna", - provenance = "cell" - ) - return(eo) + # ensure list + if (!inherits(ex, "list")) ex_list <- list(ex_list) + + # lapply to process more than one if present + eo_list <- lapply(ex_list, function(ex) { + createExprObj( + expression_data = ex, + name = "raw", + spat_unit = "cell", + feat_type = "rna", + provenance = "cell" + ) + }) + + return(eo_list) } .xenium_expression_h5 <- function( diff --git a/R/general_help.R b/R/general_help.R index faef358d8..f6fe38ea8 100644 --- a/R/general_help.R +++ b/R/general_help.R @@ -276,8 +276,8 @@ rank_binarize_wrapper <- function( #' @title writeChatGPTqueryDEG #' @name writeChatGPTqueryDEG -#' @description This function writes a query as a .txt file that can be used with -#' ChatGPT or a similar LLM service to find the most likely cell types based on the +#' @description This function writes a query as a .txt file that can be used with +#' ChatGPT or a similar LLM service to find the most likely cell types based on the #' top differential expressed genes (DEGs) between identified clusters. #' @param DEG_output the output format from the differenetial expression functions #' @param top_n_genes number of genes for each cluster @@ -285,36 +285,36 @@ rank_binarize_wrapper <- function( #' @param folder_name path to the folder where you want to save the .txt file #' @param file_name name of .txt file #' @returns writes a .txt file to the desired location -#' @details This function does not run any LLM service. It simply creates the .txt +#' @details This function does not run any LLM service. It simply creates the .txt #' file that can then be used any LLM service (e.g. OpenAI, Gemini, ...) #' @export -writeChatGPTqueryDEG = function(DEG_output, - top_n_genes = 10, - tissue_type = 'human breast cancer', - folder_name = getwd(), +writeChatGPTqueryDEG = function(DEG_output, + top_n_genes = 10, + tissue_type = 'human breast cancer', + folder_name = getwd(), file_name = 'chatgpt_query.txt') { - + chatgpt_query = paste0("Identify cell types of ", tissue_type, " tissue using the following markers. Identify one cell type for each row. Only provide the cell type name and the marker genes used for cell type identification.") - + selected_DEG_output = DEG_output[, head(.SD, top_n_genes), by="cluster"] - + finallist = list() finallist[[1]] = chatgpt_query - + for(clus in unique(selected_DEG_output$cluster)) { x = selected_DEG_output[cluster == clus][['feats']] x = c(clus, x) finallist[[as.numeric(clus)+1]] = x } - + outputdt = data.table::data.table(finallist) - + cat('\n start writing \n') - data.table::fwrite(x = outputdt, + data.table::fwrite(x = outputdt, file = paste0(folder_name,'/', file_name), sep2 = c(""," ",""), col.names = F) - -} + +} @@ -691,7 +691,7 @@ get10Xmatrix_h5 <- function( ] # change names to gene symbols if it's expression - if (fclass == "Gene Expression" & gene_ids == "symbols") { + if (fclass == "Gene Expression" && gene_ids == "symbols") { conv_vector <- features_dt$uniq_name names(conv_vector) <- features_dt$id diff --git a/man/writeChatGPTqueryDEG.Rd b/man/writeChatGPTqueryDEG.Rd index 034f77fa4..8e1920852 100644 --- a/man/writeChatGPTqueryDEG.Rd +++ b/man/writeChatGPTqueryDEG.Rd @@ -27,11 +27,11 @@ writeChatGPTqueryDEG( writes a .txt file to the desired location } \description{ -This function writes a query as a .txt file that can be used with -ChatGPT or a similar LLM service to find the most likely cell types based on the +This function writes a query as a .txt file that can be used with +ChatGPT or a similar LLM service to find the most likely cell types based on the top differential expressed genes (DEGs) between identified clusters. } \details{ -This function does not run any LLM service. It simply creates the .txt +This function does not run any LLM service. It simply creates the .txt file that can then be used any LLM service (e.g. OpenAI, Gemini, ...) } From 60c2268cd6dc035826d54e6ee9f843bc2d8535f3 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Tue, 30 Jul 2024 09:19:22 -0400 Subject: [PATCH 15/33] fix typo --- R/convenience_xenium.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 5d8119fc6..ac205f22b 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -886,7 +886,7 @@ importXenium <- function( ) # ensure list - if (!inherits(ex, "list")) ex_list <- list(ex_list) + if (!inherits(ex_list, "list")) ex_list <- list(ex_list) # lapply to process more than one if present eo_list <- lapply(ex_list, function(ex) { From f20d692e5701f17a712dfae403617e9eef680ced Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 06:02:44 -0400 Subject: [PATCH 16/33] update xenium --- DESCRIPTION | 12 +-- R/convenience_xenium.R | 172 ++++++++++++++++++++++++++++++----------- 2 files changed, 128 insertions(+), 56 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ab1607bd7..2568a14c7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -60,7 +60,6 @@ Imports: Suggests: ArchR, arrow, - Biobase, biomaRt, ClusterR, clustree, @@ -69,14 +68,12 @@ Suggests: DelayedMatrixStats, dendextend (>= 1.13.0), dplyr, - exactextractr, FactoMineR, factoextra, fitdistrplus, FNN, future, future.apply, - geometry, GiottoData, ggalluvial, ggdendro, @@ -86,7 +83,6 @@ Suggests: graphcoloring, HDF5Array (>= 1.18.1), hdf5r, - htmlwidgets, jackstraw, kableExtra, knitr, @@ -95,7 +91,6 @@ Suggests: multinet (>= 3.0.2), networkD3, pheatmap, - png, quadprog, harmony, R.utils, @@ -105,10 +100,7 @@ Suggests: rhdf5, RTriangle (>= 1.6-0.10), Rvision, - S4Vectors, scater, - scatterpie, - scattermore, scran (>= 1.10.1), Seurat, sf, @@ -121,10 +113,8 @@ Suggests: STexampleData, SummarizedExperiment, tidygraph, - tiff, trendsceek, - testthat (>= 3.0.0), - qs + testthat (>= 3.0.0) Remotes: drieslab/GiottoUtils, drieslab/GiottoClass, diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index ac205f22b..584ef629d 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -1,4 +1,28 @@ +# modular reader functions layout # +# # +# - initialize method for reader object +# - filepath detection based on directory path +# - register modular load functions as tx_fun(), poly_fun, etc. in the object. +# include expected defaults that update based on filepath info +# - `create_gobject()` single function that utilizes other functions +# registered to the reader object in previous step. Returns gobject with +# desired data contents +# For params that `create_gobject()` is in charge of, the registered funs +# should use the params passed to `create_gobject()` instead of the baked +# in defaults +# +# # +# - exported function to create a `XYZReader` class object +# +# # +# - from `path` and other minimal args, create a giotto subobject with access +# to specific ways to load and manipulate data + + + + + # CLASS #### @@ -276,7 +300,7 @@ setMethod( # load image call img_fun <- function( - path, + path = img_focus_path, name = "image", micron = obj@micron, negative_y = TRUE, @@ -298,9 +322,9 @@ setMethod( # load aligned image call img_aff_fun <- function( - path = path, - micron = obj@micron, - imagealignment_path + path, + imagealignment_path, + micron = obj@micron ) { read10xAffineImage( file = path, @@ -313,29 +337,32 @@ setMethod( # create giotto object call gobject_fun <- function( - transcript_path = tx_path, - load_bounds = list( - cell = "cell", - nucleus = "nucleus" - ), - expression_path = expr_path, - metadata_path = meta_path, - feat_type = c( - "rna", - "NegControlProbe", - "UnassignedCodeword", - "NegControlCodeword" - ), - split_keyword = list( - "NegControlProbe", - "UnassignedCodeword", - "NegControlCodeword" - ), - load_images = list( - morphology = "focus", - ), - load_expression = FALSE, - load_cellmeta = FALSE + transcript_path = tx_path, + load_bounds = list( + cell = "cell", + nucleus = "nucleus" + ), + gene_panel_json_path = panel_meta_path, + expression_path = expr_path, + metadata_path = meta_path, + feat_type = c( + "rna", + "NegControlProbe", + "UnassignedCodeword", + "NegControlCodeword" + ), + split_keyword = list( + "NegControlProbe", + "UnassignedCodeword", + "NegControlCodeword" + ), + load_images = list( + morphology = "focus" + ), + load_aligned_images = NULL, + load_expression = FALSE, + load_cellmeta = FALSE, + verbose = NULL ) { load_expression <- as.logical(load_expression) load_cellmeta <- as.logical(load_cellmeta) @@ -343,18 +370,35 @@ setMethod( if (!is.null(load_images)) { checkmate::assert_list(load_images) if (is.null(names(load_images))) { - stop("Images paths provided to 'load_images' must be named") + stop("'load_images' must be a named list of filepaths\n") + } + } + if (!is.null(load_aligned_images)) { + checkmate::assert_list(load_aligned_images) + if (is.null(names(load_aligned_images))) { + stop(wrap_txt( + "'load_aligned_images' must be a named list" + )) + } + if (any(lengths(load_aligned_images) != 2L) || + any(!vapply(load_aligned_images, is.character, + FUN.VALUE = logical(1L)))) { + stop(wrap_txt( + "'load_aligned_images' must be character with length 2: + 1. image path + 2. alignment matrix path" + )) } } if (!is.null(load_bounds)) { checkmate::assert_list(load_bounds) if (is.null(names(load_bounds))) { - stop("bounds paths provided to 'load_bounds' must be named") + stop("'load_bounds' must be named list of filepaths\n") } } - + # place calls in new variable for easier access funs <- obj@calls # init gobject @@ -365,12 +409,10 @@ setMethod( tx_list <- funs$load_transcripts( path = transcript_path, feat_type = feat_type, - split_keyword = split_keyword + split_keyword = split_keyword, + verbose = verbose ) - for (tx in tx_list) { - g <- setGiotto(g, tx) - } - + g <- setGiotto(g, tx, verbose = FALSE) # lists are fine # polys if (!is.null(load_bounds)) { @@ -383,31 +425,46 @@ setMethod( for (b_i in seq_along(load_bounds)) { b <- funs$load_polys( path = load_bounds[[b_i]], - name = bnames[[b_i]] + name = bnames[[b_i]], + verbose = verbose ) blist <- c(blist, b) } - for (gpoly_i in seq_along(blist)) { - g <- setGiotto(g, blist[[gpoly_i]]) - } + g <- setGiotto(g, blist, verbose = FALSE) } # feat metadata fx <- funs$load_featmeta( - path = + path = gene_panel_json_path, + # ID = symbols makes sense with the subcellular feat_IDs + gene_ids = "symbols", + # no dropcols + verbose = verbose ) + g <- setGiotto(g, fx) # expression if (load_expression) { - + ex <- funs$load_expression( + path = expression_path, + gene_ids = "symbols", + remove_zero_rows = TRUE, + split_by_type = TRUE, + verbose = verbose + ) + g <- setGiotto(g, ex) } # cell metadata if (load_cellmeta) { - + cx <- funs$load_cellmeta( + path = metadata_path, + verbose = verbose + ) + g <- setGiotto(g, cx) } @@ -415,11 +472,36 @@ setMethod( if (!is.null(load_images)) { # replace convenient shortnames load_images[load_images == "focus"] <- img_focus_path - } + imglist <- list() + imnames <- names(load_images) + for (impath_i in seq_along(load_images)) { + im <- load_image( + path = load_images[[impath_i]], + name = imnames[[impath_i]] + ) + imglist <- c(imglist, im) + } + g <- setGiotto(g, imglist) + } + # aligned images can be placed in random places and do not have + # a standardized naming scheme. + if (!is.null(load_aligned_images)) { + aimglist <- list() + aimnames <- names(load_aligned_images) + for (aim_i in seq_along(load_aligned_images)) { + aim <- load_aligned_image( + path = load_aligned_images[[aim_i]][1], + imagealignment_path = load_aligned_images[[aim_i]][2] + ) + aimglist <- c(aimglist, aim) + } + g <- setGiotto(g, aimglist) + } + return(g) } obj@calls$create_gobject <- gobject_fun @@ -1023,6 +1105,7 @@ importXenium <- function( .xenium_image_single <- function( path, name = "image", + output_dir, micron, negative_y = TRUE, flip_vertical = FALSE, @@ -1059,8 +1142,7 @@ importXenium <- function( return(img) } -# for affine, see the init method - +# for aligned_image (affine), see the `XeniumReader` init method From 4aa44d76dd0fbb3dff3159d7d62e42a93e78a9e0 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 09:02:25 -0400 Subject: [PATCH 17/33] remove img dir reading --- R/convenience_xenium.R | 57 +++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 584ef629d..7db5c37e1 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -1023,7 +1023,7 @@ importXenium <- function( .xenium_image <- function( path, name, - output_dir, + # output_dir, micron, negative_y = TRUE, flip_vertical = FALSE, @@ -1033,36 +1033,36 @@ importXenium <- function( ) { if (missing(path)) { stop(wrap_txt( - "No path to image file or dir to load provided or auto-detected" + "No path to image file provided or auto-detected" ), call. = FALSE) } - # [directory input] -> load as individual .ome paths with defined names - # intended for usage with single channel stain focus images - if (checkmate::test_directory_exists(path)) { - if (missing(output_dir)) output_dir <- file.path(path, "tif_exports") - # find actual image paths in directory - ome_paths <- list.files(path, full.names = TRUE, pattern = ".ome") - # parse ome metadata for images names - ome_xml <- ometif_metadata( - ome_paths[[1]], node = "Channel", output = "data.frame" - ) - # update names with the channel names - name <- ome_xml$Name - - # do conversion if file does not already exist in output_dir - vmsg(.v = verbose, "> ometif to tif conversion") - lapply(ome_paths, function(ome) { - try(silent = TRUE, { # ignore fail when already written - ometif_to_tif( - # can pass overwrite = TRUE via ... if needed - ome, output_dir = output_dir, ... - ) - }) - }) - # update path param - path <- list.files(output_dir, pattern = ".tif", full.names = TRUE) - } + # # [directory input] -> load as individual .ome paths with defined names + # # intended for usage with single channel stain focus images + # if (checkmate::test_directory_exists(path)) { + # if (missing(output_dir)) output_dir <- file.path(path, "tif_exports") + # # find actual image paths in directory + # ome_paths <- list.files(path, full.names = TRUE, pattern = ".ome") + # # parse ome metadata for images names + # ome_xml <- ometif_metadata( + # ome_paths[[1]], node = "Channel", output = "data.frame" + # ) + # # update names with the channel names + # name <- ome_xml$Name + # + # # do conversion if file does not already exist in output_dir + # vmsg(.v = verbose, "> ometif to tif conversion") + # lapply(ome_paths, function(ome) { + # try(silent = TRUE, { # ignore fail when already written + # ometif_to_tif( + # # can pass overwrite = TRUE via ... if needed + # ome, output_dir = output_dir, ... + # ) + # }) + # }) + # # update path param + # path <- list.files(output_dir, pattern = ".tif", full.names = TRUE) + # } # set default if still missing if (missing(name)) name <- "image" @@ -1105,7 +1105,6 @@ importXenium <- function( .xenium_image_single <- function( path, name = "image", - output_dir, micron, negative_y = TRUE, flip_vertical = FALSE, From ce64c3c09e99a63b5c77ca88b4006a55850d2d43 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:09:27 -0400 Subject: [PATCH 18/33] add xenium wrapper --- R/convenience_xenium.R | 1668 +++++++++-------- man/createGiottoXeniumObject.Rd | 128 +- man/dot-createGiottoXeniumObject_aggregate.Rd | 34 - ...ot-createGiottoXeniumObject_subcellular.Rd | 42 - man/dot-read_xenium_folder.Rd | 42 - man/load_xenium_folder.Rd | 77 - 6 files changed, 979 insertions(+), 1012 deletions(-) delete mode 100644 man/dot-createGiottoXeniumObject_aggregate.Rd delete mode 100644 man/dot-createGiottoXeniumObject_subcellular.Rd delete mode 100644 man/dot-read_xenium_folder.Rd delete mode 100644 man/load_xenium_folder.Rd diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 7db5c37e1..25bf811c0 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -300,7 +300,7 @@ setMethod( # load image call img_fun <- function( - path = img_focus_path, + path, name = "image", micron = obj@micron, negative_y = TRUE, @@ -356,9 +356,7 @@ setMethod( "UnassignedCodeword", "NegControlCodeword" ), - load_images = list( - morphology = "focus" - ), + load_images = NULL, load_aligned_images = NULL, load_expression = FALSE, load_cellmeta = FALSE, @@ -1146,38 +1144,73 @@ importXenium <- function( - -# OLD #### - - +# wrapper #### #' @title Create 10x Xenium Giotto Object #' @name createGiottoXeniumObject -#' @description Given the path to a Xenium experiment output folder, creates a -#' Giotto object -#' @param xenium_dir full path to the exported xenium directory -#' @param data_to_use which type(s) of expression data to build the gobject with -#' (e.g. default: \strong{'subcellular'}, 'aggregate', or 'all') -#' @param load_format files formats from which to load the data. Either `csv` or -#' `parquet` currently supported. -#' @param h5_expression (boolean) whether to load cell_feature_matrix from .h5 -#' file. Default is \code{TRUE} -#' @param h5_gene_ids use gene symbols (default) or ensembl ids for the .h5 gene -#' expression matrix -#' @param bounds_to_load vector of boundary information to load -#' (e.g. \code{'cell'} -#' or \code{'nucleus'} by themselves or \code{c('cell', 'nucleus')} to load both -#' at the same time.) +#' @description Create a Giotto object from a Xenium experiment output folder. +#' Only the `xenium_dir`, `load_images`, and `load_aligned_images` params +#' need to be supplied when defaults are sufficient. All other params have +#' defaults set and are there in case of non-standard directory layouts or +#' alternative preference in file format to load from.\cr +#' When possible, `.parquet` files are loaded. This requires the additional +#' installation of \pkg{arrow} with zstd support. See details. `h5` is also +#' used by default if the 10x provided expression matrix is loaded.\cr +#' The 10X provided aggregated expression matrix and cell metdata are not +#' loaded by default since the results may be slightly different from those +#' that Giotto spatially aggregates. +#' @param xenium_dir Full path to the exported xenium directory +#' @param transcript_path Optional. Filepath to desired transcripts file to +#' load. Either the `.parquet` or `.csv` files can be used. +#' @param bounds_path Optional. Named list of filepaths to desired Xenium +#' bounds/polygon files to load. Either the `.parquet` or `.csv` files can be +#' used. The default is to load the `.parquets` of both cell and nucleus. +#' @param gene_panel_json_path Optional. Filepath to panel json. This json +#' contains feature metadata information and ENSG names. +#' @param expression_path Optional. Filepath to cell feature matrix. Accepts +#' either the `.h5` or the unzipped directory containing `.mtx` files. +#' @param metadata_path Optional. Filepath to `cells.csv.gz` or `cells.parquet` +#' which contain cell metadata information. +#' @param feat_type character. feature type. Provide more than one value if +#' using the `split_keyword` param. For each set of keywords to split by, an +#' additional feat_type should be provided in the same order. Affects how +#' the transcripts information is loaded. Helpful for separating out the +#' QC probes. See details. +#' @param split_keyword list of character vectors of keywords to split the +#' transcripts based on their feat_ID. Keywords will be `grepl()` +#' matched against the feature IDs information. See details. #' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included #' as a subcellular transcript detection (default = 20) -#' @param key_list (advanced) list of grep-based keywords to split the -#' subcellular feature detections by feature type. See details -#' @inheritParams get10Xmatrix -#' @inheritParams GiottoClass::createGiottoObjectSubcellular -#' @returns giotto object +#' @param load_images Named list of filepaths to `.tif` images, usually the +#' ones in the `morphology_focus` directory. These `ome.tif` images are not +#' compatible and must be converted to `tif` using +#' `[GiottoClass::ometif_to_tif()]`. +#' @param load_aligned_images Named list of filepaths. The list names are used +#' as the image names when loaded. Two filepaths are expected per entry. The +#' first one should be to the `.tif` image. The second path is to the `.csv` +#' alignment matrix file. `ome.tif` images will work, but they are currently +#' slower in our imaging pipeline. +#' @param load_expression logical. Default = FALSE. Whether to load in 10X +#' provided expression matrix. +#' @param load_cellmeta logical. Default = FALSE. Whether to laod in 10X +#' provided cell metadata information +#' @param verbose logical or NULL. NULL uses the `giotto.verbose` option +#' setting and defaults to TRUE. +#' @returns `giotto` object #' @details #' +#' [\strong{arrow zstd support}] +#' Xenium parquets have zstd compression. \pkg{arrow} is used to access +#' parquets, however it may not install on all systems with zstd by default. +#' You can check whether zstd support is installed by running: +#' `arrow::arrow_info()$capabilities[["zstd"]]`. If `FALSE`, it needs to be +#' reinstalled with the following: +#' \preformatted{ +#' Sys.setenv(ARROW_WITH_ZSTD = "ON") +#' install.packages("arrow", repos = c("https://apache.r-universe.dev")) +#' } +#' #' [\strong{QC feature types}] #' Xenium provides info on feature detections that include more than only the #' Gene Expression specific probes. Additional probes for QC are included: @@ -1186,761 +1219,862 @@ importXenium <- function( #' are treated as their own feature types so that they can largely remain #' independent of the gene expression information. #' -#' [\strong{key_list}] -#' Related to \code{data_to_use = 'subcellular'} workflow only: +#' [\strong{feat_type and split_keyword}] #' Additional QC probe information is in the subcellular feature detections #' information and must be separated from the gene expression information #' during processing. #' The QC probes have prefixes that allow them to be selected from the rest of #' the feature IDs. -#' Giotto uses a named list of keywords (\code{key_list}) to select these QC -#' probes, with the list names being the names that will be assigned as the -#' feature type of these feature detections. The default list is used when -#' \code{key_list} = NULL. -#' -#' Default list: -#' \preformatted{ -#' list(blank_code = 'BLANK_', -#' neg_code = 'NegControlCodeword_', -#' neg_probe = c('NegControlProbe_|antisense_')) -#' } +#' Giotto uses `feat_type` and `split_keyword` params to select these QC +#' probes out as separate feature types. See examples in +#' `[GiottoClass::createGiottoPoints]` for how this works. #' -#' The Gene expression subset is accepted as the subset of feat_IDs that do not -#' map to any of the keys. +#' The Gene expression subset labeled as `rna` is accepted as the subset of +#' feat_IDs that do not get matched to any of the `split_keywords`. #' #' @export createGiottoXeniumObject <- function( xenium_dir, - data_to_use = c("subcellular", "aggregate"), - load_format = "csv", - h5_expression = TRUE, - h5_gene_ids = c("symbols", "ensembl"), - gene_column_index = 1, - bounds_to_load = c("cell"), - qv_threshold = 20, - key_list = NULL, - instructions = NULL, - cores = NA, - verbose = TRUE -) { - # 0. setup - xenium_dir <- path.expand(xenium_dir) - - # Determine data to load - data_to_use <- match.arg( - arg = data_to_use, choices = c("subcellular", "aggregate")) - - # Determine load formats - load_format <- "csv" # TODO Remove this and add as param once other options - # are available - load_format <- match.arg( - arg = load_format, choices = c("csv", "parquet", "zarr")) - - # set number of cores automatically, but with limit of 10 - cores <- determine_cores(cores) - data.table::setDTthreads(threads = cores) - - # 1. detect xenium folder and find filepaths to load - - # path_list contents: - # tx_path - # bound_paths - # cell_meta_path - # agg_expr_path - # panel_meta_path - path_list <- .read_xenium_folder( - xenium_dir = xenium_dir, - data_to_use = data_to_use, - bounds_to_load = bounds_to_load, - load_format = load_format, - h5_expression = h5_expression, - verbose = verbose - ) - - - # 2. load in data - - # data_list contents: - # feat_meta - # tx_dt - # bound_dt_list - # cell_meta - # agg_expr - data_list <- .load_xenium_folder( - path_list = path_list, - load_format = load_format, - data_to_use = data_to_use, - h5_expression = h5_expression, - h5_gene_ids = h5_gene_ids, - gene_column_index = gene_column_index, - cores = cores, - verbose = verbose - ) - - - # TODO load images - - - # 3. Create giotto objects - - if (data_to_use == "subcellular") { - # ** feat type search keys ** - if (is.null(key_list)) { - key_list <- list( - blank_code = "BLANK_", - neg_code = "NegControlCodeword_", - neg_probe = c("NegControlProbe_|antisense_") - ) - } - - # needed: - # feat_meta - # tx_dt - # bound_dt_list - xenium_gobject <- .createGiottoXeniumObject_subcellular( - data_list = data_list, - qv_threshold = qv_threshold, - key_list = key_list, - instructions = instructions, - cores = cores, - verbose = verbose - ) - } - - if (data_to_use == "aggregate") { - # needed: - # feat_meta - # cell_meta - # agg_expr - # optional? - # tx_dt - # bound_dt_list - xenium_gobject <- .createGiottoXeniumObject_aggregate( - data_list = data_list, - instructions = instructions, - cores = cores, - verbose = verbose - ) - } - - return(xenium_gobject) -} - - - - -#' @title Create a Xenium Giotto object from subcellular info -#' @name .createGiottoXeniumObject_subcellular -#' @description Subcellular workflow for createGiottoXeniumObject -#' @param data_list list of data loaded by \code{\link{.load_xenium_folder}} -#' @param key_list regex-based search keys for feature IDs to allow separation -#' into separate giottoPoints objects by feat_type -#' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included -#' as a subcellular transcript detection (default = 20) -#' @inheritParams get10Xmatrix -#' @inheritParams GiottoClass::createGiottoObjectSubcellular -#' @returns giotto object -#' @seealso createGiottoXeniumObject .createGiottoXeniumObject_aggregate -#' @keywords internal -.createGiottoXeniumObject_subcellular <- function( - data_list, - key_list = NULL, + transcript_path = NULL, # optional + bounds_path = list( # looks for parquets by default + cell = "cell", + nucleus = "nucleus" + ), + gene_panel_json_path = NULL, + expression_path = NULL, # optional + metadata_path = NULL, + feat_type = c( + "rna", + "NegControlProbe", + "UnassignedCodeword", + "NegControlCodeword" + ), + split_keyword = list( + "NegControlProbe", + "UnassignedCodeword", + "NegControlCodeword" + ), qv_threshold = 20, - instructions = NULL, - cores = NA, - verbose = TRUE -) { - # data.table vars - qv <- NULL - - # Unpack data_list info - feat_meta <- data_list$feat_meta - tx_dt <- data_list$tx_dt - bound_dt_list <- data_list$bound_dt_list - - # define for data.table - cell_id <- feat_ID <- feature_name <- NULL - - vmsg("Building subcellular giotto object...", .v = verbose) - # Giotto points object - vmsg("> points data prep...", .v = verbose) - - # filter by qv_threshold - vmsg("> filtering feature detections for Phred score >= ", - qv_threshold, .v = verbose) - n_before <- tx_dt[, .N] - tx_dt_filtered <- tx_dt[qv >= qv_threshold] - n_after <- tx_dt_filtered[, .N] - - if (verbose) { - cat( - "Number of feature points removed: ", - n_before - n_after, - " out of ", n_before, "\n" - ) - } - - vmsg("> splitting detections by feat_type", .v = verbose) - # discover feat_IDs for each feat_type - all_IDs <- tx_dt_filtered[, unique(feat_ID)] - feat_types_IDs <- lapply( - key_list, function(x) all_IDs[grepl(pattern = x, all_IDs)]) - rna <- list("rna" = all_IDs[!all_IDs %in% unlist(feat_types_IDs)]) - feat_types_IDs <- append(rna, feat_types_IDs) - - # separate detections by feature type - points_list <- lapply( - feat_types_IDs, - function(types) { - tx_dt_filtered[feat_ID %in% types] - } - ) - - # Giotto polygons object - vmsg("> polygons data prep...", .v = verbose) - polys_list <- lapply( - bound_dt_list, - function(bound_type) { - bound_type[, cell_id := as.character(cell_id)] - } - ) - - xenium_gobject <- createGiottoObjectSubcellular( - gpoints = points_list, - gpolygons = polys_list, - instructions = instructions, - cores = cores, - verbose = verbose - ) - - # generate centroids - vmsg("Calculating polygon centroids...", .v = verbose) - xenium_gobject <- addSpatialCentroidLocations( - xenium_gobject, - poly_info = c(names(bound_dt_list)), - provenance = as.list(names(bound_dt_list)) - ) - - return(xenium_gobject) -} - - - - - -#' @title Create a Xenium Giotto object from aggregate info -#' @name .createGiottoXeniumObject_aggregate -#' @description Aggregate workflow for createGiottoXeniumObject -#' @param data_list list of data loaded by \code{.load_xenium_folder} -#' @inheritParams get10Xmatrix -#' @inheritParams GiottoClass::createGiottoObjectSubcellular -#' @returns giotto object -#' @seealso createGiottoXeniumObject .createGiottoXeniumObject_subcellular -#' @keywords internal -.createGiottoXeniumObject_aggregate <- function( - data_list, - # include_analysis = FALSE, - instructions = NULL, - cores = NA, - verbose = TRUE + load_images = NULL, + load_aligned_images = NULL, + load_expression = FALSE, + load_cellmeta = FALSE, + verbose = NULL ) { - # Unpack data_list info - feat_meta <- data_list$feat_meta - cell_meta <- data_list$cell_meta - agg_expr <- data_list$agg_expr - - # define for data.table - cell_ID <- x_centroid <- y_centroid <- NULL - - # clean up names for aggregate matrices - names(agg_expr) <- gsub(pattern = " ", replacement = "_", names(agg_expr)) - geneExpMat <- which(names(agg_expr) == "Gene_Expression") - names(agg_expr)[[geneExpMat]] <- "raw" - - # set cell_id as character - cell_meta <- cell_meta[, data.table::setnames(.SD, "cell_id", "cell_ID")] - cell_meta <- cell_meta[, cell_ID := as.character(cell_ID)] - - # set up spatial locations - agg_spatlocs <- cell_meta[, .(x_centroid, y_centroid, cell_ID)] - - # set up metadata - agg_meta <- cell_meta[, !c("x_centroid", "y_centroid")] - - vmsg("Building aggregate giotto object...", .v = verbose) - xenium_gobject <- createGiottoObject( - expression = agg_expr, - spatial_locs = agg_spatlocs, - instructions = instructions, - cores = cores, + x <- importXenium(xenium_dir) + # apply reader params + x$qv <- qv_threshold + + g <- x$create_gobject( + transcript_path = transcript_path, + load_bounds = bounds_path, + gene_panel_json_path = gene_panel_json_path, + expression_path = expression_path, + metadata_path = metadata_path, + feat_type = split_keyword, + split_keyword = split_keyword, + load_images = load_images, + load_aligned_images = load_aligned_images, + load_expression = load_expression, + load_cellmeta = load_cellmeta, verbose = verbose ) - # append aggregate metadata - xenium_gobject <- addCellMetadata( - gobject = xenium_gobject, - new_metadata = agg_meta, - by_column = TRUE, - column_cell_ID = "cell_ID" - ) - xenium_gobject <- addFeatMetadata( - gobject = xenium_gobject, - new_metadata = feat_meta, - by_column = TRUE, - column_feat_ID = "feat_ID" - ) - - return(xenium_gobject) -} - - - - -#' @title Read a structured xenium folder -#' @name .read_xenium_folder -#' @inheritParams createGiottoXeniumObject -#' @keywords internal -#' @returns path_list a list of xenium files discovered and their filepaths. NULL -#' values denote missing items -.read_xenium_folder <- function( - xenium_dir, - data_to_use = "subcellular", - bounds_to_load = c("cell"), - load_format = "csv", - h5_expression = FALSE, - verbose = TRUE -) { - # Check needed packages - if (load_format == "parquet") { - package_check(pkg_name = "arrow", repository = "CRAN") - package_check(pkg_name = "dplyr", repository = "CRAN") - } - if (isTRUE(h5_expression)) { - package_check(pkg_name = "hdf5r", repository = "CRAN") - } - - ch <- box_chars() - - - # 0. test if folder structure exists and is as expected - - - if (is.null(xenium_dir) | !dir.exists(xenium_dir)) - stop("The full path to a xenium directory must be given.") - vmsg("A structured Xenium directory will be used\n", .v = verbose) - - # find items (length = 1 if present, length = 0 if missing) - dir_items <- list( - `analysis info` = "*analysis*", - `boundary info` = "*bound*", - `cell feature matrix` = "*cell_feature_matrix*", - `cell metadata` = "*cells*", - `image info` = "*tif", - `panel metadata` = "*panel*", - `raw transcript info` = "*transcripts*", - `experiment info (.xenium)` = "*.xenium" - ) - - dir_items <- lapply( - dir_items, function(x) Sys.glob(paths = file.path(xenium_dir, x))) - dir_items_lengths <- lengths(dir_items) - - if (isTRUE(verbose)) { - message("Checking directory contents...") - for (item in names(dir_items)) { - # IF ITEM FOUND - - if (dir_items_lengths[[item]] > 0) { - message(ch$s, "> ", item, " found") - for (item_i in seq_along(dir_items[[item]])) { - # print found item names - subItem <- gsub(pattern = ".*/", replacement = "", - x = dir_items[[item]][[item_i]]) - message(ch$s, ch$s, ch$l, ch$h, ch$h, subItem) - } - } else { - # IF ITEM MISSING - # Based on workflow, determine if: - # necessary (error) - # optional (warning) - - if (data_to_use == "subcellular") { - # necessary items - if (item %in% c("boundary info", "raw transcript info")) - stop(item, " is missing") - # optional items - if (item %in% c( - "image info", "experiment info (.xenium)", - "panel metadata")) - warning(item, " is missing (optional)") - # items to ignore: analysis info, cell feature matrix, - # cell metadata - } else if (data_to_use == "aggregate") { - # necessary items - if (item %in% c("cell feature matrix", "cell metadata")) - stop(item, " is missing") - # optional items - if (item %in% c( - "image info", "experiment info (.xenium)", - "panel metadata", "analysis info")) - warning(item, " is missing (optional)") - # items to ignore: boundary info, raw transcript info - } - } - } - } - - - # 1. Select data to load - - - # **** transcript info **** - tx_path <- NULL - tx_path <- dir_items$`raw transcript info`[grepl( - pattern = load_format, dir_items$`raw transcript info`)] - # **** cell metadata **** - cell_meta_path <- NULL - cell_meta_path <- dir_items$`cell metadata`[grepl( - pattern = load_format, dir_items$`cell metadata`)] - - # **** boundary info **** - # Select bound load format - if (load_format != "zarr") { # No zarr available for boundary info - dir_items$`boundary info` <- dir_items$`boundary info`[grepl( - pattern = load_format, dir_items$`boundary info`)] - } else { - dir_items$`boundary info` <- dir_items$`boundary info`[grepl( - pattern = "csv", dir_items$`boundary info`)] - } - - # Organize bound paths by type of bound (bounds_to_load param) - bound_paths <- NULL - bound_names <- bounds_to_load - bounds_to_load <- as.list(bounds_to_load) - bound_paths <- lapply(bounds_to_load, function(x) dir_items$`boundary info`[ - grepl(pattern = x, dir_items$`boundary info`)]) - names(bound_paths) <- bound_names - - # **** aggregated expression info **** - agg_expr_path <- NULL - if (isTRUE(h5_expression)) { # h5 expression matrix loading is default - agg_expr_path <- dir_items$`cell feature matrix`[grepl( - pattern = "h5", dir_items$`cell feature matrix`)] - } else if (load_format == "zarr") { - agg_expr_path <- dir_items$`cell feature matrix`[grepl( - pattern = "zarr", dir_items$`cell feature matrix`)] - } else { # No parquet for aggregated expression - default to normal 10x loading - agg_expr_path <- dir_items$`cell feature matrix`[sapply( - dir_items$`cell feature matrix`, function(x) file_test(op = "-d", x))] - if (length(agg_expr_path) == 0) { - stop(wrap_txt( - "Expression matrix cannot be loaded.\n - Has cell_feature_matrix(.tar.gz) been unpacked into a - directory?" - )) - } - } - if (data_to_use == "aggregate") { - if (length(path_list$agg_expr_path) == 0) { - stop(wrap_txt( - "Aggregated expression not found.\n - Please confirm h5_expression and load_format params are correct" - )) - } - } - - # **** panel info **** - panel_meta_path <- NULL - panel_meta_path <- dir_items$`panel metadata` - - - vmsg("Directory check done", .v = verbose) - - path_list <- list( - "tx_path" = tx_path, - "bound_paths" = bound_paths, - "cell_meta_path" = cell_meta_path, - "agg_expr_path" = agg_expr_path, - "panel_meta_path" = panel_meta_path - ) - - return(path_list) -} - -#' @title Load xenium data from folder -#' @name load_xenium_folder -#' @param path_list list of full filepaths from .read_xenium_folder -#' @inheritParams createGiottoXeniumObject -#' @returns list of loaded in xenium data -NULL - -#' @rdname load_xenium_folder -#' @keywords internal -.load_xenium_folder <- function( - path_list, - load_format = "csv", - data_to_use = "subcellular", - h5_expression = "FALSE", - h5_gene_ids = "symbols", - gene_column_index = 1, - cores, - verbose = TRUE -) { - data_list <- switch(load_format, - "csv" = .load_xenium_folder_csv( - path_list = path_list, - data_to_use = data_to_use, - h5_expression = h5_expression, - h5_gene_ids = h5_gene_ids, - gene_column_index = gene_column_index, - cores = cores, - verbose = verbose - ), - "parquet" = .load_xenium_folder_parquet( - path_list = path_list, - data_to_use = data_to_use, - h5_expression = h5_expression, - h5_gene_ids = h5_gene_ids, - gene_column_index = gene_column_index, - cores = cores, - verbose = verbose - ), - "zarr" = stop("load_format zarr:\n Not yet implemented", call. = FALSE) - ) - - return(data_list) -} - - -#' @describeIn load_xenium_folder Load from csv files -#' @keywords internal -.load_xenium_folder_csv <- function( - path_list, - cores, - data_to_use = "subcellular", - h5_expression = FALSE, - h5_gene_ids = "symbols", - gene_column_index = 1, - verbose = TRUE -) { - # initialize return vars - feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL - - vmsg("Loading feature metadata...", .v = verbose) - # updated for pipeline v1.6 json format - fdata_path <- path_list$panel_meta_path[[1]] - fdata_ext <- GiottoUtils::file_extension(fdata_path) - if ("json" %in% fdata_ext) { - feat_meta <- .load_xenium_panel_json(path = fdata_path, - gene_ids = h5_gene_ids) - } else { - feat_meta <- data.table::fread(fdata_path, nThread = cores) - colnames(feat_meta)[[1]] <- "feat_ID" - } - - # **** subcellular info **** - if (data_to_use == "subcellular") { - # append missing QC probe info to feat_meta - if (isTRUE(h5_expression)) { - h5 <- hdf5r::H5File$new(path_list$agg_expr_path) - tryCatch({ - root <- names(h5) - feature_id <- h5[[paste0(root, "/features/id")]][] - feature_info <- h5[[paste0(root, "/features/feature_type")]][] - feature_names <- h5[[paste0(root, "/features/name")]][] - features_dt <- data.table::data.table( - "id" = feature_id, - "name" = feature_names, - "feature_type" = feature_info - ) - }, finally = { - h5$close_all() - }) - } else { - features_dt <- data.table::fread( - paste0(path_list$agg_expr_path, "/features.tsv.gz"), - header = FALSE - ) - } - colnames(features_dt) <- c("id", "feat_ID", "feat_class") - feat_meta <- merge( - features_dt[, c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") - - GiottoUtils::vmsg("Loading transcript level info...", .v = verbose) - tx_dt <- data.table::fread(path_list$tx_path[[1]], nThread = cores) - data.table::setnames( - x = tx_dt, - old = c("feature_name", "x_location", "y_location"), - new = c("feat_ID", "x", "y") - ) - - GiottoUtils::vmsg("Loading boundary info...", .v = verbose) - bound_dt_list <- lapply( - path_list$bound_paths, - function(x) data.table::fread(x[[1]], nThread = cores) - ) - } - - # **** aggregate info **** - GiottoUtils::vmsg("loading cell metadata...", .v = verbose) - cell_meta <- data.table::fread( - path_list$cell_meta_path[[1]], nThread = cores) - - if (data_to_use == "aggregate") { - GiottoUtils::vmsg("Loading aggregated expression...", .v = verbose) - if (isTRUE(h5_expression)) { - agg_expr <- get10Xmatrix_h5( - path_to_data = path_list$agg_expr_path, - gene_ids = h5_gene_ids, - remove_zero_rows = TRUE, - split_by_type = TRUE - ) - } else { - agg_expr <- get10Xmatrix( - path_to_data = path_list$agg_expr_path, - gene_column_index = gene_column_index, - remove_zero_rows = TRUE, - split_by_type = TRUE - ) - } - } - - data_list <- list( - "feat_meta" = feat_meta, - "tx_dt" = tx_dt, - "bound_dt_list" = bound_dt_list, - "cell_meta" = cell_meta, - "agg_expr" = agg_expr - ) - - return(data_list) + return(g) } - -#' @describeIn load_xenium_folder Load from parquet files -#' @keywords internal -.load_xenium_folder_parquet <- function( - path_list, - cores, - data_to_use = "subcellular", - h5_expression = FALSE, - h5_gene_ids = "symbols", - gene_column_index = 1, - verbose = TRUE -) { - # initialize return vars - feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL - # dplyr variable - cell_id <- NULL - - vmsg("Loading feature metadata...", .v = verbose) - # updated for pipeline v1.6 json format - fdata_path <- path_list$panel_meta_path[[1]] - fdata_ext <- GiottoUtils::file_extension(fdata_path) - if ("json" %in% fdata_ext) { - feat_meta <- .load_xenium_panel_json( - path = fdata_path, gene_ids = h5_gene_ids) - } else { - feat_meta <- data.table::fread(fdata_path, nThread = cores) - colnames(feat_meta)[[1]] <- "feat_ID" - } - - # **** subcellular info **** - if (data_to_use == "subcellular") { - # define for data.table - transcript_id <- feature_name <- NULL - - # append missing QC probe info to feat_meta - if (isTRUE(h5_expression)) { - h5 <- hdf5r::H5File$new(path_list$agg_expr_path) - tryCatch({ - root <- names(h5) - feature_id <- h5[[paste0(root, "/features/id")]][] - feature_info <- h5[[paste0(root, "/features/feature_type")]][] - feature_names <- h5[[paste0(root, "/features/name")]][] - features_dt <- data.table::data.table( - "id" = feature_id, - "name" = feature_names, - "feature_type" = feature_info - ) - }, finally = { - h5$close_all() - }) - } else { - features_dt <- arrow::read_tsv_arrow(paste0( - path_list$agg_expr_path, "/features.tsv.gz"), - col_names = FALSE - ) %>% - data.table::setDT() - } - colnames(features_dt) <- c("id", "feat_ID", "feat_class") - feat_meta <- merge(features_dt[ - , c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") - - vmsg("Loading transcript level info...", .v = verbose) - tx_dt <- arrow::read_parquet( - file = path_list$tx_path[[1]], - as_data_frame = FALSE - ) %>% - dplyr::mutate( - transcript_id = cast(transcript_id, arrow::string())) %>% - dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% - dplyr::mutate( - feature_name = cast(feature_name, arrow::string())) %>% - as.data.frame() %>% - data.table::setDT() - data.table::setnames( - x = tx_dt, - old = c("feature_name", "x_location", "y_location"), - new = c("feat_ID", "x", "y") - ) - vmsg("Loading boundary info...", .v = verbose) - bound_dt_list <- lapply(path_list$bound_paths, function(x) { - arrow::read_parquet(file = x[[1]], as_data_frame = FALSE) %>% - dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% - as.data.frame() %>% - data.table::setDT() - }) - } - # **** aggregate info **** - if (data_to_use == "aggregate") { - vmsg("Loading cell metadata...", .v = verbose) - cell_meta <- arrow::read_parquet( - file = path_list$cell_meta_path[[1]], - as_data_frame = FALSE - ) %>% - dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% - as.data.frame() %>% - data.table::setDT() - - # NOTE: no parquet for agg_expr. - vmsg("Loading aggregated expression...", .v = verbose) - if (isTRUE(h5_expression)) { - agg_expr <- get10Xmatrix_h5( - path_to_data = path_list$agg_expr_path, - gene_ids = h5_gene_ids, - remove_zero_rows = TRUE, - split_by_type = TRUE - ) - } else { - agg_expr <- get10Xmatrix( - path_to_data = path_list$agg_expr_path, - gene_column_index = gene_column_index, - remove_zero_rows = TRUE, - split_by_type = TRUE - ) - } - } - - data_list <- list( - "feat_meta" = feat_meta, - "tx_dt" = tx_dt, - "bound_dt_list" = bound_dt_list, - "cell_meta" = cell_meta, - "agg_expr" = agg_expr - ) - - return(data_list) -} +#' +#' #' @title Create 10x Xenium Giotto Object +#' #' @name createGiottoXeniumObject +#' #' @description Given the path to a Xenium experiment output folder, creates a +#' #' Giotto object +#' #' @param xenium_dir full path to the exported xenium directory +#' #' @param data_to_use which type(s) of expression data to build the gobject with +#' #' (e.g. default: \strong{'subcellular'}, 'aggregate', or 'all') +#' #' @param load_format files formats from which to load the data. Either `csv` or +#' #' `parquet` currently supported. +#' #' @param h5_expression (boolean) whether to load cell_feature_matrix from .h5 +#' #' file. Default is \code{TRUE} +#' #' @param h5_gene_ids use gene symbols (default) or ensembl ids for the .h5 gene +#' #' expression matrix +#' #' @param bounds_to_load vector of boundary information to load +#' #' (e.g. \code{'cell'} +#' #' or \code{'nucleus'} by themselves or \code{c('cell', 'nucleus')} to load both +#' #' at the same time.) +#' #' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included +#' #' as a subcellular transcript detection (default = 20) +#' #' @param key_list (advanced) list of grep-based keywords to split the +#' #' subcellular feature detections by feature type. See details +#' #' @inheritParams get10Xmatrix +#' #' @inheritParams GiottoClass::createGiottoObjectSubcellular +#' #' @returns giotto object +#' #' @details +#' #' +#' #' [\strong{QC feature types}] +#' #' Xenium provides info on feature detections that include more than only the +#' #' Gene Expression specific probes. Additional probes for QC are included: +#' #' \emph{blank codeword}, \emph{negative control codeword}, and +#' #' \emph{negative control probe}. These additional QC probes each occupy and +#' #' are treated as their own feature types so that they can largely remain +#' #' independent of the gene expression information. +#' #' +#' #' [\strong{key_list}] +#' #' Related to \code{data_to_use = 'subcellular'} workflow only: +#' #' Additional QC probe information is in the subcellular feature detections +#' #' information and must be separated from the gene expression information +#' #' during processing. +#' #' The QC probes have prefixes that allow them to be selected from the rest of +#' #' the feature IDs. +#' #' Giotto uses a named list of keywords (\code{key_list}) to select these QC +#' #' probes, with the list names being the names that will be assigned as the +#' #' feature type of these feature detections. The default list is used when +#' #' \code{key_list} = NULL. +#' #' +#' #' Default list: +#' #' \preformatted{ +#' #' list(blank_code = 'BLANK_', +#' #' neg_code = 'NegControlCodeword_', +#' #' neg_probe = c('NegControlProbe_|antisense_')) +#' #' } +#' #' +#' #' The Gene expression subset is accepted as the subset of feat_IDs that do not +#' #' map to any of the keys. +#' #' +#' #' @export +#' createGiottoXeniumObject <- function( +#' xenium_dir, +#' data_to_use = c("subcellular", "aggregate"), +#' load_format = "csv", +#' h5_expression = TRUE, +#' h5_gene_ids = c("symbols", "ensembl"), +#' gene_column_index = 1, +#' bounds_to_load = c("cell"), +#' qv_threshold = 20, +#' key_list = NULL, +#' instructions = NULL, +#' cores = NA, +#' verbose = TRUE +#' ) { +#' # 0. setup +#' xenium_dir <- path.expand(xenium_dir) +#' +#' # Determine data to load +#' data_to_use <- match.arg( +#' arg = data_to_use, choices = c("subcellular", "aggregate")) +#' +#' # Determine load formats +#' load_format <- "csv" # TODO Remove this and add as param once other options +#' # are available +#' load_format <- match.arg( +#' arg = load_format, choices = c("csv", "parquet", "zarr")) +#' +#' # set number of cores automatically, but with limit of 10 +#' cores <- determine_cores(cores) +#' data.table::setDTthreads(threads = cores) +#' +#' # 1. detect xenium folder and find filepaths to load +#' +#' # path_list contents: +#' # tx_path +#' # bound_paths +#' # cell_meta_path +#' # agg_expr_path +#' # panel_meta_path +#' path_list <- .read_xenium_folder( +#' xenium_dir = xenium_dir, +#' data_to_use = data_to_use, +#' bounds_to_load = bounds_to_load, +#' load_format = load_format, +#' h5_expression = h5_expression, +#' verbose = verbose +#' ) +#' +#' +#' # 2. load in data +#' +#' # data_list contents: +#' # feat_meta +#' # tx_dt +#' # bound_dt_list +#' # cell_meta +#' # agg_expr +#' data_list <- .load_xenium_folder( +#' path_list = path_list, +#' load_format = load_format, +#' data_to_use = data_to_use, +#' h5_expression = h5_expression, +#' h5_gene_ids = h5_gene_ids, +#' gene_column_index = gene_column_index, +#' cores = cores, +#' verbose = verbose +#' ) +#' +#' +#' # TODO load images +#' +#' +#' # 3. Create giotto objects +#' +#' if (data_to_use == "subcellular") { +#' # ** feat type search keys ** +#' if (is.null(key_list)) { +#' key_list <- list( +#' blank_code = "BLANK_", +#' neg_code = "NegControlCodeword_", +#' neg_probe = c("NegControlProbe_|antisense_") +#' ) +#' } +#' +#' # needed: +#' # feat_meta +#' # tx_dt +#' # bound_dt_list +#' xenium_gobject <- .createGiottoXeniumObject_subcellular( +#' data_list = data_list, +#' qv_threshold = qv_threshold, +#' key_list = key_list, +#' instructions = instructions, +#' cores = cores, +#' verbose = verbose +#' ) +#' } +#' +#' if (data_to_use == "aggregate") { +#' # needed: +#' # feat_meta +#' # cell_meta +#' # agg_expr +#' # optional? +#' # tx_dt +#' # bound_dt_list +#' xenium_gobject <- .createGiottoXeniumObject_aggregate( +#' data_list = data_list, +#' instructions = instructions, +#' cores = cores, +#' verbose = verbose +#' ) +#' } +#' +#' return(xenium_gobject) +#' } +#' +#' +#' +#' +#' #' @title Create a Xenium Giotto object from subcellular info +#' #' @name .createGiottoXeniumObject_subcellular +#' #' @description Subcellular workflow for createGiottoXeniumObject +#' #' @param data_list list of data loaded by \code{\link{.load_xenium_folder}} +#' #' @param key_list regex-based search keys for feature IDs to allow separation +#' #' into separate giottoPoints objects by feat_type +#' #' @param qv_threshold Minimum Phred-scaled quality score cutoff to be included +#' #' as a subcellular transcript detection (default = 20) +#' #' @inheritParams get10Xmatrix +#' #' @inheritParams GiottoClass::createGiottoObjectSubcellular +#' #' @returns giotto object +#' #' @seealso createGiottoXeniumObject .createGiottoXeniumObject_aggregate +#' #' @keywords internal +#' .createGiottoXeniumObject_subcellular <- function( +#' data_list, +#' key_list = NULL, +#' qv_threshold = 20, +#' instructions = NULL, +#' cores = NA, +#' verbose = TRUE +#' ) { +#' # data.table vars +#' qv <- NULL +#' +#' # Unpack data_list info +#' feat_meta <- data_list$feat_meta +#' tx_dt <- data_list$tx_dt +#' bound_dt_list <- data_list$bound_dt_list +#' +#' # define for data.table +#' cell_id <- feat_ID <- feature_name <- NULL +#' +#' vmsg("Building subcellular giotto object...", .v = verbose) +#' # Giotto points object +#' vmsg("> points data prep...", .v = verbose) +#' +#' # filter by qv_threshold +#' vmsg("> filtering feature detections for Phred score >= ", +#' qv_threshold, .v = verbose) +#' n_before <- tx_dt[, .N] +#' tx_dt_filtered <- tx_dt[qv >= qv_threshold] +#' n_after <- tx_dt_filtered[, .N] +#' +#' if (verbose) { +#' cat( +#' "Number of feature points removed: ", +#' n_before - n_after, +#' " out of ", n_before, "\n" +#' ) +#' } +#' +#' vmsg("> splitting detections by feat_type", .v = verbose) +#' # discover feat_IDs for each feat_type +#' all_IDs <- tx_dt_filtered[, unique(feat_ID)] +#' feat_types_IDs <- lapply( +#' key_list, function(x) all_IDs[grepl(pattern = x, all_IDs)]) +#' rna <- list("rna" = all_IDs[!all_IDs %in% unlist(feat_types_IDs)]) +#' feat_types_IDs <- append(rna, feat_types_IDs) +#' +#' # separate detections by feature type +#' points_list <- lapply( +#' feat_types_IDs, +#' function(types) { +#' tx_dt_filtered[feat_ID %in% types] +#' } +#' ) +#' +#' # Giotto polygons object +#' vmsg("> polygons data prep...", .v = verbose) +#' polys_list <- lapply( +#' bound_dt_list, +#' function(bound_type) { +#' bound_type[, cell_id := as.character(cell_id)] +#' } +#' ) +#' +#' xenium_gobject <- createGiottoObjectSubcellular( +#' gpoints = points_list, +#' gpolygons = polys_list, +#' instructions = instructions, +#' cores = cores, +#' verbose = verbose +#' ) +#' +#' # generate centroids +#' vmsg("Calculating polygon centroids...", .v = verbose) +#' xenium_gobject <- addSpatialCentroidLocations( +#' xenium_gobject, +#' poly_info = c(names(bound_dt_list)), +#' provenance = as.list(names(bound_dt_list)) +#' ) +#' +#' return(xenium_gobject) +#' } +#' +#' +#' +#' +#' +#' #' @title Create a Xenium Giotto object from aggregate info +#' #' @name .createGiottoXeniumObject_aggregate +#' #' @description Aggregate workflow for createGiottoXeniumObject +#' #' @param data_list list of data loaded by \code{.load_xenium_folder} +#' #' @inheritParams get10Xmatrix +#' #' @inheritParams GiottoClass::createGiottoObjectSubcellular +#' #' @returns giotto object +#' #' @seealso createGiottoXeniumObject .createGiottoXeniumObject_subcellular +#' #' @keywords internal +#' .createGiottoXeniumObject_aggregate <- function( +#' data_list, +#' # include_analysis = FALSE, +#' instructions = NULL, +#' cores = NA, +#' verbose = TRUE +#' ) { +#' # Unpack data_list info +#' feat_meta <- data_list$feat_meta +#' cell_meta <- data_list$cell_meta +#' agg_expr <- data_list$agg_expr +#' +#' # define for data.table +#' cell_ID <- x_centroid <- y_centroid <- NULL +#' +#' # clean up names for aggregate matrices +#' names(agg_expr) <- gsub(pattern = " ", replacement = "_", names(agg_expr)) +#' geneExpMat <- which(names(agg_expr) == "Gene_Expression") +#' names(agg_expr)[[geneExpMat]] <- "raw" +#' +#' # set cell_id as character +#' cell_meta <- cell_meta[, data.table::setnames(.SD, "cell_id", "cell_ID")] +#' cell_meta <- cell_meta[, cell_ID := as.character(cell_ID)] +#' +#' # set up spatial locations +#' agg_spatlocs <- cell_meta[, .(x_centroid, y_centroid, cell_ID)] +#' +#' # set up metadata +#' agg_meta <- cell_meta[, !c("x_centroid", "y_centroid")] +#' +#' vmsg("Building aggregate giotto object...", .v = verbose) +#' xenium_gobject <- createGiottoObject( +#' expression = agg_expr, +#' spatial_locs = agg_spatlocs, +#' instructions = instructions, +#' cores = cores, +#' verbose = verbose +#' ) +#' +#' # append aggregate metadata +#' xenium_gobject <- addCellMetadata( +#' gobject = xenium_gobject, +#' new_metadata = agg_meta, +#' by_column = TRUE, +#' column_cell_ID = "cell_ID" +#' ) +#' xenium_gobject <- addFeatMetadata( +#' gobject = xenium_gobject, +#' new_metadata = feat_meta, +#' by_column = TRUE, +#' column_feat_ID = "feat_ID" +#' ) +#' +#' return(xenium_gobject) +#' } +#' +#' +#' +#' +#' #' @title Read a structured xenium folder +#' #' @name .read_xenium_folder +#' #' @inheritParams createGiottoXeniumObject +#' #' @keywords internal +#' #' @returns path_list a list of xenium files discovered and their filepaths. NULL +#' #' values denote missing items +#' .read_xenium_folder <- function( +#' xenium_dir, +#' data_to_use = "subcellular", +#' bounds_to_load = c("cell"), +#' load_format = "csv", +#' h5_expression = FALSE, +#' verbose = TRUE +#' ) { +#' # Check needed packages +#' if (load_format == "parquet") { +#' package_check(pkg_name = "arrow", repository = "CRAN") +#' package_check(pkg_name = "dplyr", repository = "CRAN") +#' } +#' if (isTRUE(h5_expression)) { +#' package_check(pkg_name = "hdf5r", repository = "CRAN") +#' } +#' +#' ch <- box_chars() +#' +#' +#' # 0. test if folder structure exists and is as expected +#' +#' +#' if (is.null(xenium_dir) | !dir.exists(xenium_dir)) +#' stop("The full path to a xenium directory must be given.") +#' vmsg("A structured Xenium directory will be used\n", .v = verbose) +#' +#' # find items (length = 1 if present, length = 0 if missing) +#' dir_items <- list( +#' `analysis info` = "*analysis*", +#' `boundary info` = "*bound*", +#' `cell feature matrix` = "*cell_feature_matrix*", +#' `cell metadata` = "*cells*", +#' `image info` = "*tif", +#' `panel metadata` = "*panel*", +#' `raw transcript info` = "*transcripts*", +#' `experiment info (.xenium)` = "*.xenium" +#' ) +#' +#' dir_items <- lapply( +#' dir_items, function(x) Sys.glob(paths = file.path(xenium_dir, x))) +#' dir_items_lengths <- lengths(dir_items) +#' +#' if (isTRUE(verbose)) { +#' message("Checking directory contents...") +#' for (item in names(dir_items)) { +#' # IF ITEM FOUND +#' +#' if (dir_items_lengths[[item]] > 0) { +#' message(ch$s, "> ", item, " found") +#' for (item_i in seq_along(dir_items[[item]])) { +#' # print found item names +#' subItem <- gsub(pattern = ".*/", replacement = "", +#' x = dir_items[[item]][[item_i]]) +#' message(ch$s, ch$s, ch$l, ch$h, ch$h, subItem) +#' } +#' } else { +#' # IF ITEM MISSING +#' # Based on workflow, determine if: +#' # necessary (error) +#' # optional (warning) +#' +#' if (data_to_use == "subcellular") { +#' # necessary items +#' if (item %in% c("boundary info", "raw transcript info")) +#' stop(item, " is missing") +#' # optional items +#' if (item %in% c( +#' "image info", "experiment info (.xenium)", +#' "panel metadata")) +#' warning(item, " is missing (optional)") +#' # items to ignore: analysis info, cell feature matrix, +#' # cell metadata +#' } else if (data_to_use == "aggregate") { +#' # necessary items +#' if (item %in% c("cell feature matrix", "cell metadata")) +#' stop(item, " is missing") +#' # optional items +#' if (item %in% c( +#' "image info", "experiment info (.xenium)", +#' "panel metadata", "analysis info")) +#' warning(item, " is missing (optional)") +#' # items to ignore: boundary info, raw transcript info +#' } +#' } +#' } +#' } +#' +#' +#' # 1. Select data to load +#' +#' +#' # **** transcript info **** +#' tx_path <- NULL +#' tx_path <- dir_items$`raw transcript info`[grepl( +#' pattern = load_format, dir_items$`raw transcript info`)] +#' # **** cell metadata **** +#' cell_meta_path <- NULL +#' cell_meta_path <- dir_items$`cell metadata`[grepl( +#' pattern = load_format, dir_items$`cell metadata`)] +#' +#' # **** boundary info **** +#' # Select bound load format +#' if (load_format != "zarr") { # No zarr available for boundary info +#' dir_items$`boundary info` <- dir_items$`boundary info`[grepl( +#' pattern = load_format, dir_items$`boundary info`)] +#' } else { +#' dir_items$`boundary info` <- dir_items$`boundary info`[grepl( +#' pattern = "csv", dir_items$`boundary info`)] +#' } +#' +#' # Organize bound paths by type of bound (bounds_to_load param) +#' bound_paths <- NULL +#' bound_names <- bounds_to_load +#' bounds_to_load <- as.list(bounds_to_load) +#' bound_paths <- lapply(bounds_to_load, function(x) dir_items$`boundary info`[ +#' grepl(pattern = x, dir_items$`boundary info`)]) +#' names(bound_paths) <- bound_names +#' +#' # **** aggregated expression info **** +#' agg_expr_path <- NULL +#' if (isTRUE(h5_expression)) { # h5 expression matrix loading is default +#' agg_expr_path <- dir_items$`cell feature matrix`[grepl( +#' pattern = "h5", dir_items$`cell feature matrix`)] +#' } else if (load_format == "zarr") { +#' agg_expr_path <- dir_items$`cell feature matrix`[grepl( +#' pattern = "zarr", dir_items$`cell feature matrix`)] +#' } else { # No parquet for aggregated expression - default to normal 10x loading +#' agg_expr_path <- dir_items$`cell feature matrix`[sapply( +#' dir_items$`cell feature matrix`, function(x) file_test(op = "-d", x))] +#' if (length(agg_expr_path) == 0) { +#' stop(wrap_txt( +#' "Expression matrix cannot be loaded.\n +#' Has cell_feature_matrix(.tar.gz) been unpacked into a +#' directory?" +#' )) +#' } +#' } +#' if (data_to_use == "aggregate") { +#' if (length(path_list$agg_expr_path) == 0) { +#' stop(wrap_txt( +#' "Aggregated expression not found.\n +#' Please confirm h5_expression and load_format params are correct" +#' )) +#' } +#' } +#' +#' # **** panel info **** +#' panel_meta_path <- NULL +#' panel_meta_path <- dir_items$`panel metadata` +#' +#' +#' vmsg("Directory check done", .v = verbose) +#' +#' path_list <- list( +#' "tx_path" = tx_path, +#' "bound_paths" = bound_paths, +#' "cell_meta_path" = cell_meta_path, +#' "agg_expr_path" = agg_expr_path, +#' "panel_meta_path" = panel_meta_path +#' ) +#' +#' return(path_list) +#' } +#' +#' #' @title Load xenium data from folder +#' #' @name load_xenium_folder +#' #' @param path_list list of full filepaths from .read_xenium_folder +#' #' @inheritParams createGiottoXeniumObject +#' #' @returns list of loaded in xenium data +#' NULL +#' +#' #' @rdname load_xenium_folder +#' #' @keywords internal +#' .load_xenium_folder <- function( +#' path_list, +#' load_format = "csv", +#' data_to_use = "subcellular", +#' h5_expression = "FALSE", +#' h5_gene_ids = "symbols", +#' gene_column_index = 1, +#' cores, +#' verbose = TRUE +#' ) { +#' data_list <- switch(load_format, +#' "csv" = .load_xenium_folder_csv( +#' path_list = path_list, +#' data_to_use = data_to_use, +#' h5_expression = h5_expression, +#' h5_gene_ids = h5_gene_ids, +#' gene_column_index = gene_column_index, +#' cores = cores, +#' verbose = verbose +#' ), +#' "parquet" = .load_xenium_folder_parquet( +#' path_list = path_list, +#' data_to_use = data_to_use, +#' h5_expression = h5_expression, +#' h5_gene_ids = h5_gene_ids, +#' gene_column_index = gene_column_index, +#' cores = cores, +#' verbose = verbose +#' ), +#' "zarr" = stop("load_format zarr:\n Not yet implemented", call. = FALSE) +#' ) +#' +#' return(data_list) +#' } +#' +#' +#' #' @describeIn load_xenium_folder Load from csv files +#' #' @keywords internal +#' .load_xenium_folder_csv <- function( +#' path_list, +#' cores, +#' data_to_use = "subcellular", +#' h5_expression = FALSE, +#' h5_gene_ids = "symbols", +#' gene_column_index = 1, +#' verbose = TRUE +#' ) { +#' # initialize return vars +#' feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL +#' +#' vmsg("Loading feature metadata...", .v = verbose) +#' # updated for pipeline v1.6 json format +#' fdata_path <- path_list$panel_meta_path[[1]] +#' fdata_ext <- GiottoUtils::file_extension(fdata_path) +#' if ("json" %in% fdata_ext) { +#' feat_meta <- .load_xenium_panel_json(path = fdata_path, +#' gene_ids = h5_gene_ids) +#' } else { +#' feat_meta <- data.table::fread(fdata_path, nThread = cores) +#' colnames(feat_meta)[[1]] <- "feat_ID" +#' } +#' +#' # **** subcellular info **** +#' if (data_to_use == "subcellular") { +#' # append missing QC probe info to feat_meta +#' if (isTRUE(h5_expression)) { +#' h5 <- hdf5r::H5File$new(path_list$agg_expr_path) +#' tryCatch({ +#' root <- names(h5) +#' feature_id <- h5[[paste0(root, "/features/id")]][] +#' feature_info <- h5[[paste0(root, "/features/feature_type")]][] +#' feature_names <- h5[[paste0(root, "/features/name")]][] +#' features_dt <- data.table::data.table( +#' "id" = feature_id, +#' "name" = feature_names, +#' "feature_type" = feature_info +#' ) +#' }, finally = { +#' h5$close_all() +#' }) +#' } else { +#' features_dt <- data.table::fread( +#' paste0(path_list$agg_expr_path, "/features.tsv.gz"), +#' header = FALSE +#' ) +#' } +#' colnames(features_dt) <- c("id", "feat_ID", "feat_class") +#' feat_meta <- merge( +#' features_dt[, c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") +#' +#' GiottoUtils::vmsg("Loading transcript level info...", .v = verbose) +#' tx_dt <- data.table::fread(path_list$tx_path[[1]], nThread = cores) +#' data.table::setnames( +#' x = tx_dt, +#' old = c("feature_name", "x_location", "y_location"), +#' new = c("feat_ID", "x", "y") +#' ) +#' +#' GiottoUtils::vmsg("Loading boundary info...", .v = verbose) +#' bound_dt_list <- lapply( +#' path_list$bound_paths, +#' function(x) data.table::fread(x[[1]], nThread = cores) +#' ) +#' } +#' +#' # **** aggregate info **** +#' GiottoUtils::vmsg("loading cell metadata...", .v = verbose) +#' cell_meta <- data.table::fread( +#' path_list$cell_meta_path[[1]], nThread = cores) +#' +#' if (data_to_use == "aggregate") { +#' GiottoUtils::vmsg("Loading aggregated expression...", .v = verbose) +#' if (isTRUE(h5_expression)) { +#' agg_expr <- get10Xmatrix_h5( +#' path_to_data = path_list$agg_expr_path, +#' gene_ids = h5_gene_ids, +#' remove_zero_rows = TRUE, +#' split_by_type = TRUE +#' ) +#' } else { +#' agg_expr <- get10Xmatrix( +#' path_to_data = path_list$agg_expr_path, +#' gene_column_index = gene_column_index, +#' remove_zero_rows = TRUE, +#' split_by_type = TRUE +#' ) +#' } +#' } +#' +#' data_list <- list( +#' "feat_meta" = feat_meta, +#' "tx_dt" = tx_dt, +#' "bound_dt_list" = bound_dt_list, +#' "cell_meta" = cell_meta, +#' "agg_expr" = agg_expr +#' ) +#' +#' return(data_list) +#' } +#' +#' +#' +#' +#' #' @describeIn load_xenium_folder Load from parquet files +#' #' @keywords internal +#' .load_xenium_folder_parquet <- function( +#' path_list, +#' cores, +#' data_to_use = "subcellular", +#' h5_expression = FALSE, +#' h5_gene_ids = "symbols", +#' gene_column_index = 1, +#' verbose = TRUE +#' ) { +#' # initialize return vars +#' feat_meta <- tx_dt <- bound_dt_list <- cell_meta <- agg_expr <- NULL +#' # dplyr variable +#' cell_id <- NULL +#' +#' vmsg("Loading feature metadata...", .v = verbose) +#' # updated for pipeline v1.6 json format +#' fdata_path <- path_list$panel_meta_path[[1]] +#' fdata_ext <- GiottoUtils::file_extension(fdata_path) +#' if ("json" %in% fdata_ext) { +#' feat_meta <- .load_xenium_panel_json( +#' path = fdata_path, gene_ids = h5_gene_ids) +#' } else { +#' feat_meta <- data.table::fread(fdata_path, nThread = cores) +#' colnames(feat_meta)[[1]] <- "feat_ID" +#' } +#' +#' # **** subcellular info **** +#' if (data_to_use == "subcellular") { +#' # define for data.table +#' transcript_id <- feature_name <- NULL +#' +#' # append missing QC probe info to feat_meta +#' if (isTRUE(h5_expression)) { +#' h5 <- hdf5r::H5File$new(path_list$agg_expr_path) +#' tryCatch({ +#' root <- names(h5) +#' feature_id <- h5[[paste0(root, "/features/id")]][] +#' feature_info <- h5[[paste0(root, "/features/feature_type")]][] +#' feature_names <- h5[[paste0(root, "/features/name")]][] +#' features_dt <- data.table::data.table( +#' "id" = feature_id, +#' "name" = feature_names, +#' "feature_type" = feature_info +#' ) +#' }, finally = { +#' h5$close_all() +#' }) +#' } else { +#' features_dt <- arrow::read_tsv_arrow(paste0( +#' path_list$agg_expr_path, "/features.tsv.gz"), +#' col_names = FALSE +#' ) %>% +#' data.table::setDT() +#' } +#' colnames(features_dt) <- c("id", "feat_ID", "feat_class") +#' feat_meta <- merge(features_dt[ +#' , c(2, 3)], feat_meta, all.x = TRUE, by = "feat_ID") +#' +#' vmsg("Loading transcript level info...", .v = verbose) +#' tx_dt <- arrow::read_parquet( +#' file = path_list$tx_path[[1]], +#' as_data_frame = FALSE +#' ) %>% +#' dplyr::mutate( +#' transcript_id = cast(transcript_id, arrow::string())) %>% +#' dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% +#' dplyr::mutate( +#' feature_name = cast(feature_name, arrow::string())) %>% +#' as.data.frame() %>% +#' data.table::setDT() +#' data.table::setnames( +#' x = tx_dt, +#' old = c("feature_name", "x_location", "y_location"), +#' new = c("feat_ID", "x", "y") +#' ) +#' vmsg("Loading boundary info...", .v = verbose) +#' bound_dt_list <- lapply(path_list$bound_paths, function(x) { +#' arrow::read_parquet(file = x[[1]], as_data_frame = FALSE) %>% +#' dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% +#' as.data.frame() %>% +#' data.table::setDT() +#' }) +#' } +#' # **** aggregate info **** +#' if (data_to_use == "aggregate") { +#' vmsg("Loading cell metadata...", .v = verbose) +#' cell_meta <- arrow::read_parquet( +#' file = path_list$cell_meta_path[[1]], +#' as_data_frame = FALSE +#' ) %>% +#' dplyr::mutate(cell_id = cast(cell_id, arrow::string())) %>% +#' as.data.frame() %>% +#' data.table::setDT() +#' +#' # NOTE: no parquet for agg_expr. +#' vmsg("Loading aggregated expression...", .v = verbose) +#' if (isTRUE(h5_expression)) { +#' agg_expr <- get10Xmatrix_h5( +#' path_to_data = path_list$agg_expr_path, +#' gene_ids = h5_gene_ids, +#' remove_zero_rows = TRUE, +#' split_by_type = TRUE +#' ) +#' } else { +#' agg_expr <- get10Xmatrix( +#' path_to_data = path_list$agg_expr_path, +#' gene_column_index = gene_column_index, +#' remove_zero_rows = TRUE, +#' split_by_type = TRUE +#' ) +#' } +#' } +#' +#' data_list <- list( +#' "feat_meta" = feat_meta, +#' "tx_dt" = tx_dt, +#' "bound_dt_list" = bound_dt_list, +#' "cell_meta" = cell_meta, +#' "agg_expr" = agg_expr +#' ) +#' +#' return(data_list) +#' } diff --git a/man/createGiottoXeniumObject.Rd b/man/createGiottoXeniumObject.Rd index 41be6c19e..edcc625f3 100644 --- a/man/createGiottoXeniumObject.Rd +++ b/man/createGiottoXeniumObject.Rd @@ -6,64 +6,101 @@ \usage{ createGiottoXeniumObject( xenium_dir, - data_to_use = c("subcellular", "aggregate"), - load_format = "csv", - h5_expression = TRUE, - h5_gene_ids = c("symbols", "ensembl"), - gene_column_index = 1, - bounds_to_load = c("cell"), + transcript_path = NULL, + bounds_path = list(cell = "cell", nucleus = "nucleus"), + gene_panel_json_path = NULL, + expression_path = NULL, + metadata_path = NULL, + feat_type = c("rna", "NegControlProbe", "UnassignedCodeword", "NegControlCodeword"), + split_keyword = list("NegControlProbe", "UnassignedCodeword", "NegControlCodeword"), qv_threshold = 20, - key_list = NULL, - instructions = NULL, - cores = NA, - verbose = TRUE + load_images = NULL, + load_aligned_images = NULL, + load_expression = FALSE, + load_cellmeta = FALSE, + verbose = NULL ) } \arguments{ -\item{xenium_dir}{full path to the exported xenium directory} +\item{xenium_dir}{Full path to the exported xenium directory} -\item{data_to_use}{which type(s) of expression data to build the gobject with -(e.g. default: \strong{'subcellular'}, 'aggregate', or 'all')} +\item{transcript_path}{Optional. Filepath to desired transcripts file to +load. Either the `.parquet` or `.csv` files can be used.} -\item{load_format}{files formats from which to load the data. Either `csv` or -`parquet` currently supported.} +\item{bounds_path}{Optional. Named list of filepaths to desired Xenium +bounds/polygon files to load. Either the `.parquet` or `.csv` files can be +used. The default is to load the `.parquets` of both cell and nucleus.} -\item{h5_expression}{(boolean) whether to load cell_feature_matrix from .h5 -file. Default is \code{TRUE}} +\item{gene_panel_json_path}{Optional. Filepath to panel json. This json +contains feature metadata information and ENSG names.} -\item{h5_gene_ids}{use gene symbols (default) or ensembl ids for the .h5 gene -expression matrix} +\item{expression_path}{Optional. Filepath to cell feature matrix. Accepts +either the `.h5` or the unzipped directory containing `.mtx` files.} -\item{gene_column_index}{which column from the features or genes .tsv file -to use for row ids} +\item{metadata_path}{Optional. Filepath to `cells.csv.gz` or `cells.parquet` +which contain cell metadata information.} -\item{bounds_to_load}{vector of boundary information to load -(e.g. \code{'cell'} -or \code{'nucleus'} by themselves or \code{c('cell', 'nucleus')} to load both -at the same time.)} +\item{feat_type}{character. feature type. Provide more than one value if +using the `split_keyword` param. For each set of keywords to split by, an +additional feat_type should be provided in the same order. Affects how +the transcripts information is loaded. Helpful for separating out the +QC probes. See details.} + +\item{split_keyword}{list of character vectors of keywords to split the +transcripts based on their feat_ID. Keywords will be `grepl()` +matched against the feature IDs information. See details.} \item{qv_threshold}{Minimum Phred-scaled quality score cutoff to be included as a subcellular transcript detection (default = 20)} -\item{key_list}{(advanced) list of grep-based keywords to split the -subcellular feature detections by feature type. See details} +\item{load_images}{Named list of filepaths to `.tif` images, usually the +ones in the `morphology_focus` directory. These `ome.tif` images are not +compatible and must be converted to `tif` using +`[GiottoClass::ometif_to_tif()]`.} + +\item{load_aligned_images}{Named list of filepaths. The list names are used +as the image names when loaded. Two filepaths are expected per entry. The +first one should be to the `.tif` image. The second path is to the `.csv` +alignment matrix file. `ome.tif` images will work, but they are currently +slower in our imaging pipeline.} -\item{instructions}{list of instructions or output result -from \code{\link[GiottoClass]{createGiottoInstructions}}} +\item{load_expression}{logical. Default = FALSE. Whether to load in 10X +provided expression matrix.} -\item{cores}{how many cores or threads to use to read data if paths are -provided} +\item{load_cellmeta}{logical. Default = FALSE. Whether to laod in 10X +provided cell metadata information} -\item{verbose}{be verbose when building Giotto object} +\item{verbose}{logical or NULL. NULL uses the `giotto.verbose` option +setting and defaults to TRUE.} } \value{ -giotto object +`giotto` object } \description{ -Given the path to a Xenium experiment output folder, creates a -Giotto object +Create a Giotto object from a Xenium experiment output folder. +Only the `xenium_dir`, `load_images`, and `load_aligned_images` params +need to be supplied when defaults are sufficient. All other params have +defaults set and are there in case of non-standard directory layouts or +alternative preference in file format to load from.\cr +When possible, `.parquet` files are loaded. This requires the additional +installation of \pkg{arrow} with zstd support. See details. `h5` is also +used by default if the 10x provided expression matrix is loaded.\cr +The 10X provided aggregated expression matrix and cell metdata are not +loaded by default since the results may be slightly different from those +that Giotto spatially aggregates. } \details{ +[\strong{arrow zstd support}] +Xenium parquets have zstd compression. \pkg{arrow} is used to access +parquets, however it may not install on all systems with zstd by default. +You can check whether zstd support is installed by running: +`arrow::arrow_info()$capabilities[["zstd"]]`. If `FALSE`, it needs to be +reinstalled with the following: +\preformatted{ + Sys.setenv(ARROW_WITH_ZSTD = "ON") + install.packages("arrow", repos = c("https://apache.r-universe.dev")) +} + [\strong{QC feature types}] Xenium provides info on feature detections that include more than only the Gene Expression specific probes. Additional probes for QC are included: @@ -72,25 +109,16 @@ Gene Expression specific probes. Additional probes for QC are included: are treated as their own feature types so that they can largely remain independent of the gene expression information. -[\strong{key_list}] -Related to \code{data_to_use = 'subcellular'} workflow only: +[\strong{feat_type and split_keyword}] Additional QC probe information is in the subcellular feature detections information and must be separated from the gene expression information during processing. The QC probes have prefixes that allow them to be selected from the rest of the feature IDs. -Giotto uses a named list of keywords (\code{key_list}) to select these QC -probes, with the list names being the names that will be assigned as the -feature type of these feature detections. The default list is used when -\code{key_list} = NULL. - -Default list: -\preformatted{ - list(blank_code = 'BLANK_', - neg_code = 'NegControlCodeword_', - neg_probe = c('NegControlProbe_|antisense_')) -} +Giotto uses `feat_type` and `split_keyword` params to select these QC +probes out as separate feature types. See examples in +`[GiottoClass::createGiottoPoints]` for how this works. -The Gene expression subset is accepted as the subset of feat_IDs that do not -map to any of the keys. +The Gene expression subset labeled as `rna` is accepted as the subset of +feat_IDs that do not get matched to any of the `split_keywords`. } diff --git a/man/dot-createGiottoXeniumObject_aggregate.Rd b/man/dot-createGiottoXeniumObject_aggregate.Rd deleted file mode 100644 index 5baa80496..000000000 --- a/man/dot-createGiottoXeniumObject_aggregate.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convenience_xenium.R -\name{.createGiottoXeniumObject_aggregate} -\alias{.createGiottoXeniumObject_aggregate} -\title{Create a Xenium Giotto object from aggregate info} -\usage{ -.createGiottoXeniumObject_aggregate( - data_list, - instructions = NULL, - cores = NA, - verbose = TRUE -) -} -\arguments{ -\item{data_list}{list of data loaded by \code{.load_xenium_folder}} - -\item{instructions}{list of instructions or output result -from \code{\link[GiottoClass]{createGiottoInstructions}}} - -\item{cores}{how many cores or threads to use to read data if paths are -provided} - -\item{verbose}{be verbose when building Giotto object} -} -\value{ -giotto object -} -\description{ -Aggregate workflow for createGiottoXeniumObject -} -\seealso{ -createGiottoXeniumObject .createGiottoXeniumObject_subcellular -} -\keyword{internal} diff --git a/man/dot-createGiottoXeniumObject_subcellular.Rd b/man/dot-createGiottoXeniumObject_subcellular.Rd deleted file mode 100644 index b7e564a92..000000000 --- a/man/dot-createGiottoXeniumObject_subcellular.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convenience_xenium.R -\name{.createGiottoXeniumObject_subcellular} -\alias{.createGiottoXeniumObject_subcellular} -\title{Create a Xenium Giotto object from subcellular info} -\usage{ -.createGiottoXeniumObject_subcellular( - data_list, - key_list = NULL, - qv_threshold = 20, - instructions = NULL, - cores = NA, - verbose = TRUE -) -} -\arguments{ -\item{data_list}{list of data loaded by \code{\link{.load_xenium_folder}}} - -\item{key_list}{regex-based search keys for feature IDs to allow separation -into separate giottoPoints objects by feat_type} - -\item{qv_threshold}{Minimum Phred-scaled quality score cutoff to be included -as a subcellular transcript detection (default = 20)} - -\item{instructions}{list of instructions or output result -from \code{\link[GiottoClass]{createGiottoInstructions}}} - -\item{cores}{how many cores or threads to use to read data if paths are -provided} - -\item{verbose}{be verbose when building Giotto object} -} -\value{ -giotto object -} -\description{ -Subcellular workflow for createGiottoXeniumObject -} -\seealso{ -createGiottoXeniumObject .createGiottoXeniumObject_aggregate -} -\keyword{internal} diff --git a/man/dot-read_xenium_folder.Rd b/man/dot-read_xenium_folder.Rd deleted file mode 100644 index f0e5dfda3..000000000 --- a/man/dot-read_xenium_folder.Rd +++ /dev/null @@ -1,42 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convenience_xenium.R -\name{.read_xenium_folder} -\alias{.read_xenium_folder} -\title{Read a structured xenium folder} -\usage{ -.read_xenium_folder( - xenium_dir, - data_to_use = "subcellular", - bounds_to_load = c("cell"), - load_format = "csv", - h5_expression = FALSE, - verbose = TRUE -) -} -\arguments{ -\item{xenium_dir}{full path to the exported xenium directory} - -\item{data_to_use}{which type(s) of expression data to build the gobject with -(e.g. default: \strong{'subcellular'}, 'aggregate', or 'all')} - -\item{bounds_to_load}{vector of boundary information to load -(e.g. \code{'cell'} -or \code{'nucleus'} by themselves or \code{c('cell', 'nucleus')} to load both -at the same time.)} - -\item{load_format}{files formats from which to load the data. Either `csv` or -`parquet` currently supported.} - -\item{h5_expression}{(boolean) whether to load cell_feature_matrix from .h5 -file. Default is \code{TRUE}} - -\item{verbose}{be verbose when building Giotto object} -} -\value{ -path_list a list of xenium files discovered and their filepaths. NULL -values denote missing items -} -\description{ -Read a structured xenium folder -} -\keyword{internal} diff --git a/man/load_xenium_folder.Rd b/man/load_xenium_folder.Rd deleted file mode 100644 index fb2cd8951..000000000 --- a/man/load_xenium_folder.Rd +++ /dev/null @@ -1,77 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/convenience_xenium.R -\name{load_xenium_folder} -\alias{load_xenium_folder} -\alias{.load_xenium_folder} -\alias{.load_xenium_folder_csv} -\alias{.load_xenium_folder_parquet} -\title{Load xenium data from folder} -\usage{ -.load_xenium_folder( - path_list, - load_format = "csv", - data_to_use = "subcellular", - h5_expression = "FALSE", - h5_gene_ids = "symbols", - gene_column_index = 1, - cores, - verbose = TRUE -) - -.load_xenium_folder_csv( - path_list, - cores, - data_to_use = "subcellular", - h5_expression = FALSE, - h5_gene_ids = "symbols", - gene_column_index = 1, - verbose = TRUE -) - -.load_xenium_folder_parquet( - path_list, - cores, - data_to_use = "subcellular", - h5_expression = FALSE, - h5_gene_ids = "symbols", - gene_column_index = 1, - verbose = TRUE -) -} -\arguments{ -\item{path_list}{list of full filepaths from .read_xenium_folder} - -\item{load_format}{files formats from which to load the data. Either `csv` or -`parquet` currently supported.} - -\item{data_to_use}{which type(s) of expression data to build the gobject with -(e.g. default: \strong{'subcellular'}, 'aggregate', or 'all')} - -\item{h5_expression}{(boolean) whether to load cell_feature_matrix from .h5 -file. Default is \code{TRUE}} - -\item{h5_gene_ids}{use gene symbols (default) or ensembl ids for the .h5 gene -expression matrix} - -\item{gene_column_index}{which column from the features or genes .tsv file -to use for row ids} - -\item{cores}{how many cores or threads to use to read data if paths are -provided} - -\item{verbose}{be verbose when building Giotto object} -} -\value{ -list of loaded in xenium data -} -\description{ -Load xenium data from folder -} -\section{Functions}{ -\itemize{ -\item \code{.load_xenium_folder_csv()}: Load from csv files - -\item \code{.load_xenium_folder_parquet()}: Load from parquet files - -}} -\keyword{internal} From 5bf13dad05c3d1ace4eeebb94ca54c02009d72e4 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:11:44 -0400 Subject: [PATCH 19/33] chore: use @md tag --- R/convenience_xenium.R | 1 + man/createGiottoXeniumObject.Rd | 52 ++++++++++++++++----------------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 25bf811c0..ebee56c0d 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -1232,6 +1232,7 @@ importXenium <- function( #' The Gene expression subset labeled as `rna` is accepted as the subset of #' feat_IDs that do not get matched to any of the `split_keywords`. #' +#' @md #' @export createGiottoXeniumObject <- function( xenium_dir, diff --git a/man/createGiottoXeniumObject.Rd b/man/createGiottoXeniumObject.Rd index edcc625f3..14a2fb4d4 100644 --- a/man/createGiottoXeniumObject.Rd +++ b/man/createGiottoXeniumObject.Rd @@ -25,43 +25,43 @@ createGiottoXeniumObject( \item{xenium_dir}{Full path to the exported xenium directory} \item{transcript_path}{Optional. Filepath to desired transcripts file to -load. Either the `.parquet` or `.csv` files can be used.} +load. Either the \code{.parquet} or \code{.csv} files can be used.} \item{bounds_path}{Optional. Named list of filepaths to desired Xenium -bounds/polygon files to load. Either the `.parquet` or `.csv` files can be -used. The default is to load the `.parquets` of both cell and nucleus.} +bounds/polygon files to load. Either the \code{.parquet} or \code{.csv} files can be +used. The default is to load the \code{.parquets} of both cell and nucleus.} \item{gene_panel_json_path}{Optional. Filepath to panel json. This json contains feature metadata information and ENSG names.} \item{expression_path}{Optional. Filepath to cell feature matrix. Accepts -either the `.h5` or the unzipped directory containing `.mtx` files.} +either the \code{.h5} or the unzipped directory containing \code{.mtx} files.} -\item{metadata_path}{Optional. Filepath to `cells.csv.gz` or `cells.parquet` +\item{metadata_path}{Optional. Filepath to \code{cells.csv.gz} or \code{cells.parquet} which contain cell metadata information.} \item{feat_type}{character. feature type. Provide more than one value if -using the `split_keyword` param. For each set of keywords to split by, an +using the \code{split_keyword} param. For each set of keywords to split by, an additional feat_type should be provided in the same order. Affects how the transcripts information is loaded. Helpful for separating out the QC probes. See details.} \item{split_keyword}{list of character vectors of keywords to split the -transcripts based on their feat_ID. Keywords will be `grepl()` +transcripts based on their feat_ID. Keywords will be \code{grepl()} matched against the feature IDs information. See details.} \item{qv_threshold}{Minimum Phred-scaled quality score cutoff to be included as a subcellular transcript detection (default = 20)} -\item{load_images}{Named list of filepaths to `.tif` images, usually the -ones in the `morphology_focus` directory. These `ome.tif` images are not -compatible and must be converted to `tif` using -`[GiottoClass::ometif_to_tif()]`.} +\item{load_images}{Named list of filepaths to \code{.tif} images, usually the +ones in the \code{morphology_focus} directory. These \code{ome.tif} images are not +compatible and must be converted to \code{tif} using +\verb{[GiottoClass::ometif_to_tif()]}.} \item{load_aligned_images}{Named list of filepaths. The list names are used as the image names when loaded. Two filepaths are expected per entry. The -first one should be to the `.tif` image. The second path is to the `.csv` -alignment matrix file. `ome.tif` images will work, but they are currently +first one should be to the \code{.tif} image. The second path is to the \code{.csv} +alignment matrix file. \code{ome.tif} images will work, but they are currently slower in our imaging pipeline.} \item{load_expression}{logical. Default = FALSE. Whether to load in 10X @@ -70,38 +70,38 @@ provided expression matrix.} \item{load_cellmeta}{logical. Default = FALSE. Whether to laod in 10X provided cell metadata information} -\item{verbose}{logical or NULL. NULL uses the `giotto.verbose` option +\item{verbose}{logical or NULL. NULL uses the \code{giotto.verbose} option setting and defaults to TRUE.} } \value{ -`giotto` object +\code{giotto} object } \description{ Create a Giotto object from a Xenium experiment output folder. -Only the `xenium_dir`, `load_images`, and `load_aligned_images` params +Only the \code{xenium_dir}, \code{load_images}, and \code{load_aligned_images} params need to be supplied when defaults are sufficient. All other params have defaults set and are there in case of non-standard directory layouts or alternative preference in file format to load from.\cr -When possible, `.parquet` files are loaded. This requires the additional -installation of \pkg{arrow} with zstd support. See details. `h5` is also +When possible, \code{.parquet} files are loaded. This requires the additional +installation of \pkg{arrow} with zstd support. See details. \code{h5} is also used by default if the 10x provided expression matrix is loaded.\cr The 10X provided aggregated expression matrix and cell metdata are not loaded by default since the results may be slightly different from those that Giotto spatially aggregates. } \details{ -[\strong{arrow zstd support}] +\link[=\\strong{arrow zstd support}]{\strong{arrow zstd support}} Xenium parquets have zstd compression. \pkg{arrow} is used to access parquets, however it may not install on all systems with zstd by default. You can check whether zstd support is installed by running: -`arrow::arrow_info()$capabilities[["zstd"]]`. If `FALSE`, it needs to be +\code{arrow::arrow_info()$capabilities[["zstd"]]}. If \code{FALSE}, it needs to be reinstalled with the following: \preformatted{ Sys.setenv(ARROW_WITH_ZSTD = "ON") install.packages("arrow", repos = c("https://apache.r-universe.dev")) } -[\strong{QC feature types}] +\link[=\\strong{QC feature types}]{\strong{QC feature types}} Xenium provides info on feature detections that include more than only the Gene Expression specific probes. Additional probes for QC are included: \emph{blank codeword}, \emph{negative control codeword}, and @@ -109,16 +109,16 @@ Gene Expression specific probes. Additional probes for QC are included: are treated as their own feature types so that they can largely remain independent of the gene expression information. -[\strong{feat_type and split_keyword}] +\link[=\\strong{feat_type and split_keyword}]{\strong{feat_type and split_keyword}} Additional QC probe information is in the subcellular feature detections information and must be separated from the gene expression information during processing. The QC probes have prefixes that allow them to be selected from the rest of the feature IDs. -Giotto uses `feat_type` and `split_keyword` params to select these QC +Giotto uses \code{feat_type} and \code{split_keyword} params to select these QC probes out as separate feature types. See examples in -`[GiottoClass::createGiottoPoints]` for how this works. +\verb{[GiottoClass::createGiottoPoints]} for how this works. -The Gene expression subset labeled as `rna` is accepted as the subset of -feat_IDs that do not get matched to any of the `split_keywords`. +The Gene expression subset labeled as \code{rna} is accepted as the subset of +feat_IDs that do not get matched to any of the \code{split_keywords}. } From 8239661befba1894c624098f164fc8de6e093ed8 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:14:49 -0400 Subject: [PATCH 20/33] chore: fix formatting --- R/convenience_xenium.R | 10 +++++----- man/createGiottoXeniumObject.Rd | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index ebee56c0d..32ebb99bc 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -1200,7 +1200,7 @@ importXenium <- function( #' @returns `giotto` object #' @details #' -#' [\strong{arrow zstd support}] +#' \[\strong{arrow zstd support}\] #' Xenium parquets have zstd compression. \pkg{arrow} is used to access #' parquets, however it may not install on all systems with zstd by default. #' You can check whether zstd support is installed by running: @@ -1211,7 +1211,7 @@ importXenium <- function( #' install.packages("arrow", repos = c("https://apache.r-universe.dev")) #' } #' -#' [\strong{QC feature types}] +#' \[\strong{QC feature types}\] #' Xenium provides info on feature detections that include more than only the #' Gene Expression specific probes. Additional probes for QC are included: #' \emph{blank codeword}, \emph{negative control codeword}, and @@ -1219,7 +1219,7 @@ importXenium <- function( #' are treated as their own feature types so that they can largely remain #' independent of the gene expression information. #' -#' [\strong{feat_type and split_keyword}] +#' \[\strong{feat_type and split_keyword}\] #' Additional QC probe information is in the subcellular feature detections #' information and must be separated from the gene expression information #' during processing. @@ -1313,7 +1313,7 @@ createGiottoXeniumObject <- function( #' #' @returns giotto object #' #' @details #' #' -#' #' [\strong{QC feature types}] +#' #' \[\strong{QC feature types}\] #' #' Xenium provides info on feature detections that include more than only the #' #' Gene Expression specific probes. Additional probes for QC are included: #' #' \emph{blank codeword}, \emph{negative control codeword}, and @@ -1321,7 +1321,7 @@ createGiottoXeniumObject <- function( #' #' are treated as their own feature types so that they can largely remain #' #' independent of the gene expression information. #' #' -#' #' [\strong{key_list}] +#' #' \[\strong{key_list}\] #' #' Related to \code{data_to_use = 'subcellular'} workflow only: #' #' Additional QC probe information is in the subcellular feature detections #' #' information and must be separated from the gene expression information diff --git a/man/createGiottoXeniumObject.Rd b/man/createGiottoXeniumObject.Rd index 14a2fb4d4..51aadbab7 100644 --- a/man/createGiottoXeniumObject.Rd +++ b/man/createGiottoXeniumObject.Rd @@ -90,7 +90,7 @@ loaded by default since the results may be slightly different from those that Giotto spatially aggregates. } \details{ -\link[=\\strong{arrow zstd support}]{\strong{arrow zstd support}} +[\strong{arrow zstd support}] Xenium parquets have zstd compression. \pkg{arrow} is used to access parquets, however it may not install on all systems with zstd by default. You can check whether zstd support is installed by running: @@ -101,7 +101,7 @@ reinstalled with the following: install.packages("arrow", repos = c("https://apache.r-universe.dev")) } -\link[=\\strong{QC feature types}]{\strong{QC feature types}} +[\strong{QC feature types}] Xenium provides info on feature detections that include more than only the Gene Expression specific probes. Additional probes for QC are included: \emph{blank codeword}, \emph{negative control codeword}, and @@ -109,7 +109,7 @@ Gene Expression specific probes. Additional probes for QC are included: are treated as their own feature types so that they can largely remain independent of the gene expression information. -\link[=\\strong{feat_type and split_keyword}]{\strong{feat_type and split_keyword}} +[\strong{feat_type and split_keyword}] Additional QC probe information is in the subcellular feature detections information and must be separated from the gene expression information during processing. From 33a0eaf4f0bca4edc0357f3b2beb7d471f27bd9f Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:19:41 -0400 Subject: [PATCH 21/33] fix typo --- R/convenience_xenium.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 32ebb99bc..e8bd38d85 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -410,7 +410,7 @@ setMethod( split_keyword = split_keyword, verbose = verbose ) - g <- setGiotto(g, tx, verbose = FALSE) # lists are fine + g <- setGiotto(g, tx_list, verbose = FALSE) # lists are fine # polys if (!is.null(load_bounds)) { From f491e3893564a3b1866dcf9039d5439abfb4d0da Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:27:42 -0400 Subject: [PATCH 22/33] fix typo --- R/convenience_xenium.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index e8bd38d85..2caf9ea81 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -344,7 +344,7 @@ setMethod( ), gene_panel_json_path = panel_meta_path, expression_path = expr_path, - metadata_path = meta_path, + metadata_path = cell_meta_path, feat_type = c( "rna", "NegControlProbe", From 4a32833f3e316347205319758bc4e04533b982bd Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:55:07 -0400 Subject: [PATCH 23/33] fixes - add dropcols for centroids in metadata - fix feature type naming in expression matrices --- R/convenience_xenium.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 2caf9ea81..2a40d7ff1 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -249,7 +249,7 @@ setMethod( # load cellmeta cmeta_fun <- function( path = cell_meta_path, - dropcols = c(), + dropcols = c("x_centroid", "y_centroid"), cores = determine_cores(), verbose = NULL ) { @@ -967,14 +967,18 @@ importXenium <- function( # ensure list if (!inherits(ex_list, "list")) ex_list <- list(ex_list) + # set correct feature name + fname <- "rna" + if (length(fname) > 1L) fname <- names(fname) + fname[fname == "Gene Expression"] <- "rna" # lapply to process more than one if present - eo_list <- lapply(ex_list, function(ex) { + eo_list <- lapply(seq_along(ex_list), function(ex_i) { createExprObj( - expression_data = ex, + expression_data = ex_list[[ex_i]], name = "raw", spat_unit = "cell", - feat_type = "rna", + feat_type = fname[[ex_i]], provenance = "cell" ) }) From 9182a8a050865c48bb51e50edbcc81d033650806 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:56:11 -0400 Subject: [PATCH 24/33] fix refs to image loading funs --- R/convenience_xenium.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 2a40d7ff1..0218007c0 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -474,7 +474,7 @@ setMethod( imglist <- list() imnames <- names(load_images) for (impath_i in seq_along(load_images)) { - im <- load_image( + im <- funs$load_image( path = load_images[[impath_i]], name = imnames[[impath_i]] ) @@ -490,7 +490,7 @@ setMethod( aimglist <- list() aimnames <- names(load_aligned_images) for (aim_i in seq_along(load_aligned_images)) { - aim <- load_aligned_image( + aim <- funs$load_aligned_image( path = load_aligned_images[[aim_i]][1], imagealignment_path = load_aligned_images[[aim_i]][2] ) From 8d3a9e62837f9002ed2d3cc0c6f84c1df8b339ac Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:04:08 -0400 Subject: [PATCH 25/33] fix typo --- R/convenience_xenium.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 0218007c0..6d4972340 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -969,7 +969,7 @@ importXenium <- function( if (!inherits(ex_list, "list")) ex_list <- list(ex_list) # set correct feature name fname <- "rna" - if (length(fname) > 1L) fname <- names(fname) + if (length(names(ex_list)) > 1L) fname <- names(ex_list) fname[fname == "Gene Expression"] <- "rna" # lapply to process more than one if present From b9a50b848d628b77836b2de0af7774421c8984c1 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:21:30 -0400 Subject: [PATCH 26/33] fix and add aligned image naming --- R/convenience_xenium.R | 5 ++++- R/general_help.R | 12 ++++++++---- man/read10xAffineImage.Rd | 19 +++++++++++++++---- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 6d4972340..7d7fdb5ed 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -324,11 +324,13 @@ setMethod( img_aff_fun <- function( path, imagealignment_path, + name = "aligned_image", micron = obj@micron ) { read10xAffineImage( file = path, imagealignment_path = imagealignment_path, + name = name, micron = micron ) } @@ -492,7 +494,8 @@ setMethod( for (aim_i in seq_along(load_aligned_images)) { aim <- funs$load_aligned_image( path = load_aligned_images[[aim_i]][1], - imagealignment_path = load_aligned_images[[aim_i]][2] + imagealignment_path = load_aligned_images[[aim_i]][2], + name = aimnames[[aim_i]] ) aimglist <- c(aimglist, aim) } diff --git a/R/general_help.R b/R/general_help.R index f6fe38ea8..c10ae4ffb 100644 --- a/R/general_help.R +++ b/R/general_help.R @@ -727,14 +727,18 @@ get10Xmatrix_h5 <- function( #' transform. Loads the image in with an orientation that matches the dataset #' points and polygons vector information #' @param file filepath to image +#' @param imagealignment_path filepath to alignment file which contains +#' an affine transformation matrix. Usually a `.csv` file +#' @param name character. Name to assign the image. Default is 'image'. #' @param micron micron scaling. Directly used if a numeric is supplied. #' Also prefers a filepath to the `experiment.xenium` file which contains this #' info. A default of 0.2125 is provided. -#' @param affine filepath to `...imagealignment.csv` which contains an affine -#' transformation matrix +#' @param \dots additional params to pass to +#' `[GiottoClass::createGiottoLargeImage]` +#' @md #' @export read10xAffineImage <- function( - file, imagealignment_path, micron = 0.2125 + file, imagealignment_path, name = "aligned_image", micron = 0.2125, ... ) { checkmate::assert_file_exists(file) checkmate::assert_file_exists(imagealignment_path) @@ -746,7 +750,7 @@ read10xAffineImage <- function( aff <- data.table::fread(imagealignment_path) %>% as.matrix() - img <- createGiottoLargeImage(file) + img <- createGiottoLargeImage(file, name = name, ...) aff_img <- .tenx_img_affine(x = img, affine = aff, micron = micron) diff --git a/man/read10xAffineImage.Rd b/man/read10xAffineImage.Rd index b50226a50..b4681f097 100644 --- a/man/read10xAffineImage.Rd +++ b/man/read10xAffineImage.Rd @@ -4,17 +4,28 @@ \alias{read10xAffineImage} \title{read10xAffineImage} \usage{ -read10xAffineImage(file, imagealignment_path, micron = 0.2125) +read10xAffineImage( + file, + imagealignment_path, + name = "aligned_image", + micron = 0.2125, + ... +) } \arguments{ \item{file}{filepath to image} +\item{imagealignment_path}{filepath to alignment file which contains +an affine transformation matrix. Usually a \code{.csv} file} + +\item{name}{character. Name to assign the image. Default is 'image'.} + \item{micron}{micron scaling. Directly used if a numeric is supplied. -Also prefers a filepath to the `experiment.xenium` file which contains this +Also prefers a filepath to the \code{experiment.xenium} file which contains this info. A default of 0.2125 is provided.} -\item{affine}{filepath to `...imagealignment.csv` which contains an affine -transformation matrix} +\item{\dots}{additional params to pass to +\verb{[GiottoClass::createGiottoLargeImage]}} } \description{ Read a 10x image that is provided with an affine matrix From 2c1d95b98f25cd7ca5ad14bccab39e985239d4f7 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:29:22 -0400 Subject: [PATCH 27/33] messaging changes --- R/convenience_xenium.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 7d7fdb5ed..62bf61376 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -325,13 +325,15 @@ setMethod( path, imagealignment_path, name = "aligned_image", - micron = obj@micron + micron = obj@micron, + verbose = NULL ) { read10xAffineImage( file = path, imagealignment_path = imagealignment_path, name = name, - micron = micron + micron = micron, + verbose = verbose ) } obj@calls$load_aligned_image <- img_aff_fun @@ -492,6 +494,8 @@ setMethod( aimglist <- list() aimnames <- names(load_aligned_images) for (aim_i in seq_along(load_aligned_images)) { + vmsg(.v = verbose, "loading aligned image as '%s'", + aimnames[[aim_i]]) aim <- funs$load_aligned_image( path = load_aligned_images[[aim_i]][1], imagealignment_path = load_aligned_images[[aim_i]][2], @@ -815,7 +819,7 @@ importXenium <- function( e <- file_extension(path) %>% head(1L) %>% tolower() a <- list(path = path, dropcols = dropcols) - vmsg('Loading cell metadata...', .v = verbose) + vmsg('Loading 10X cell metadata...', .v = verbose) vmsg(.v = verbose, .is_debug = TRUE, "[CMETA_READ] FMT =", e) vmsg(.v = verbose, .is_debug = TRUE, path) verbose <- verbose %null% TRUE From e5a6710757eed15dcf86b884dee0f667d4cd81f7 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:34:53 -0400 Subject: [PATCH 28/33] fix typo --- R/convenience_xenium.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 62bf61376..b2106f3f4 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -494,8 +494,8 @@ setMethod( aimglist <- list() aimnames <- names(load_aligned_images) for (aim_i in seq_along(load_aligned_images)) { - vmsg(.v = verbose, "loading aligned image as '%s'", - aimnames[[aim_i]]) + vmsg(.v = verbose, sprintf("loading aligned image as '%s'", + aimnames[[aim_i]])) aim <- funs$load_aligned_image( path = load_aligned_images[[aim_i]][1], imagealignment_path = load_aligned_images[[aim_i]][2], From 658fc72600e642910be3fdcec580264596acc957 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:42:38 -0400 Subject: [PATCH 29/33] update messages --- R/convenience_xenium.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index b2106f3f4..9e3ca4e4d 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -757,7 +757,7 @@ importXenium <- function( e <- file_extension(path) %>% head(1L) %>% tolower() a <- list(path = path) - vmsg("Loading boundary info...", .v = verbose) + vmsg(sprintf("Loading boundary info '%s'", name), .v = verbose) vmsg(.v = verbose, .is_debug = TRUE, "[POLY_READ] FMT =", e) vmsg(.v = verbose, .is_debug = TRUE, path) # pass to specific load function based on file extension From 4edc4578bdd4950308c750c4ae7534267236c386 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:56:53 -0400 Subject: [PATCH 30/33] fix optional params passing for xen conv. --- R/convenience_xenium.R | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 9e3ca4e4d..624110d92 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -1181,7 +1181,8 @@ importXenium <- function( #' contains feature metadata information and ENSG names. #' @param expression_path Optional. Filepath to cell feature matrix. Accepts #' either the `.h5` or the unzipped directory containing `.mtx` files. -#' @param metadata_path Optional. Filepath to `cells.csv.gz` or `cells.parquet` +#' @param cell_metadata_path Optional. Filepath to `cells.csv.gz` or +#' `cells.parquet` #' which contain cell metadata information. #' @param feat_type character. feature type. Provide more than one value if #' using the `split_keyword` param. For each set of keywords to split by, an @@ -1252,9 +1253,9 @@ createGiottoXeniumObject <- function( cell = "cell", nucleus = "nucleus" ), - gene_panel_json_path = NULL, + gene_panel_json_path = NULL, # optional expression_path = NULL, # optional - metadata_path = NULL, + cell_metadata_path = NULL, # optional feat_type = c( "rna", "NegControlProbe", @@ -1277,21 +1278,25 @@ createGiottoXeniumObject <- function( # apply reader params x$qv <- qv_threshold - g <- x$create_gobject( - transcript_path = transcript_path, - load_bounds = bounds_path, - gene_panel_json_path = gene_panel_json_path, - expression_path = expression_path, - metadata_path = metadata_path, - feat_type = split_keyword, - split_keyword = split_keyword, - load_images = load_images, - load_aligned_images = load_aligned_images, - load_expression = load_expression, - load_cellmeta = load_cellmeta, - verbose = verbose - ) + # directly passed + a <- list(load_bounds = bounds_path, + feat_type = feat_type, + split_keyword = split_keyword, + load_images = load_images, + load_aligned_images = load_aligned_images, + load_expression = load_expression, + load_cellmeta = load_cellmeta, + verbose = verbose) + + # only passed if not null + if (!is.null(transcript_path)) a$transcript_path <- transcript_path + if (!is.null(gene_panel_json_path)) { + a$gene_panel_json_path <- gene_panel_json_path + } + if (!is.null(expression_path)) a$expression_path <- expression_path + if (!is.null(cell_metadata_path)) a$metadata_path <- cell_metadata_path + g <- do.call(x$create_gobject, args = a) return(g) } From 45417241c3c3849c92d0fb4d0599b375defe794a Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 12:56:58 -0400 Subject: [PATCH 31/33] docs --- man/createGiottoXeniumObject.Rd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/man/createGiottoXeniumObject.Rd b/man/createGiottoXeniumObject.Rd index 51aadbab7..0d310af52 100644 --- a/man/createGiottoXeniumObject.Rd +++ b/man/createGiottoXeniumObject.Rd @@ -10,7 +10,7 @@ createGiottoXeniumObject( bounds_path = list(cell = "cell", nucleus = "nucleus"), gene_panel_json_path = NULL, expression_path = NULL, - metadata_path = NULL, + cell_metadata_path = NULL, feat_type = c("rna", "NegControlProbe", "UnassignedCodeword", "NegControlCodeword"), split_keyword = list("NegControlProbe", "UnassignedCodeword", "NegControlCodeword"), qv_threshold = 20, @@ -37,7 +37,8 @@ contains feature metadata information and ENSG names.} \item{expression_path}{Optional. Filepath to cell feature matrix. Accepts either the \code{.h5} or the unzipped directory containing \code{.mtx} files.} -\item{metadata_path}{Optional. Filepath to \code{cells.csv.gz} or \code{cells.parquet} +\item{cell_metadata_path}{Optional. Filepath to \code{cells.csv.gz} or +\code{cells.parquet} which contain cell metadata information.} \item{feat_type}{character. feature type. Provide more than one value if From 699ee757e8906193d04d92fab2ce44b2d6d68b13 Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 13:39:06 -0400 Subject: [PATCH 32/33] fix mtx dir detection --- R/convenience_xenium.R | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 624110d92..0caaabbbe 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -135,7 +135,7 @@ setMethod( } if (!ftype$expression %in% ft_exp) { stop(wrap_txt("`$filetype$expression` must be one of", - paste(ft_tab, collapse = ", ")), + paste(tf_exp, collapse = ", ")), call. = FALSE) } @@ -178,9 +178,18 @@ setMethod( tx_path <- .xenium_ftype(tx_path, ftype$transcripts) cell_bound_path <- .xenium_ftype(cell_bound_path, ftype$boundaries) nuc_bound_path <- .xenium_ftype(nuc_bound_path, ftype$boundaries) - expr_path <- .xenium_ftype(expr_path, ftype$expression) cell_meta_path <- .xenium_ftype(cell_meta_path, ftype$cell_meta) + # for mtx, check if directory instead + if (ftype$expression == "mtx") { + is_dir <- vapply( + expr_path, checkmate::test_directory, FUN.VALUE = logical(1L) + ) + expr_path <- expr_path[is_dir] + } else { + expr_path <- .xenium_ftype(expr_path, ftype$expression) + } + # decide micron scaling if (length(obj@micron) == 0) { # if no value already set if (!is.null(experiment_info_path)) { @@ -444,7 +453,7 @@ setMethod( # no dropcols verbose = verbose ) - g <- setGiotto(g, fx) + g <- setGiotto(g, fx, verbose = FALSE) # expression From 2b8c5f788835065a3bb30ab2642685b0b7c703ea Mon Sep 17 00:00:00 2001 From: George Chen <72078254+jiajic@users.noreply.github.com> Date: Wed, 31 Jul 2024 13:52:26 -0400 Subject: [PATCH 33/33] fix checking for filepaths --- R/convenience_xenium.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/R/convenience_xenium.R b/R/convenience_xenium.R index 0caaabbbe..ef7f9d836 100644 --- a/R/convenience_xenium.R +++ b/R/convenience_xenium.R @@ -117,7 +117,7 @@ setMethod( ftype <- obj@filetype ft_tab <- c("csv", "parquet") - ft_exp <- c("h5", "mtx", "zarr") + ft_exp <- c("h5", "mtx") # zarr not yet supported if (!ftype$transcripts %in% ft_tab) { stop(wrap_txt("`$filetype$transcripts` must be one of", paste(ft_tab, collapse = ", ")), @@ -956,7 +956,7 @@ importXenium <- function( "No path to expression dir (mtx) or file (h5) provided or auto-detected" ), call. = FALSE) } - checkmate::assert_file_exists(path) + if (!file.exists(path)) stop("filepath or directory does not exist.\n") a <- list( path = path, gene_ids = gene_ids, @@ -978,7 +978,8 @@ importXenium <- function( verbose <- verbose %null% TRUE ex_list <- switch(e, "mtx" = do.call(.xenium_expression_mtx, args = a), - "h5" = do.call(.xenium_expression_h5, args = a) + "h5" = do.call(.xenium_expression_h5, args = a), + "zarr" = stop("zarr reading not yet implemented") ) # ensure list