From bee2b5dd2e132c2eb5a684d259cba5c50ff206a9 Mon Sep 17 00:00:00 2001 From: sllynn Date: Tue, 24 Sep 2024 13:36:53 +0200 Subject: [PATCH 01/12] formatter, added rst_setsrid to docs --- docs/source/api/raster-functions.rst | 49 ++++++++++++++++++++++++++-- python/mosaic/api/raster.py | 30 ++++++++--------- python/test/test_raster_functions.py | 9 +++-- python/test/test_vector_functions.py | 3 +- 4 files changed, 70 insertions(+), 21 deletions(-) diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst index 517b8f8f1..ec51fa33f 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -50,7 +50,7 @@ rst_avg Returns an array containing mean values for each band. - :param tile: A column containing the raster tile. + :param tile: A column containing the raster tile. :type tile: Column (RasterTileType) :rtype: Column: ArrayType(DoubleType) @@ -92,7 +92,7 @@ rst_bandmetadata Extract the metadata describing the raster band. Metadata is return as a map of key value pairs. - :param tile: A column containing the raster tile. + :param tile: A column containing the raster tile. :type tile: Column (RasterTileType) :param band: The band number to extract metadata for. :type band: Column (IntegerType) @@ -2616,7 +2616,7 @@ rst_separatebands +--------------------------------------------------------------------------------------------------------------------------------+ rst_setnodata -********************** +************* .. function:: rst_setnodata(tile, nodata) @@ -2668,6 +2668,49 @@ rst_setnodata | {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | +------------------------------------------------------------------------------------------------------------------+ +rst_setsrid +*********** + +.. function:: rst_setsrid(tile, srid) + + Set the SRID of the raster tile as an EPSG code. + + :param tile: A column containing the raster tile. + :type tile: Column (RasterTileType) + :param srid: The SRID to set + :type srid: Column (IntegerType) + :rtype: Column: (RasterTileType) + + :example: + +.. tabs:: + .. code-tab:: py + + df.select(mos.rst_setsrid('tile', F.lit(9122))).display() + +------------------------------------------------------------------------------------------------------------------+ + | rst_setsrid(tile, 9122) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ + + .. code-tab:: scala + + df.select(rst_setsrid(col("tile"), lit(9122))).show + +------------------------------------------------------------------------------------------------------------------+ + | rst_setsrid(tile, 9122) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ + + .. code-tab:: sql + + SELECT rst_setsrid(tile, 9122) FROM table + +------------------------------------------------------------------------------------------------------------------+ + | rst_setsrid(tile, 9122) | + +------------------------------------------------------------------------------------------------------------------+ + | {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } | + +------------------------------------------------------------------------------------------------------------------+ + rst_skewx ********* diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index 3fa75e098..3454cb0a7 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -95,8 +95,7 @@ def rst_avg(raster_tile: ColumnOrName) -> Column: """ return config.mosaic_context.invoke_function( - "rst_avg", - pyspark_to_java_column(raster_tile) + "rst_avg", pyspark_to_java_column(raster_tile) ) @@ -355,8 +354,9 @@ def rst_frombands(bands: ColumnOrName) -> Column: "rst_frombands", pyspark_to_java_column(bands) ) + def rst_fromcontent( - raster_bin: ColumnOrName, driver: ColumnOrName, size_in_mb: Any = -1 + raster_bin: ColumnOrName, driver: ColumnOrName, size_in_mb: Any = -1 ) -> Column: """ Tiles the raster binary into tiles of the given size. @@ -579,7 +579,9 @@ def rst_mapalgebra(raster_tile: ColumnOrName, json_spec: ColumnOrName) -> Column """ return config.mosaic_context.invoke_function( - "rst_mapalgebra", pyspark_to_java_column(raster_tile), pyspark_to_java_column(json_spec) + "rst_mapalgebra", + pyspark_to_java_column(raster_tile), + pyspark_to_java_column(json_spec), ) @@ -599,8 +601,7 @@ def rst_max(raster_tile: ColumnOrName) -> Column: """ return config.mosaic_context.invoke_function( - "rst_max", - pyspark_to_java_column(raster_tile) + "rst_max", pyspark_to_java_column(raster_tile) ) @@ -620,8 +621,7 @@ def rst_median(raster_tile: ColumnOrName) -> Column: """ return config.mosaic_context.invoke_function( - "rst_median", - pyspark_to_java_column(raster_tile) + "rst_median", pyspark_to_java_column(raster_tile) ) @@ -699,13 +699,12 @@ def rst_min(raster_tile: ColumnOrName) -> Column: """ return config.mosaic_context.invoke_function( - "rst_min", - pyspark_to_java_column(raster_tile) + "rst_min", pyspark_to_java_column(raster_tile) ) def rst_ndvi( - raster_tile: ColumnOrName, band1: ColumnOrName, band2: ColumnOrName + raster_tile: ColumnOrName, band1: ColumnOrName, band2: ColumnOrName ) -> Column: """ Computes the NDVI of the raster. @@ -734,6 +733,7 @@ def rst_ndvi( pyspark_to_java_column(band2), ) + def rst_numbands(raster_tile: ColumnOrName) -> Column: """ Parameters @@ -1329,10 +1329,10 @@ def rst_tessellate(raster_tile: ColumnOrName, resolution: ColumnOrName) -> Colum def rst_to_overlapping_tiles( - raster_tile: ColumnOrName, - width: ColumnOrName, - height: ColumnOrName, - overlap: ColumnOrName, + raster_tile: ColumnOrName, + width: ColumnOrName, + height: ColumnOrName, + overlap: ColumnOrName, ) -> Column: """ Tiles the raster into tiles of the given size. diff --git a/python/test/test_raster_functions.py b/python/test/test_raster_functions.py index 7e9a00c64..ca3571a27 100644 --- a/python/test/test_raster_functions.py +++ b/python/test/test_raster_functions.py @@ -56,7 +56,13 @@ def test_raster_scalar_functions(self): .withColumn("rst_height", api.rst_height("tile")) .withColumn("rst_initnodata", api.rst_initnodata("tile")) .withColumn("rst_isempty", api.rst_isempty("tile")) - .withColumn("rst_mapalgebra", api.rst_mapalgebra(array("tile", "rst_initnodata"), lit('{"calc": "A+B", "A_index": 0, "B_index": 1}'))) + .withColumn( + "rst_mapalgebra", + api.rst_mapalgebra( + array("tile", "rst_initnodata"), + lit('{"calc": "A+B", "A_index": 0, "B_index": 1}'), + ), + ) .withColumn("rst_memsize", api.rst_memsize("tile")) .withColumn("rst_merge", api.rst_merge(array("tile", "tile"))) .withColumn("rst_metadata", api.rst_metadata("tile")) @@ -245,7 +251,6 @@ def test_netcdf_load_tessellate_clip_merge(self): self.assertEqual(merged_precipitation.count(), 1) def test_dtmfromgeoms(self): - outputRegion = "POLYGON((348000 462000, 348000 461000, 349000 461000, 349000 462000, 348000 462000))" points_df = ( diff --git a/python/test/test_vector_functions.py b/python/test/test_vector_functions.py index e2c9740db..2c6fb18a4 100644 --- a/python/test/test_vector_functions.py +++ b/python/test/test_vector_functions.py @@ -324,7 +324,8 @@ def test_triangulate_interpolate(self): ) triangulation_df = df.withColumn( - "triangles", api.st_triangulate("masspoints", "breaklines", lit(0.0), lit(0.01)) + "triangles", + api.st_triangulate("masspoints", "breaklines", lit(0.0), lit(0.01)), ) triangulation_df.cache() self.assertEqual(triangulation_df.count(), 2) From 391acf4a27cc204b22d3c2b9397338f7cdedb5bd Mon Sep 17 00:00:00 2001 From: sllynn Date: Tue, 24 Sep 2024 13:53:58 +0200 Subject: [PATCH 02/12] standardize on rst_tooverlappingtiles, mark rst_to_overlapping_tiles as deprecated --- .../tests/testthat/testRasterFunctions.R | 6 ++--- .../tests/testthat/testRasterFunctions.R | 6 ++--- python/mosaic/api/raster.py | 25 +++++++++++++------ python/test/test_raster_functions.py | 15 +++++------ .../raster/RST_ToOverlappingTiles.scala | 2 +- .../labs/mosaic/functions/MosaicContext.scala | 4 +-- .../RST_ToOverlappingTilesBehaviors.scala | 4 +-- 7 files changed, 37 insertions(+), 25 deletions(-) diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R index 6e3ea147e..c27b9ec46 100644 --- a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R @@ -67,7 +67,7 @@ test_that("raster flatmap functions behave as intended", { expect_equal(nrow(tessellate_sdf), 63) overlap_sdf <- generate_singleband_raster_df() - overlap_sdf <- withColumn(overlap_sdf, "rst_to_overlapping_tiles", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L))) + overlap_sdf <- withColumn(overlap_sdf, "rst_tooverlappingtiles", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L))) expect_no_error(write.df(overlap_sdf, source = "noop", mode = "overwrite")) expect_equal(nrow(overlap_sdf), 87) @@ -76,7 +76,7 @@ test_that("raster flatmap functions behave as intended", { test_that("raster aggregation functions behave as intended", { collection_sdf <- generate_singleband_raster_df() collection_sdf <- withColumn(collection_sdf, "extent", st_astext(rst_boundingbox(column("tile")))) - collection_sdf <- withColumn(collection_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L))) + collection_sdf <- withColumn(collection_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L))) merge_sdf <- summarize( groupBy(collection_sdf, "path"), @@ -127,7 +127,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", { raster_sdf <- withColumn(raster_sdf, "timestep", element_at(rst_metadata(column("tile")), "NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX")) raster_sdf <- where(raster_sdf, "timestep = 21") raster_sdf <- withColumn(raster_sdf, "tile", rst_setsrid(column("tile"), lit(4326L))) - raster_sdf <- withColumn(raster_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(20L), lit(20L), lit(10L))) + raster_sdf <- withColumn(raster_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(20L), lit(20L), lit(10L))) raster_sdf <- withColumn(raster_sdf, "tile", rst_tessellate(column("tile"), lit(target_resolution))) clipped_sdf <- join(raster_sdf, census_sdf, raster_sdf$tile.index_id == census_sdf$index_id) diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R index ea28a44d6..c60592e03 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R @@ -95,7 +95,7 @@ test_that("raster flatmap functions behave as intended", { expect_equal(sdf_nrow(tessellate_sdf), 63) overlap_sdf <- generate_singleband_raster_df() %>% - mutate(rst_to_overlapping_tiles = rst_to_overlapping_tiles(tile, 200L, 200L, 10L)) + mutate(rst_tooverlappingtiles = rst_tooverlappingtiles(tile, 200L, 200L, 10L)) expect_no_error(spark_write_source(overlap_sdf, "noop", mode = "overwrite")) expect_equal(sdf_nrow(overlap_sdf), 87) @@ -105,7 +105,7 @@ test_that("raster flatmap functions behave as intended", { test_that("raster aggregation functions behave as intended", { collection_sdf <- generate_singleband_raster_df() %>% mutate(extent = st_astext(rst_boundingbox(tile))) %>% - mutate(tile = rst_to_overlapping_tiles(tile, 200L, 200L, 10L)) + mutate(tile = rst_tooverlappingtiles(tile, 200L, 200L, 10L)) merge_sdf <- collection_sdf %>% group_by(path) %>% @@ -167,7 +167,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", { indexed_raster_sdf <- sdf_sql(sc, "SELECT tile, element_at(rst_metadata(tile), 'NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX') as timestep FROM raster") %>% filter(timestep == 21L) %>% mutate(tile = rst_setsrid(tile, 4326L)) %>% - mutate(tile = rst_to_overlapping_tiles(tile, 20L, 20L, 10L)) %>% + mutate(tile = rst_tooverlappingtiles(tile, 20L, 20L, 10L)) %>% mutate(tile = rst_tessellate(tile, target_resolution)) clipped_sdf <- indexed_raster_sdf %>% diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index 3454cb0a7..a60a5c3b5 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -67,7 +67,8 @@ "rst_summary", "rst_tessellate", "rst_transform", - "rst_to_overlapping_tiles", + "rst_tooverlappingtiles", + "rst_to_overlapping_tiles", # <- deprecated "rst_tryopen", "rst_updatetype", "rst_upperleftx", @@ -1328,11 +1329,11 @@ def rst_tessellate(raster_tile: ColumnOrName, resolution: ColumnOrName) -> Colum ) -def rst_to_overlapping_tiles( - raster_tile: ColumnOrName, - width: ColumnOrName, - height: ColumnOrName, - overlap: ColumnOrName, +def rst_tooverlappingtiles( + raster_tile: ColumnOrName, + width: ColumnOrName, + height: ColumnOrName, + overlap: ColumnOrName, ) -> Column: """ Tiles the raster into tiles of the given size. @@ -1342,7 +1343,7 @@ def rst_to_overlapping_tiles( """ return config.mosaic_context.invoke_function( - "rst_to_overlapping_tiles", + "rst_tooverlappingtiles", pyspark_to_java_column(raster_tile), pyspark_to_java_column(width), pyspark_to_java_column(height), @@ -1350,6 +1351,16 @@ def rst_to_overlapping_tiles( ) +def rst_to_overlapping_tiles( + raster_tile: ColumnOrName, + width: ColumnOrName, + height: ColumnOrName, + overlap: ColumnOrName, + ) -> Column: + + return rst_tooverlappingtiles(raster_tile, width, height, overlap) + + def rst_transform(raster_tile: ColumnOrName, srid: ColumnOrName) -> Column: """ Transforms the raster to the given SRID. diff --git a/python/test/test_raster_functions.py b/python/test/test_raster_functions.py index ca3571a27..68288be69 100644 --- a/python/test/test_raster_functions.py +++ b/python/test/test_raster_functions.py @@ -143,8 +143,8 @@ def test_raster_flatmap_functions(self): overlap_result = ( self.generate_singleband_raster_df() .withColumn( - "rst_to_overlapping_tiles", - api.rst_to_overlapping_tiles("tile", lit(200), lit(200), lit(10)), + "rst_tooverlappingtiles", + api.rst_tooverlappingtiles("tile", lit(200), lit(200), lit(10)), ) .withColumn("rst_subdatasets", api.rst_subdatasets("tile")) ) @@ -157,15 +157,16 @@ def test_raster_aggregator_functions(self): self.generate_singleband_raster_df() .withColumn("extent", api.st_astext(api.rst_boundingbox("tile"))) .withColumn( - "rst_to_overlapping_tiles", - api.rst_to_overlapping_tiles("tile", lit(200), lit(200), lit(10)), + "tile", + api.rst_tooverlappingtiles("tile", lit(200), lit(200), lit(10)), ) ) merge_result = ( collection.groupBy("path") - .agg(api.rst_merge_agg("tile").alias("tile")) - .withColumn("extent", api.st_astext(api.rst_boundingbox("tile"))) + .agg(api.rst_merge_agg("tile").alias("merge_tile")) + .withColumn("extent", api.st_astext(api.rst_boundingbox("merge_tile"))) + .cache() ) self.assertEqual(merge_result.count(), 1) @@ -225,7 +226,7 @@ def test_netcdf_load_tessellate_clip_merge(self): .withColumn("tile", api.rst_setsrid("tile", lit(4326))) .where(col("timestep") == 21) .withColumn( - "tile", api.rst_to_overlapping_tiles("tile", lit(20), lit(20), lit(10)) + "tile", api.rst_tooverlappingtiles("tile", lit(20), lit(20), lit(10)) ) .repartition(self.spark.sparkContext.defaultParallelism) ) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala index 2c3768513..d6fc5e2a6 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTiles.scala @@ -41,7 +41,7 @@ case class RST_ToOverlappingTiles( /** Expression info required for the expression registration for spark SQL. */ object RST_ToOverlappingTiles extends WithExpressionInfo { - override def name: String = "rst_to_overlapping_tiles" + override def name: String = "rst_tooverlappingtiles" override def usage: String = """ diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala index b0be5ed9f..1539f1f0b 100644 --- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala +++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala @@ -803,9 +803,9 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends ColumnAdapter(RST_FromFile(raster.expr, sizeInMB.expr, expressionConfig)) def rst_fromfile(raster: Column, sizeInMB: Int): Column = ColumnAdapter(RST_FromFile(raster.expr, lit(sizeInMB).expr, expressionConfig)) - def rst_to_overlapping_tiles(raster: Column, width: Int, height: Int, overlap: Int): Column = + def rst_tooverlappingtiles(raster: Column, width: Int, height: Int, overlap: Int): Column = ColumnAdapter(RST_ToOverlappingTiles(raster.expr, lit(width).expr, lit(height).expr, lit(overlap).expr, expressionConfig)) - def rst_to_overlapping_tiles(raster: Column, width: Column, height: Column, overlap: Column): Column = + def rst_tooverlappingtiles(raster: Column, width: Column, height: Column, overlap: Column): Column = ColumnAdapter(RST_ToOverlappingTiles(raster.expr, width.expr, height.expr, overlap.expr, expressionConfig)) def rst_tryopen(raster: Column): Column = ColumnAdapter(RST_TryOpen(raster.expr, expressionConfig)) def rst_subdivide(raster: Column, sizeInMB: Column): Column = diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesBehaviors.scala index bfd202a3b..445d3d358 100644 --- a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesBehaviors.scala +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ToOverlappingTilesBehaviors.scala @@ -24,7 +24,7 @@ trait RST_ToOverlappingTilesBehaviors extends QueryTest { .load("src/test/resources/modis") val gridTiles = rastersInMemory - .withColumn("tile", rst_to_overlapping_tiles($"tile", lit(500), lit(500), lit(10))) + .withColumn("tile", rst_tooverlappingtiles($"tile", lit(500), lit(500), lit(10))) .select("tile") rastersInMemory @@ -32,7 +32,7 @@ trait RST_ToOverlappingTilesBehaviors extends QueryTest { noException should be thrownBy spark.sql( """ - |select rst_to_overlapping_tiles(tile, 500, 500, 10) + |select rst_tooverlappingtiles(tile, 500, 500, 10) | from source |""".stripMargin).take(1) From 5dabff7fc9e9dcd04d306ee28e13336ea7da9952 Mon Sep 17 00:00:00 2001 From: sllynn Date: Tue, 24 Sep 2024 15:42:21 +0200 Subject: [PATCH 03/12] fixed error in RST_WorldToRasterCoordY (was returning x) --- .../mosaic/expressions/raster/RST_WorldToRasterCoordX.scala | 3 ++- .../mosaic/expressions/raster/RST_WorldToRasterCoordY.scala | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala index 43ea89de3..19b14cb60 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordX.scala @@ -28,8 +28,9 @@ case class RST_WorldToRasterCoordX( */ override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val xGeo = arg1.asInstanceOf[Double] + val yGeo = arg2.asInstanceOf[Double] val gt = tile.getRaster.getRaster.GetGeoTransform() - GDAL.fromWorldCoord(gt, xGeo, 0)._1 + GDAL.fromWorldCoord(gt, xGeo, yGeo)._1 } } diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala index 93c0c27e7..44cd0b8ac 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_WorldToRasterCoordY.scala @@ -28,8 +28,9 @@ case class RST_WorldToRasterCoordY( */ override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val xGeo = arg1.asInstanceOf[Double] + val yGeo = arg2.asInstanceOf[Double] val gt = tile.getRaster.getRaster.GetGeoTransform() - GDAL.fromWorldCoord(gt, xGeo, 0)._2 + GDAL.fromWorldCoord(gt, xGeo, yGeo)._2 } } From fc7b52728007f2f9e922581b84df2be50c741ab7 Mon Sep 17 00:00:00 2001 From: sllynn Date: Tue, 24 Sep 2024 15:59:02 +0200 Subject: [PATCH 04/12] marked rst_to_overlapping_tiles deprecated in MosaicContext --- .../com/databricks/labs/mosaic/functions/MosaicContext.scala | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala index 1539f1f0b..6fd124963 100644 --- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala +++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala @@ -1006,6 +1006,10 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends def try_sql(inCol: Column): Column = ColumnAdapter(TrySql(inCol.expr)) // Legacy API + @deprecated("Please use 'rst_tooverlappingtiles' expression instead.") + def rst_to_overlapping_tiles(raster: Column, width: Int, height: Int, overlap: Int): Column = rst_tooverlappingtiles(raster, width, height, overlap) + @deprecated("Please use 'rst_tooverlappingtiles' expression instead.") + def rst_to_overlapping_tiles(raster: Column, width: Column, height: Column, overlap: Column): Column = rst_tooverlappingtiles(raster, width, height, overlap) @deprecated("Please use 'st_intersects_agg' expression instead.") def st_intersects_aggregate(leftIndex: Column, rightIndex: Column): Column = st_intersects_agg(leftIndex, rightIndex) @deprecated("Please use 'st_intersection_agg' expression instead.") From 9a0b0872bd6aec2fcbefc493723813c4ac3d1b0b Mon Sep 17 00:00:00 2001 From: sllynn Date: Tue, 24 Sep 2024 16:07:18 +0200 Subject: [PATCH 05/12] updated RST_PixelCount to take extra params --- docs/source/api/raster-functions.rst | 26 +++++++++++-- python/mosaic/api/raster.py | 19 +++++++-- .../raster/gdal/MosaicRasterBandGDAL.scala | 39 ++++++++++++------- .../expressions/raster/RST_PixelCount.scala | 30 ++++++++++---- .../labs/mosaic/functions/MosaicContext.scala | 4 +- .../raster/RST_PixelCountBehaviors.scala | 3 +- 6 files changed, 91 insertions(+), 30 deletions(-) diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst index ec51fa33f..1a38f64a1 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -1737,16 +1737,36 @@ rst_numbands +---------------------+ rst_pixelcount -*************** +************** -.. function:: rst_pixelcount(tile) +.. function:: rst_pixelcount(tile, count_nodata, count_all) - Returns an array containing valid pixel count values for each band. + Returns an array containing pixel count values for each band; default excludes mask and nodata pixels. :param tile: A column containing the raster tile. :type tile: Column (RasterTileType) + :param count_nodata: A column to specify whether to count nodata pixels. + :type count_nodata: Column (BooleanType) + :param count_all: A column to specify whether to count all pixels. + :type count_all: Column (BooleanType) :rtype: Column: ArrayType(LongType) +.. note:: + + Notes: + + If pixel value is noData or mask value is 0.0, the pixel is not counted by default. + + :code:`count_nodata` + - This is an optional param. + - if specified as true, include the noData (not mask) pixels in the count (default is false). + + :code:`count_all` + - This is an optional param; as a positional arg, must also pass :code:`count_nodata` + (value of :code:`count_nodata` is ignored). + - if specified as true, simply return bandX * bandY in the count (default is false). +.. + :example: .. tabs:: diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index deb6769e2..a53b7988b 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -755,21 +755,34 @@ def rst_numbands(raster_tile: ColumnOrName) -> Column: ) -def rst_pixelcount(raster_tile: ColumnOrName) -> Column: +def rst_pixelcount(raster_tile: ColumnOrName, count_nodata: Any = False, count_all: Any = False) -> Column: """ Parameters ---------- raster_tile : Column (RasterTileType) Mosaic raster tile struct column. - + count_nodata : Column(BooleanType) + If false do not include noData pixels in count (default is false). + count_all : Column(BooleanType) + If true, simply return bandX * bandY (default is false). Returns ------- Column (ArrayType(LongType)) Array containing valid pixel count values for each band. """ + + if type(count_nodata) == bool: + count_nodata = lit(count_nodata) + + if type(count_all) == bool: + count_all = lit(count_all) + return config.mosaic_context.invoke_function( - "rst_pixelcount", pyspark_to_java_column(raster_tile) + "rst_pixelcount", + pyspark_to_java_column(raster_tile), + pyspark_to_java_column(count_nodata), + pyspark_to_java_column(count_all), ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala index fb9edb14e..91acd50bb 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala @@ -242,25 +242,36 @@ case class MosaicRasterBandGDAL(band: Band, id: Int) { /** * Counts the number of pixels in the band. The mask is used to determine * if a pixel is valid. If pixel value is noData or mask value is 0.0, the - * pixel is not counted. - * + * pixel is not counted by default. + * @param countNoData + * If specified as true, include the noData (default is false). + * @param countAll + * If specified as true, simply return bandX * bandY (default is false). * @return * Returns the band's pixel count. */ - def pixelCount: Int = { - val line = Array.ofDim[Double](band.GetXSize()) - val maskLine = Array.ofDim[Double](band.GetXSize()) - var count = 0 - for (y <- 0 until band.GetYSize()) { - band.ReadRaster(0, y, band.GetXSize(), 1, line) - val maskRead = band.GetMaskBand().ReadRaster(0, y, band.GetXSize(), 1, maskLine) - if (maskRead != gdalconstConstants.CE_None) { - count = count + line.count(_ != noDataValue) - } else { - count = count + line.zip(maskLine).count { case (pixel, mask) => pixel != noDataValue && mask != 0.0 } + def pixelCount(countNoData: Boolean = false, countAll: Boolean = false): Int = { + if (countAll) { + // all pixels returned + band.GetXSize() * band.GetYSize() + } else { + // nodata not included (default) + val line = Array.ofDim[Double](band.GetXSize()) + var count = 0 + for (y <- 0 until band.GetYSize()) { + band.ReadRaster(0, y, band.GetXSize(), 1, line) + val maskLine = Array.ofDim[Double](band.GetXSize()) + val maskRead = band.GetMaskBand().ReadRaster(0, y, band.GetXSize(), 1, maskLine) + if (maskRead != gdalconstConstants.CE_None) { + count = count + line.count(pixel => countNoData || pixel != noDataValue) + } else { + count = count + line.zip(maskLine).count { + case (pixel, mask) => mask != 0.0 && (countNoData || pixel != noDataValue) + } + } } + count } - count } /** diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCount.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCount.scala index ebc4ebc15..bcb0701cf 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCount.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCount.scala @@ -2,7 +2,7 @@ package com.databricks.labs.mosaic.expressions.raster import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile import com.databricks.labs.mosaic.expressions.base.{GenericExpressionFactory, WithExpressionInfo} -import com.databricks.labs.mosaic.expressions.raster.base.RasterExpression +import com.databricks.labs.mosaic.expressions.raster.base.Raster2ArgExpression import com.databricks.labs.mosaic.functions.MosaicExpressionConfig import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback @@ -11,17 +11,31 @@ import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types._ /** Returns an array containing valid pixel count values for each band. */ -case class RST_PixelCount(rasterExpr: Expression, expressionConfig: MosaicExpressionConfig) - extends RasterExpression[RST_PixelCount](rasterExpr, returnsRaster = false, expressionConfig) +case class RST_PixelCount( + rasterExpr: Expression, + noDataExpr: Expression, + allExpr: Expression, + expressionConfig: MosaicExpressionConfig) + extends Raster2ArgExpression[RST_PixelCount](rasterExpr, noDataExpr, allExpr, returnsRaster = false, expressionConfig) with NullIntolerant with CodegenFallback { override def dataType: DataType = ArrayType(LongType) - /** Returns an array containing valid pixel count values for each band. */ - override def rasterTransform(tile: MosaicRasterTile): Any = { + /** + * Returns an array containing valid pixel count values for each band. + * - default is to exclude nodata and mask pixels. + * - if countNoData specified as true, include the noData (not mask) pixels in the count (default is false). + * - if countAll specified as true, simply return bandX * bandY in the count (default is false). countAll ignores + * countNodData + */ + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val bandCount = tile.raster.raster.GetRasterCount() - val pixelCount = (1 to bandCount).map(tile.raster.getBand(_).pixelCount) + val countNoData = arg1.asInstanceOf[Boolean] + val countAll = arg2.asInstanceOf[Boolean] + val pixelCount = (1 to bandCount).map( + tile.raster.getBand(_).pixelCount(countNoData, countAll) + ) ArrayData.toArrayData(pixelCount.toArray) } @@ -32,7 +46,7 @@ object RST_PixelCount extends WithExpressionInfo { override def name: String = "rst_pixelcount" - override def usage: String = "_FUNC_(expr1) - Returns an array containing valid pixel count values for each band." + override def usage: String = "_FUNC_(expr1) - Returns an array containing pixel count values for each band (default excludes nodata and mask)." override def example: String = """ @@ -42,7 +56,7 @@ object RST_PixelCount extends WithExpressionInfo { | """.stripMargin override def builder(expressionConfig: MosaicExpressionConfig): FunctionBuilder = { - GenericExpressionFactory.getBaseBuilder[RST_PixelCount](1, expressionConfig) + GenericExpressionFactory.getBaseBuilder[RST_PixelCount](3, expressionConfig) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala index 6fd124963..809e348c1 100644 --- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala +++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala @@ -703,7 +703,9 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends def rst_convolve(raster: Column, kernel: Column): Column = ColumnAdapter(RST_Convolve(raster.expr, kernel.expr, expressionConfig)) def rst_dtmfromgeoms(pointsArray: Column, linesArray: Column, mergeTol: Column, snapTol: Column, origin: Column, xWidth: Column, yWidth: Column, xSize: Column, ySize: Column): Column = ColumnAdapter(RST_DTMFromGeoms(pointsArray.expr, linesArray.expr, mergeTol.expr, snapTol.expr, origin.expr, xWidth.expr, yWidth.expr, xSize.expr, ySize.expr, expressionConfig)) - def rst_pixelcount(raster: Column): Column = ColumnAdapter(RST_PixelCount(raster.expr, expressionConfig)) + def rst_pixelcount(raster: Column): Column = ColumnAdapter(RST_PixelCount(raster.expr, lit(false).expr, lit(false).expr, expressionConfig)) + def rst_pixelcount(raster: Column, countNoData: Column): Column = ColumnAdapter(RST_PixelCount(raster.expr, countNoData.expr, lit(false).expr, expressionConfig)) + def rst_pixelcount(raster: Column, countNoData: Column, countAll: Column): Column = ColumnAdapter(RST_PixelCount(raster.expr, countNoData.expr, countAll.expr, expressionConfig)) def rst_combineavg(rasterArray: Column): Column = ColumnAdapter(RST_CombineAvg(rasterArray.expr, expressionConfig)) def rst_derivedband(raster: Column, pythonFunc: Column, funcName: Column): Column = ColumnAdapter(RST_DerivedBand(raster.expr, pythonFunc.expr, funcName.expr, expressionConfig)) diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCountBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCountBehaviors.scala index 87582df1f..c6a076592 100644 --- a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCountBehaviors.scala +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_PixelCountBehaviors.scala @@ -31,8 +31,9 @@ trait RST_PixelCountBehaviors extends QueryTest { .withColumn("tile", rst_tessellate($"tile", lit(3))) .createOrReplaceTempView("source") + // TODO: modified to 3 args... should this be revisited? noException should be thrownBy spark.sql(""" - |select rst_pixelcount(tile) from source + |select rst_pixelcount(tile,false,false) from source |""".stripMargin) noException should be thrownBy rastersInMemory From 6868e0a4a65bd506150741a73d75c5d31c81c26e Mon Sep 17 00:00:00 2001 From: sllynn Date: Tue, 24 Sep 2024 16:08:38 +0200 Subject: [PATCH 06/12] fixed failing python test --- python/mosaic/api/raster.py | 27 ++++++++++++++------------- python/test/test_raster_functions.py | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index a53b7988b..b0eeb0a99 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -68,7 +68,7 @@ "rst_tessellate", "rst_transform", "rst_tooverlappingtiles", - "rst_to_overlapping_tiles", # <- deprecated + "rst_to_overlapping_tiles", # <- deprecated "rst_type", "rst_to_overlapping_tiles", "rst_tryopen", @@ -755,7 +755,9 @@ def rst_numbands(raster_tile: ColumnOrName) -> Column: ) -def rst_pixelcount(raster_tile: ColumnOrName, count_nodata: Any = False, count_all: Any = False) -> Column: +def rst_pixelcount( + raster_tile: ColumnOrName, count_nodata: Any = False, count_all: Any = False +) -> Column: """ Parameters ---------- @@ -776,7 +778,7 @@ def rst_pixelcount(raster_tile: ColumnOrName, count_nodata: Any = False, count_a count_nodata = lit(count_nodata) if type(count_all) == bool: - count_all = lit(count_all) + count_all = lit(count_all) return config.mosaic_context.invoke_function( "rst_pixelcount", @@ -1366,10 +1368,10 @@ def rst_tessellate(raster_tile: ColumnOrName, resolution: ColumnOrName) -> Colum def rst_tooverlappingtiles( - raster_tile: ColumnOrName, - width: ColumnOrName, - height: ColumnOrName, - overlap: ColumnOrName, + raster_tile: ColumnOrName, + width: ColumnOrName, + height: ColumnOrName, + overlap: ColumnOrName, ) -> Column: """ Tiles the raster into tiles of the given size. @@ -1388,12 +1390,11 @@ def rst_tooverlappingtiles( def rst_to_overlapping_tiles( - raster_tile: ColumnOrName, - width: ColumnOrName, - height: ColumnOrName, - overlap: ColumnOrName, - ) -> Column: - + raster_tile: ColumnOrName, + width: ColumnOrName, + height: ColumnOrName, + overlap: ColumnOrName, +) -> Column: return rst_tooverlappingtiles(raster_tile, width, height, overlap) diff --git a/python/test/test_raster_functions.py b/python/test/test_raster_functions.py index 68288be69..a5f27fed9 100644 --- a/python/test/test_raster_functions.py +++ b/python/test/test_raster_functions.py @@ -59,7 +59,7 @@ def test_raster_scalar_functions(self): .withColumn( "rst_mapalgebra", api.rst_mapalgebra( - array("tile", "rst_initnodata"), + array("tile_from_file", "rst_initnodata"), lit('{"calc": "A+B", "A_index": 0, "B_index": 1}'), ), ) From 3e0b9a7562e4a1ef72d449f0d4b7e45c95c88ab9 Mon Sep 17 00:00:00 2001 From: sllynn Date: Tue, 24 Sep 2024 21:35:26 +0200 Subject: [PATCH 07/12] added rst_write expression for writing rasters --- .../tests/testthat/testRasterFunctions.R | 1 + .../tests/testthat/testRasterFunctions.R | 1 + docs/source/api/raster-functions.rst | 57 +++++++++ python/mosaic/api/raster.py | 23 ++++ python/test/test_raster_functions.py | 1 + .../labs/mosaic/core/raster/api/GDAL.scala | 7 +- .../mosaic/expressions/raster/RST_Write.scala | 113 ++++++++++++++++++ .../labs/mosaic/functions/MosaicContext.scala | 6 +- .../labs/mosaic/utils/FileUtils.scala | 68 ++++++++++- .../raster/RST_WriteBehaviors.scala | 81 +++++++++++++ .../expressions/raster/RST_WriteTest.scala | 13 ++ 11 files changed, 366 insertions(+), 5 deletions(-) create mode 100644 src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Write.scala create mode 100644 src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteBehaviors.scala create mode 100644 src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteTest.scala diff --git a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R index c27b9ec46..a158275b2 100644 --- a/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R +++ b/R/sparkR-mosaic/sparkrMosaic/tests/testthat/testRasterFunctions.R @@ -43,6 +43,7 @@ test_that("scalar raster functions behave as intended", { sdf <- withColumn(sdf, "rst_worldtorastercoordx", rst_worldtorastercoordx(column("tile"), lit(0.0), lit(0.0))) sdf <- withColumn(sdf, "rst_worldtorastercoordy", rst_worldtorastercoordy(column("tile"), lit(0.0), lit(0.0))) sdf <- withColumn(sdf, "rst_worldtorastercoord", rst_worldtorastercoord(column("tile"), lit(0.0), lit(0.0))) + sdf <- withColumn(sdf, "rst_write", rst_write(column("tile"), lit("/tmp/mosaic_tmp/"))) expect_no_error(write.df(sdf, source = "noop", mode = "overwrite")) }) diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R index c60592e03..a2a783f53 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R @@ -71,6 +71,7 @@ test_that("scalar raster functions behave as intended", { mutate(rst_worldtorastercoordx = rst_worldtorastercoordx(tile, as.double(0.0), as.double(0.0))) %>% mutate(rst_worldtorastercoordy = rst_worldtorastercoordy(tile, as.double(0.0), as.double(0.0))) %>% mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0))) + mutate(rst_write = rst_write(tile, "/tmp/mosaic_tmp")) expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) }) diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst index 1a38f64a1..194ecca00 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -3580,3 +3580,60 @@ rst_worldtorastercoordy +------------------------------------------------------------------------------------------------------------------+ | 997 | +------------------------------------------------------------------------------------------------------------------+ + +rst_write +********* + +.. function:: rst_write(input, dir) + + Writes raster tiles from the input column to a specified directory. + + :param input: A column containing the raster tile. + :type input: Column + :param dir: The directory, e.g. fuse, to write the tile's raster as file. + :type dir: Column(StringType) + :rtype: Column: RasterTileType + +.. note:: + **Notes** + - Use :code:`RST_Write` to save a 'tile' column to a specified directory (e.g. fuse) location using its + already populated GDAL driver and tile information. + - Useful for formalizing the tile 'path' when writing a Lakehouse table. An example might be to turn on checkpointing + for internal data pipeline phase operations in which multiple interim tiles are populated, but at the end of the phase + use this function to set the final path to be used in the phase's persisted table. Then, you are free to delete + the internal tiles that accumulated in the configured checkpointing directory. +.. + + :example: + +.. tabs:: + .. code-tab:: py + + df.select(rst_write("tile", ).alias("tile")).limit(1).display() + +------------------------------------------------------------------------+ + | tile | + +------------------------------------------------------------------------+ + | {"index_id":null,"tile":"","metadata":{ | + | "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} | + +------------------------------------------------------------------------+ + + .. code-tab:: scala + + df.select(rst_write(col("tile", )).as("tile)).limit(1).show + +------------------------------------------------------------------------+ + | tile | + +------------------------------------------------------------------------+ + | {"index_id":null,"tile":"","metadata":{ | + | "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} | + +------------------------------------------------------------------------+ + + .. code-tab:: sql + + SELECT rst_write(tile, ) as tile FROM table LIMIT 1 + +------------------------------------------------------------------------+ + | tile | + +------------------------------------------------------------------------+ + | {"index_id":null,"tile":"","metadata":{ | + | "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} | + +------------------------------------------------------------------------+ + diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index b0eeb0a99..464e945fe 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -79,6 +79,7 @@ "rst_worldtorastercoord", "rst_worldtorastercoordx", "rst_worldtorastercoordy", + "rst_write", ] @@ -1634,3 +1635,25 @@ def rst_worldtorastercoordy( pyspark_to_java_column(x), pyspark_to_java_column(y), ) + + +def rst_write(raster_tile: ColumnOrName, path: ColumnOrName) -> Column: + """ + Write rasters to the given director `path`. + + Parameters + ---------- + raster_tile : Column (RasterTileType) + Mosaic raster tile struct column. + path : Column (StringType) + The path to write the raster. + + Returns + ------- + Column (RasterTileType) + The raster with an updated location at `path`. + + """ + return config.mosaic_context.invoke_function( + "rst_write", pyspark_to_java_column(raster_tile), pyspark_to_java_column(path) + ) diff --git a/python/test/test_raster_functions.py b/python/test/test_raster_functions.py index a5f27fed9..30e951029 100644 --- a/python/test/test_raster_functions.py +++ b/python/test/test_raster_functions.py @@ -115,6 +115,7 @@ def test_raster_scalar_functions(self): "rst_worldtorastercoord", api.rst_worldtorastercoord("tile", lit(0.0), lit(0.0)), ) + .withColumn("rst_write", api.rst_write("tile", lit("/tmp/mosaic_tmp/"))) ) result.write.format("noop").mode("overwrite").save() self.assertEqual(result.count(), 1) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala index 008717ee1..d40ff6231 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala @@ -186,10 +186,13 @@ object GDAL { ) } - private def writeRasterString(raster: MosaicRasterGDAL): UTF8String = { + def writeRasterString(raster: MosaicRasterGDAL, path: Option[String] = None): UTF8String = { val uuid = UUID.randomUUID().toString val extension = GDAL.getExtension(raster.getDriversShortName) - val writePath = s"${getCheckpointPath}/$uuid.$extension" + val writePath = path match { + case Some(p) => p + case None => s"${getCheckpointPath}/$uuid.$extension" + } val outPath = raster.writeToPath(writePath) RasterCleaner.dispose(raster) UTF8String.fromString(outPath) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Write.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Write.scala new file mode 100644 index 000000000..b314d5099 --- /dev/null +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Write.scala @@ -0,0 +1,113 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.raster.api.GDAL +import com.databricks.labs.mosaic.core.raster.gdal.MosaicRasterGDAL +import com.databricks.labs.mosaic.core.types.RasterTileType +import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile +import com.databricks.labs.mosaic.expressions.base.WithExpressionInfo +import com.databricks.labs.mosaic.expressions.raster.base.Raster1ArgExpression +import com.databricks.labs.mosaic.functions.MosaicExpressionConfig +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, NullIntolerant} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +import scala.util.Try + +/** + * Writes raster tiles from the input column to a specified directory. + * - expects the driver to already have been set on the inputExpr ("tile" + * column). + * @param inputExpr + * The expression for the raster. If the raster is stored on disc, the path + * to the raster is provided. If the raster is stored in memory, the bytes of + * the raster are provided. + * @param dirExpr + * Write to directory. + * @param expressionConfig + * Additional arguments for the expression (expressionConfigs). + */ +case class RST_Write( + inputExpr: Expression, + dirExpr: Expression, + expressionConfig: MosaicExpressionConfig +) extends Raster1ArgExpression[RST_Write]( + inputExpr, + dirExpr, + returnsRaster = true, + expressionConfig = expressionConfig + ) + with NullIntolerant + with CodegenFallback { + + // serialize data type + override def dataType: DataType = { + require(dirExpr.isInstanceOf[Literal]) + RasterTileType(expressionConfig.getCellIdType, StringType, useCheckpoint = false) + } + + /** + * write a raster to dir. + * + * @param tile + * The raster to be used. + * @param arg1 + * The dir. + * @return + * tile using the new path + */ + override def rasterTransform(tile: MosaicRasterTile, arg1: Any): Any = { + tile.copy( + raster = copyToArg1Dir(tile, arg1) + ) + } + + private def copyToArg1Dir(inTile: MosaicRasterTile, arg1: Any): MosaicRasterGDAL = { + val inRaster = inTile.getRaster + val inPath = inRaster.createInfo("path") + val inDriver = inRaster.createInfo("driver") + val outPath = GDAL.writeRasterString( + inRaster, + Some(arg1.asInstanceOf[UTF8String].toString) + ) + .toString + + MosaicRasterGDAL.readRaster( + Map("path" -> outPath, "driver" -> inDriver, "parentPath" -> inPath) + ) + } + +} + +/** Expression info required for the expression registration for spark SQL. */ +object RST_Write extends WithExpressionInfo { + + override def name: String = "rst_write" + + override def usage: String = + """ + |_FUNC_(expr1) - Returns a new raster written to the specified directory. + |""".stripMargin + + override def example: String = + """ + | Examples: + | > SELECT _FUNC_(raster_tile, fuse_dir); + | {index_id, raster, parent_path, driver} + | ... + | """.stripMargin + + override def builder(expressionConfig: MosaicExpressionConfig): FunctionBuilder = { (children: Seq[Expression]) => + { + def checkDir(dir: Expression) = Try(dir.eval().asInstanceOf[String]).isSuccess + + children match { + // Note type checking only works for literals + case Seq(input, dir) if checkDir(dir) => RST_Write(input, dir, expressionConfig) + case _ => RST_Write(children.head, children(1), expressionConfig) + } + } + } + +} diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala index 809e348c1..0899d03f8 100644 --- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala +++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala @@ -340,6 +340,7 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends mosaicRegistry.registerExpression[RST_WorldToRasterCoord](expressionConfig) mosaicRegistry.registerExpression[RST_WorldToRasterCoordX](expressionConfig) mosaicRegistry.registerExpression[RST_WorldToRasterCoordY](expressionConfig) + mosaicRegistry.registerExpression[RST_Write](expressionConfig) /** Aggregators */ registry.registerFunction( @@ -833,7 +834,10 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends ColumnAdapter(RST_WorldToRasterCoordY(raster.expr, x.expr, y.expr, expressionConfig)) def rst_worldtorastercoordy(raster: Column, x: Double, y: Double): Column = ColumnAdapter(RST_WorldToRasterCoordY(raster.expr, lit(x).expr, lit(y).expr, expressionConfig)) - + def rst_write(raster: Column, path: Column): Column = + ColumnAdapter(RST_Write(raster.expr, path.expr, expressionConfig)) + def rst_write(raster: Column, path: String): Column = + ColumnAdapter(RST_Write(raster.expr, lit(path).expr, expressionConfig)) /** Aggregators */ def st_asgeojsontile_agg(geom: Column, attributes: Column): Column = diff --git a/src/main/scala/com/databricks/labs/mosaic/utils/FileUtils.scala b/src/main/scala/com/databricks/labs/mosaic/utils/FileUtils.scala index 0a881d785..f73a27cbe 100644 --- a/src/main/scala/com/databricks/labs/mosaic/utils/FileUtils.scala +++ b/src/main/scala/com/databricks/labs/mosaic/utils/FileUtils.scala @@ -1,7 +1,10 @@ package com.databricks.labs.mosaic.utils -import java.io.{BufferedInputStream, FileInputStream} -import java.nio.file.{Files, Paths} +import java.io.{BufferedInputStream, File, FileInputStream, IOException} +import java.nio.file.attribute.BasicFileAttributes +import java.nio.file._ +import java.util.concurrent.atomic.AtomicInteger +import scala.util.Try object FileUtils { @@ -32,4 +35,65 @@ object FileUtils { tempDir.toFile.getAbsolutePath } + /** Delete provided path (only deletes empty dirs). */ + def tryDeleteFileOrDir(path: Path): Boolean = { + Try(Files.delete(path)).isSuccess + } + + /** + * Delete files recursively (no conditions). + * + * @param root + * May be a directory or a file. + * @param keepRoot + * If true, avoid deleting the root dir itself. + */ + def deleteRecursively(root: Path, keepRoot: Boolean): Unit = { + + Files.walkFileTree(root, new SimpleFileVisitor[Path] { + + val handles = new AtomicInteger(0) + + override def visitFile(file: Path, attributes: BasicFileAttributes): FileVisitResult = { + synchronized { + val numHandles = handles.incrementAndGet() + if (numHandles >= 100000) { + //scalastyle:off println + println(s"FileUtils - deleteRecursively -> attempting gc at next 100K+ handles detected ($numHandles) ...") + handles.set(0) + Try(System.gc()) + println(s"FileUtils - deleteRecursively -> gc complete ...") + //scalastyle:on println + } + } + + tryDeleteFileOrDir(file) + FileVisitResult.CONTINUE + } + + override def postVisitDirectory(dir: Path, exception: IOException): FileVisitResult = { + if ((!keepRoot || dir.compareTo(root) != 0) && isEmptyDir(dir)) { + tryDeleteFileOrDir(dir) + } + FileVisitResult.CONTINUE + } + }) + } + + def deleteRecursively(root: File, keepRoot: Boolean): Unit = { + deleteRecursively(root.toPath, keepRoot) + } + + def deleteRecursively(path: String, keepRoot: Boolean): Unit = { + deleteRecursively(Paths.get(path), keepRoot) + } + + def isEmptyDir(dir: Path): Boolean = { + if (Files.exists(dir) && Files.isDirectory(dir)) { + !Files.list(dir).findAny().isPresent + } else { + false + } + } + } diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteBehaviors.scala new file mode 100644 index 000000000..e2dee079b --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteBehaviors.scala @@ -0,0 +1,81 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI +import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.functions.MosaicContext +import com.databricks.labs.mosaic.utils.FileUtils +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.scalatest.matchers.should.Matchers.{be, convertToAnyShouldWrapper} + +import java.nio.file.{Files, Paths} +import scala.util.Try + + +trait RST_WriteBehaviors extends QueryTest { + + // noinspection MapGetGet + def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + val sc = this.spark + import sc.implicits._ + sc.sparkContext.setLogLevel("ERROR") + + // init + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register(sc) + import mc.functions._ + + val writeDir = "/tmp/mosaic_tmp/write-tile" + val writeDirJava = Paths.get(writeDir) + Try(FileUtils.deleteRecursively(writeDir, keepRoot = false)) + Files.createDirectories(writeDirJava) + Files.list(Paths.get(writeDir)).count() should be (0) + + val rasterDf = spark.read + .format("binaryFile") + .option("pathGlobFilter", "*.TIF") + .load("src/test/resources/modis") + + // test write path tiles (scala for this) + val gridTiles1 = rasterDf + .withColumn("tile", rst_maketiles($"path")) + .filter(!rst_isempty($"tile")) + .select(rst_write($"tile", writeDir)) + .first() + .asInstanceOf[GenericRowWithSchema].get(0) + + val createInfo1 = gridTiles1.asInstanceOf[GenericRowWithSchema].getAs[Map[String, String]](2) + val path1Java = Paths.get(createInfo1("path")) + + Files.list(path1Java.getParent).count() should be (1) + Try(FileUtils.deleteRecursively(writeDir, keepRoot = false)) + Files.createDirectories(writeDirJava) + Files.list(Paths.get(writeDir)).count() should be (0) + + // test write content tiles (sql for this) + rasterDf.createOrReplaceTempView("source") + + val gridTilesSQL = spark + .sql( + s""" + |with subquery as ( + | select rst_maketiles(content, 'GTiff', -1) as tile from source + |) + |select rst_write(tile, '$writeDir') as result + |from subquery + |where not rst_isempty(tile) + |""".stripMargin) + .first() + .asInstanceOf[GenericRowWithSchema].get(0) + + val createInfo2 = gridTilesSQL.asInstanceOf[GenericRowWithSchema].getAs[Map[String, String]](2) + val path2Java = Paths.get(createInfo2("path")) + + // should equal 2: original file plus file written during checkpointing + Files.list(path2Java.getParent).count() should be (2) + Try(FileUtils.deleteRecursively(writeDir, keepRoot = false)) + Files.createDirectories(writeDirJava) + Files.list(Paths.get(writeDir)).count() should be (0) + } + +} diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteTest.scala new file mode 100644 index 000000000..537cf68c9 --- /dev/null +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_WriteTest.scala @@ -0,0 +1,13 @@ +package com.databricks.labs.mosaic.expressions.raster + +import com.databricks.labs.mosaic.core.geometry.api.JTS +import com.databricks.labs.mosaic.core.index.H3IndexSystem +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.test.SharedSparkSessionGDAL + +class RST_WriteTest extends QueryTest with SharedSparkSessionGDAL with RST_WriteBehaviors { + test("write raster tiles") { + behaviors(H3IndexSystem, JTS) + } + +} From 63b96be489e2d27e2944741a4926f6df4dd89de5 Mon Sep 17 00:00:00 2001 From: sllynn Date: Wed, 25 Sep 2024 11:32:07 +0200 Subject: [PATCH 08/12] fixed R test --- .../sparklyrMosaic/tests/testthat/testRasterFunctions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R index a2a783f53..8c1b4d334 100644 --- a/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R +++ b/R/sparklyr-mosaic/sparklyrMosaic/tests/testthat/testRasterFunctions.R @@ -70,7 +70,7 @@ test_that("scalar raster functions behave as intended", { mutate(rst_width = rst_width(tile)) %>% mutate(rst_worldtorastercoordx = rst_worldtorastercoordx(tile, as.double(0.0), as.double(0.0))) %>% mutate(rst_worldtorastercoordy = rst_worldtorastercoordy(tile, as.double(0.0), as.double(0.0))) %>% - mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0))) + mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0))) %>% mutate(rst_write = rst_write(tile, "/tmp/mosaic_tmp")) expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite")) From bed01dedab3356b286ef5086df20b1ccffada34a Mon Sep 17 00:00:00 2001 From: sllynn Date: Wed, 25 Sep 2024 14:27:20 +0200 Subject: [PATCH 09/12] added cutline option to RST_Clip --- docs/source/api/raster-functions.rst | 19 ++++++-- python/mosaic/api/raster.py | 16 ++++--- .../core/raster/gdal/MosaicRasterGDAL.scala | 11 ++++- .../operator/clip/RasterClipByVector.scala | 20 +++++++-- .../raster/operator/clip/VectorClipper.scala | 3 +- .../mosaic/expressions/raster/RST_Clip.scala | 42 ++++++++++++----- .../labs/mosaic/functions/MosaicContext.scala | 4 +- .../raster/RST_ClipBehaviors.scala | 45 ++++++++++++++++++- .../expressions/raster/RST_ClipTest.scala | 11 ++++- 9 files changed, 142 insertions(+), 29 deletions(-) diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst index 194ecca00..ed3f5a1cd 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -193,7 +193,7 @@ rst_boundingbox rst_clip ******** -.. function:: rst_clip(tile, geometry) +.. function:: rst_clip(tile, geometry, cutline_all_touched) Clips :code:`tile` with :code:`geometry`, provided in a supported encoding (WKB, WKT or GeoJSON). @@ -201,15 +201,26 @@ rst_clip :type tile: Column (RasterTileType) :param geometry: A column containing the geometry to clip the raster to. :type geometry: Column (GeometryType) + :param cutline_all_touched: A column to specify pixels boundary behavior. + :type cutline_all_touched: Column (BooleanType) :rtype: Column: RasterTileType .. note:: - Notes + **Notes** - :code:`geometry` is expected to be: - - in the same coordinate reference system as the raster. + The :code:`geometry` parameter is expected to be: + - expressed in the same coordinate reference system as the raster. - a polygon or a multipolygon. + The :code:`cutline_all_touched` parameter: + - Optional: default is true. this is a GDAL warp config for the operation. + - If set to true, the pixels touching the geometry are included in the clip, + regardless of half-in or half-out; this is important for tessellation behaviors. + - If set to false, only at least half-in pixels are included in the clip. + + The actual GDAL command to clip looks something like the following (after some setup): + :code:`"gdalwarp -wo CUTLINE_ALL_TOUCHED= -cutline -crop_to_cutline"` + The output raster tiles will have: - the same extent as the input geometry. - the same number of bands as the input raster. diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index 464e945fe..a4a051179 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -147,18 +147,20 @@ def rst_boundingbox(raster_tile: ColumnOrName) -> Column: ) -def rst_clip(raster_tile: ColumnOrName, geometry: ColumnOrName) -> Column: +def rst_clip(raster_tile: ColumnOrName, geometry: ColumnOrName, cutline_all_touched: Any = True) -> Column: """ - Clips the raster to the given supported geometry (WKT, WKB, GeoJSON). - The result is Mosaic raster tile struct column to the clipped raster. - The result is stored in the checkpoint directory. + Clips `raster_tile` to the given supported `geometry` (WKT, WKB, GeoJSON). + The result is a Mosaic raster tile representing the clipped raster. Parameters ---------- raster_tile : Column (RasterTileType) Mosaic raster tile struct column. geometry : Column (StringType) - The geometry to clip the raster to. + The geometry to clip the tile to. + cutline_all_touched : Column (BooleanType) + optional override to specify whether any pixels touching + cutline should be included vs half-in only, default is true Returns ------- @@ -166,10 +168,14 @@ def rst_clip(raster_tile: ColumnOrName, geometry: ColumnOrName) -> Column: Mosaic raster tile struct column. """ + if type(cutline_all_touched) == bool: + cutline_all_touched = lit(cutline_all_touched) + return config.mosaic_context.invoke_function( "rst_clip", pyspark_to_java_column(raster_tile), pyspark_to_java_column(geometry), + pyspark_to_java_column(cutline_all_touched) ) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala index b81958cd2..b989b4eee 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala @@ -81,6 +81,13 @@ case class MosaicRasterGDAL( val gt = getGeoTransform val sourceCRS = getSpatialReference + + val destCRSEPSGCode = (destCRS.GetAuthorityName(null), destCRS.GetAuthorityCode(null)) match { + case (null, _) => 0 + case (name: String, code: String) if name == "EPSG" => code.toInt + case _ => 0 + } + val transform = new osr.CoordinateTransformation(sourceCRS, destCRS) val bbox = geometryAPI.geometry( @@ -96,7 +103,9 @@ case class MosaicRasterGDAL( val geom1 = org.gdal.ogr.ogr.CreateGeometryFromWkb(bbox.toWKB) geom1.Transform(transform) - geometryAPI.geometry(geom1.ExportToWkb(), "WKB") + val mosaicGeom = geometryAPI.geometry(geom1.ExportToWkb(), "WKB") + mosaicGeom.setSpatialReference(destCRSEPSGCode) + mosaicGeom } /** @return The diagonal size of a raster. */ diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala index ddae2fed2..9468426e3 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/RasterClipByVector.scala @@ -8,6 +8,8 @@ import com.databricks.labs.mosaic.core.raster.operator.gdal.GDALWarp import com.databricks.labs.mosaic.utils.PathUtils import org.gdal.osr.SpatialReference +import java.util.Locale + /** * RasterClipByVector is an object that defines the interface for clipping a * raster by a vector geometry. @@ -31,25 +33,35 @@ object RasterClipByVector { * The geometry CRS. * @param geometryAPI * The geometry API. + * @param cutlineAllTouched + * Whether pixels touching cutline are automatically included (true) or + * included only the cutline crosses the centre point of the pixel + * (false) default: true. * @return * A clipped raster. */ - def clip(raster: MosaicRasterGDAL, geometry: MosaicGeometry, geomCRS: SpatialReference, geometryAPI: GeometryAPI): MosaicRasterGDAL = { + def clip( + raster: MosaicRasterGDAL, + geometry: MosaicGeometry, + geomCRS: SpatialReference, + geometryAPI: GeometryAPI, + cutlineAllTouched: Boolean = true + ): MosaicRasterGDAL = { val rasterCRS = raster.getSpatialReference val outShortName = raster.getDriversShortName val geomSrcCRS = if (geomCRS == null) rasterCRS else geomCRS + val cutlineToken = cutlineAllTouched.toString.toUpperCase(Locale.US) + val resultFileName = PathUtils.createTmpFilePath(GDAL.getExtension(outShortName)) val shapeFileName = VectorClipper.generateClipper(geometry, geomSrcCRS, rasterCRS, geometryAPI) // For -wo consult https://gdal.org/doxygen/structGDALWarpOptions.html - // SOURCE_EXTRA=3 is used to ensure that when the raster is clipped, the - // pixels that touch the geometry are included. The default is 1, 3 seems to be a good empirical value. val result = GDALWarp.executeWarp( resultFileName, Seq(raster), - command = s"gdalwarp -wo CUTLINE_ALL_TOUCHED=TRUE -cutline $shapeFileName -crop_to_cutline" + command = s"gdalwarp -wo CUTLINE_ALL_TOUCHED=$cutlineToken -cutline $shapeFileName -crop_to_cutline" ) VectorClipper.cleanUpClipper(shapeFileName) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala index 41d98fbcf..421155fde 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/clip/VectorClipper.scala @@ -64,8 +64,9 @@ object VectorClipper { val projectedGeom = geometry.osrTransformCRS(srcCrs, dstCrs, geometryAPI) val geom = ogr.CreateGeometryFromWkb(projectedGeom.toWKB(2)) + geom.AssignSpatialReference(dstCrs) - val geomLayer = shpDataSource.CreateLayer("geom") + val geomLayer = shpDataSource.CreateLayer("geom", dstCrs) val idField = new org.gdal.ogr.FieldDefn("id", OFTInteger) geomLayer.CreateField(idField) diff --git a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala index f9ea405f6..6fee89aaa 100644 --- a/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala +++ b/src/main/scala/com/databricks/labs/mosaic/expressions/raster/RST_Clip.scala @@ -5,34 +5,39 @@ import com.databricks.labs.mosaic.core.raster.api.GDAL import com.databricks.labs.mosaic.core.raster.operator.clip.RasterClipByVector import com.databricks.labs.mosaic.core.types.RasterTileType import com.databricks.labs.mosaic.core.types.model.MosaicRasterTile -import com.databricks.labs.mosaic.expressions.base.{GenericExpressionFactory, WithExpressionInfo} -import com.databricks.labs.mosaic.expressions.raster.base.Raster1ArgExpression +import com.databricks.labs.mosaic.expressions.base.WithExpressionInfo +import com.databricks.labs.mosaic.expressions.raster.base.Raster2ArgExpression import com.databricks.labs.mosaic.functions.MosaicExpressionConfig import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback -import org.apache.spark.sql.catalyst.expressions.{Expression, NullIntolerant} +import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, NullIntolerant} +import org.apache.spark.sql.types.BooleanType + +import scala.util.Try /** The expression for clipping a raster by a vector. */ case class RST_Clip( rastersExpr: Expression, geometryExpr: Expression, + cutlineAllTouchedExpr: Expression, expressionConfig: MosaicExpressionConfig -) extends Raster1ArgExpression[RST_Clip]( +) extends Raster2ArgExpression[RST_Clip]( rastersExpr, geometryExpr, + cutlineAllTouchedExpr, returnsRaster = true, expressionConfig = expressionConfig ) with NullIntolerant with CodegenFallback { + val geometryAPI: GeometryAPI = GeometryAPI(expressionConfig.getGeometryAPI) + override def dataType: org.apache.spark.sql.types.DataType = { GDAL.enable(expressionConfig) RasterTileType(expressionConfig.getCellIdType, rastersExpr, expressionConfig.isRasterUseCheckpoint) } - val geometryAPI: GeometryAPI = GeometryAPI(expressionConfig.getGeometryAPI) - /** * Clips a raster by a vector. * @@ -40,14 +45,17 @@ case class RST_Clip( * The raster to be used. * @param arg1 * The vector to be used. + * @param arg2 + * cutline handling (boolean). * @return * The clipped raster. */ - override def rasterTransform(tile: MosaicRasterTile, arg1: Any): Any = { + override def rasterTransform(tile: MosaicRasterTile, arg1: Any, arg2: Any): Any = { val geometry = geometryAPI.geometry(arg1, geometryExpr.dataType) val geomCRS = geometry.getSpatialReferenceOSR + val cutlineAllTouched = arg2.asInstanceOf[Boolean] tile.copy( - raster = RasterClipByVector.clip(tile.getRaster, geometry, geomCRS, geometryAPI) + raster = RasterClipByVector.clip(tile.getRaster, geometry, geomCRS, geometryAPI, cutlineAllTouched) ) } @@ -60,7 +68,7 @@ object RST_Clip extends WithExpressionInfo { override def usage: String = """ - |_FUNC_(expr1) - Returns a raster tile clipped by provided vector. + |_FUNC_(expr1, expr2) - Returns a raster tile clipped by provided vector. |""".stripMargin override def example: String = @@ -72,8 +80,20 @@ object RST_Clip extends WithExpressionInfo { | ... | """.stripMargin - override def builder(expressionConfig: MosaicExpressionConfig): FunctionBuilder = { - GenericExpressionFactory.getBaseBuilder[RST_Clip](2, expressionConfig) + override def builder(exprConfig: MosaicExpressionConfig): FunctionBuilder = { (children: Seq[Expression]) => + { + def checkCutline(cutline: Expression): Boolean = Try(cutline.eval().asInstanceOf[Boolean]).isSuccess + val noCutlineArg = new Literal(true, BooleanType) // default is true for tessellation needs + + children match { + // Note type checking only works for literals + case Seq(input, vector) => + RST_Clip(input, vector, noCutlineArg, exprConfig) + case Seq(input, vector, cutline) if checkCutline(cutline) => + RST_Clip(input, vector, cutline, exprConfig) + case _ => RST_Clip(children.head, children(1), children(2), exprConfig) + } + } } } diff --git a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala index 0899d03f8..0a3c14a1b 100644 --- a/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala +++ b/src/main/scala/com/databricks/labs/mosaic/functions/MosaicContext.scala @@ -700,7 +700,9 @@ class MosaicContext(indexSystem: IndexSystem, geometryAPI: GeometryAPI) extends def rst_bandmetadata(raster: Column, band: Int): Column = ColumnAdapter(RST_BandMetaData(raster.expr, lit(band).expr, expressionConfig)) def rst_boundingbox(raster: Column): Column = ColumnAdapter(RST_BoundingBox(raster.expr, expressionConfig)) - def rst_clip(raster: Column, geometry: Column): Column = ColumnAdapter(RST_Clip(raster.expr, geometry.expr, expressionConfig)) + def rst_clip(raster: Column, geometry: Column): Column = ColumnAdapter(RST_Clip(raster.expr, geometry.expr, lit(true).expr, expressionConfig)) + def rst_clip(raster: Column, geometry: Column, cutline: Boolean): Column = ColumnAdapter(RST_Clip(raster.expr, geometry.expr, lit(cutline).expr, expressionConfig)) + def rst_clip(raster: Column, geometry: Column, cutline: Column): Column = ColumnAdapter(RST_Clip(raster.expr, geometry.expr, cutline.expr, expressionConfig)) def rst_convolve(raster: Column, kernel: Column): Column = ColumnAdapter(RST_Convolve(raster.expr, kernel.expr, expressionConfig)) def rst_dtmfromgeoms(pointsArray: Column, linesArray: Column, mergeTol: Column, snapTol: Column, origin: Column, xWidth: Column, yWidth: Column, xSize: Column, ySize: Column): Column = ColumnAdapter(RST_DTMFromGeoms(pointsArray.expr, linesArray.expr, mergeTol.expr, snapTol.expr, origin.expr, xWidth.expr, yWidth.expr, xSize.expr, ySize.expr, expressionConfig)) diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipBehaviors.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipBehaviors.scala index 9d9328ca6..486f03736 100644 --- a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipBehaviors.scala +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipBehaviors.scala @@ -2,14 +2,17 @@ package com.databricks.labs.mosaic.expressions.raster import com.databricks.labs.mosaic.core.geometry.api.GeometryAPI import com.databricks.labs.mosaic.core.index.IndexSystem +import com.databricks.labs.mosaic.core.raster.gdal.MosaicRasterGDAL import com.databricks.labs.mosaic.functions.MosaicContext +import com.databricks.labs.mosaic.test.mocks.filePath import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.functions.lit import org.scalatest.matchers.should.Matchers._ trait RST_ClipBehaviors extends QueryTest { // noinspection MapGetGet - def behaviors(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + def basicBehaviour(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { spark.sparkContext.setLogLevel("ERROR") val mc = MosaicContext.build(indexSystem, geometryAPI) mc.register() @@ -47,4 +50,44 @@ trait RST_ClipBehaviors extends QueryTest { } + def cutlineBehaviour(indexSystem: IndexSystem, geometryAPI: GeometryAPI): Unit = { + spark.sparkContext.setLogLevel("ERROR") + val mc = MosaicContext.build(indexSystem, geometryAPI) + mc.register() + val sc = spark + import mc.functions._ + import sc.implicits._ + + val testPath = filePath("/binary/geotiff-small/chicago_sp27.tif") + + val createInfo = Map("path" -> testPath, "parentPath" -> testPath) + val testRaster = MosaicRasterGDAL.readRaster(createInfo) + + val ftMeters = 0.3 // ~0.3 ft in meter + val ftUnits = 0.3 // epsg:26771 0.3 ft per unit + val buffVal: Double = testRaster.pixelXSize * ftMeters * ftUnits * 50.5 + val clipRegion = testRaster + .bbox(geometryAPI, testRaster.getSpatialReference) + .getCentroid + .buffer(buffVal) + + clipRegion.getSpatialReference shouldBe 26771 + + val df = spark.read.format("gdal").load(testPath) + .withColumn("clipGeom", st_geomfromgeojson(lit(clipRegion.toJSON))) + + val df_include = df + .withColumn("clippedRaster", rst_clip($"tile", $"clipGeom")) + .select(rst_pixelcount($"clippedRaster").alias("pCount")) + val df_exclude = df + .withColumn("clippedRaster", rst_clip($"tile", $"clipGeom", lit(false))) + .select(rst_pixelcount($"clippedRaster").alias("pCount")) + + val pCountInclude = df_include.first.getSeq[Long](0).head + val pCountExclude = df_exclude.first.getSeq[Long](0).head + + pCountInclude should be > pCountExclude + + } + } diff --git a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipTest.scala b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipTest.scala index e3597141d..ddc974715 100644 --- a/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipTest.scala +++ b/src/test/scala/com/databricks/labs/mosaic/expressions/raster/RST_ClipTest.scala @@ -25,7 +25,16 @@ class RST_ClipTest extends QueryTest with SharedSparkSessionGDAL with RST_ClipBe test("Testing RST_Clip with manual GDAL registration (H3, JTS).") { noCodegen { assume(System.getProperty("os.name") == "Linux") - behaviors(H3IndexSystem, JTS) + basicBehaviour(H3IndexSystem, JTS) + } + } + + // These tests are not index system nor geometry API specific. + // Only testing one pairing is sufficient. + test("Testing RST_Clip with different cutline options with manual GDAL registration (H3, JTS).") { + noCodegen { + assume(System.getProperty("os.name") == "Linux") + cutlineBehaviour(H3IndexSystem, JTS) } } From a9f43fd5dc20711b19f57918a059cada9a47819e Mon Sep 17 00:00:00 2001 From: sllynn Date: Wed, 25 Sep 2024 14:40:23 +0200 Subject: [PATCH 10/12] minor docs tweak --- docs/source/api/raster-functions.rst | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/docs/source/api/raster-functions.rst b/docs/source/api/raster-functions.rst index ed3f5a1cd..c76f98d58 100644 --- a/docs/source/api/raster-functions.rst +++ b/docs/source/api/raster-functions.rst @@ -207,26 +207,29 @@ rst_clip .. note:: **Notes** + Geometry input + The :code:`geometry` parameter is expected to be a polygon or a multipolygon. - The :code:`geometry` parameter is expected to be: - - expressed in the same coordinate reference system as the raster. - - a polygon or a multipolygon. + Cutline handling + The :code:`cutline_all_touched` parameter: - The :code:`cutline_all_touched` parameter: - - Optional: default is true. this is a GDAL warp config for the operation. + - Optional: default is true. This is a GDAL warp config for the operation. - If set to true, the pixels touching the geometry are included in the clip, regardless of half-in or half-out; this is important for tessellation behaviors. - If set to false, only at least half-in pixels are included in the clip. + - More information can be found `here `__ - The actual GDAL command to clip looks something like the following (after some setup): + The actual GDAL command employed to perform the clipping operation is as follows: :code:`"gdalwarp -wo CUTLINE_ALL_TOUCHED= -cutline -crop_to_cutline"` - The output raster tiles will have: - - the same extent as the input geometry. - - the same number of bands as the input raster. - - the same pixel data type as the input raster. - - the same pixel size as the input raster. - - the same coordinate reference system as the input raster. + Output + Output raster tiles will have: + + - the same extent as the input geometry. + - the same number of bands as the input raster. + - the same pixel data type as the input raster. + - the same pixel size as the input raster. + - the same coordinate reference system as the input raster. .. :example: From f91610ba6cfa210299f2f66768804542a5865141 Mon Sep 17 00:00:00 2001 From: sllynn Date: Wed, 25 Sep 2024 15:28:19 +0200 Subject: [PATCH 11/12] `MosaicRasterGDAL` does not need `memsize` at instantiation. Will be estimated from band data types and sizes if not known --- .../labs/mosaic/core/raster/api/GDAL.scala | 4 +- .../raster/gdal/MosaicRasterBandGDAL.scala | 31 +++++++++++++++ .../core/raster/gdal/MosaicRasterGDAL.scala | 37 ++++++++---------- .../raster/operator/gdal/GDALBuildVRT.scala | 2 +- .../raster/operator/gdal/GDALTranslate.scala | 2 +- .../core/raster/operator/gdal/GDALWarp.scala | 2 +- .../mosaic/core/raster/TestRasterGDAL.scala | 39 +++++++++++++------ 7 files changed, 81 insertions(+), 36 deletions(-) diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala index d40ff6231..bc358d19d 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/api/GDAL.scala @@ -120,7 +120,7 @@ object GDAL { inputDT: DataType ): MosaicRasterGDAL = { if (inputRaster == null) { - MosaicRasterGDAL(null, createInfo, -1) + MosaicRasterGDAL(null, createInfo) } else { inputDT match { case _: StringType => @@ -153,7 +153,7 @@ object GDAL { val zippedPath = s"/vsizip/$parentPath" MosaicRasterGDAL.readRaster(bytes, createInfo + ("path" -> zippedPath)) } catch { - case _: Throwable => MosaicRasterGDAL(null, createInfo, -1) + case _: Throwable => MosaicRasterGDAL(null, createInfo) } } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala index 91acd50bb..ffd32e109 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterBandGDAL.scala @@ -87,6 +87,37 @@ case class MosaicRasterBandGDAL(band: Band, id: Int) { case _ => "Unknown" } + /** + * @return + * Returns the estimated number of bytes in each pixel. + */ + def dataTypeBytes: Int = { + val pixelMemSizeMap = Map( + gdalconstConstants.GDT_Byte -> 1, + gdalconstConstants.GDT_UInt16 -> 2, + gdalconstConstants.GDT_Int16 -> 2, + gdalconstConstants.GDT_UInt32 -> 4, + gdalconstConstants.GDT_Int32 -> 4, + gdalconstConstants.GDT_Float32 -> 4, + gdalconstConstants.GDT_Float64 -> 8, + gdalconstConstants.GDT_CInt16 -> 2, + gdalconstConstants.GDT_CInt32 -> 4, + gdalconstConstants.GDT_CFloat32 -> 4, + gdalconstConstants.GDT_CFloat64 -> 8 + ) + if (pixelMemSizeMap.contains(dataType)) pixelMemSizeMap(dataType) else 0 + } + + /** + * @return + * Returns the estimated size in memory of the band. + */ + def estimatedSizeInMem: Int = { + val pixelMemSize = dataTypeBytes + val pixelCount = xSize * ySize + pixelMemSize * pixelCount + } + /** * @return * Returns the band's x size. diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala index b989b4eee..3ae7ed432 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/gdal/MosaicRasterGDAL.scala @@ -33,8 +33,7 @@ import scala.util.{Failure, Success, Try} //noinspection DuplicatedCode case class MosaicRasterGDAL( raster: Dataset, - createInfo: Map[String, String], - memSize: Long + createInfo: Map[String, String] ) extends RasterWriter with RasterCleaner { @@ -132,16 +131,15 @@ case class MosaicRasterGDAL( * Returns the amount of memory occupied by the file in bytes. */ def getMemSize: Long = { - if (memSize == -1) { - val toRead = if (path.startsWith("/vsizip/")) path.replace("/vsizip/", "") else path - if (Files.notExists(Paths.get(toRead))) { - throw new Exception(s"File not found: ${gdal.GetLastErrorMsg()}") - } + val toRead = if (path.startsWith("/vsizip/")) path.replace("/vsizip/", "") else path + if (Files.exists(Paths.get(toRead))) { Files.size(Paths.get(toRead)) + } else if (getBands.nonEmpty) { + //estimate based on pixels and datatype + getBands.map(_.estimatedSizeInMem).sum } else { - memSize + -1 } - } /** @@ -264,7 +262,7 @@ case class MosaicRasterGDAL( "parentPath" -> parentPath, "driver" -> getDriversShortName ) - MosaicRasterGDAL(newRaster, newCreateInfo, -1) + MosaicRasterGDAL(newRaster, newCreateInfo) } /** @return Returns the raster's SRID. This is the EPSG code of the raster's CRS. */ @@ -329,7 +327,7 @@ case class MosaicRasterGDAL( "driver" -> getDriversShortName ) - val result = MosaicRasterGDAL(outputRaster, newCreateInfo, this.memSize) + val result = MosaicRasterGDAL(outputRaster, newCreateInfo) result.flushCache() } @@ -364,7 +362,7 @@ case class MosaicRasterGDAL( "driver" -> getDriversShortName ) - val result = MosaicRasterGDAL(outputRaster, newCreateInfo, this.memSize) + val result = MosaicRasterGDAL(outputRaster, newCreateInfo) result.flushCache() } @@ -435,7 +433,7 @@ case class MosaicRasterGDAL( else "" } ) - MosaicRasterGDAL(ds, newCreateInfo, -1) + MosaicRasterGDAL(ds, newCreateInfo) } /** @@ -572,7 +570,7 @@ case class MosaicRasterGDAL( * Returns [[MosaicRasterGDAL]]. */ def refresh(): MosaicRasterGDAL = { - MosaicRasterGDAL(pathAsDataset(path), createInfo, memSize) + MosaicRasterGDAL(pathAsDataset(path), createInfo) } /** @@ -754,7 +752,7 @@ object MosaicRasterGDAL extends RasterReader { */ override def readRaster(contentBytes: Array[Byte], createInfo: Map[String, String]): MosaicRasterGDAL = { if (Option(contentBytes).isEmpty || contentBytes.isEmpty) { - MosaicRasterGDAL(null, createInfo, -1) + MosaicRasterGDAL(null, createInfo) } else { // This is a temp UUID for purposes of reading the raster through GDAL from memory // The stable UUID is kept in metadata of the raster @@ -784,12 +782,12 @@ object MosaicRasterGDAL extends RasterReader { if (ds2 == null) { throw new Exception(s"Error reading raster from bytes: ${prompt._3}") } - MosaicRasterGDAL(ds2, createInfo + ("path" -> unzippedPath), contentBytes.length) + MosaicRasterGDAL(ds2, createInfo + ("path" -> unzippedPath)) } else { - MosaicRasterGDAL(ds1, createInfo + ("path" -> readPath), contentBytes.length) + MosaicRasterGDAL(ds1, createInfo + ("path" -> readPath)) } } else { - MosaicRasterGDAL(dataset, createInfo + ("path" -> tmpPath), contentBytes.length) + MosaicRasterGDAL(dataset, createInfo + ("path" -> tmpPath)) } } } @@ -832,8 +830,7 @@ object MosaicRasterGDAL extends RasterReader { Map( "driver" -> driverShortName, "last_error" -> error - ), - -1 + ) ) raster } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala index cb79dc263..60e352263 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALBuildVRT.scala @@ -34,7 +34,7 @@ object GDALBuildVRT { "all_parents" -> rasters.map(_.getParentPath).mkString(";") ) // VRT files are just meta files, mem size doesnt make much sense so we keep -1 - MosaicRasterGDAL(result, createInfo, -1).flushCache() + MosaicRasterGDAL(result, createInfo).flushCache() } } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala index 2fb106fda..cb8609343 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALTranslate.scala @@ -42,7 +42,7 @@ object GDALTranslate { "all_parents" -> raster.getParentPath ) raster - .copy(raster = result, createInfo = createInfo, memSize = size) + .copy(raster = result, createInfo = createInfo) .flushCache() } diff --git a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala index 516560a76..5061f7de4 100644 --- a/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala +++ b/src/main/scala/com/databricks/labs/mosaic/core/raster/operator/gdal/GDALWarp.scala @@ -38,7 +38,7 @@ object GDALWarp { "last_error" -> errorMsg, "all_parents" -> rasters.map(_.getParentPath).mkString(";") ) - rasters.head.copy(raster = result, createInfo = createInfo, memSize = size).flushCache() + rasters.head.copy(raster = result, createInfo = createInfo).flushCache() } } diff --git a/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala b/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala index 642b31281..481b1728c 100644 --- a/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala +++ b/src/test/scala/com/databricks/labs/mosaic/core/raster/TestRasterGDAL.scala @@ -33,12 +33,29 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { resultExecutors.foreach(s => s should include("GDAL")) } -// test("Verify that checkpoint is not used.") { -// spark.conf.get(MOSAIC_TEST_MODE) shouldBe "true" -// MosaicGDAL.isUseCheckpoint shouldBe false -// } -// - test("Read raster metadata from GeoTIFF file.") { + test("Verify memsize handling") { + val createInfo = Map( + "path" -> "no_path", + "parentPath" -> "no_path", + "driver" -> "GTiff" + ) + val null_raster = MosaicRasterGDAL(null, createInfo) + null_raster.getMemSize should be(-1) + + val np_content = spark.read.format("binaryFile") + .load("src/test/resources/modis/MCD43A4.A2018185.h10v07.006.2018194033728_B04.TIF") + .select("content").first.getAs[Array[Byte]](0) + val np_raster = MosaicRasterGDAL.readRaster(np_content, createInfo) +// val np_raster = RasterIO.readRasterHydratedFromContent(np_content, createInfo, getExprConfigOpt) + np_raster.getMemSize > 0 should be(true) + info(s"np_content length? ${np_content.length}") + info(s"np_raster memsize? ${np_raster.getMemSize}") + + null_raster.destroy() + np_raster.destroy() + } + + test("Read tile metadata from GeoTIFF file.") { assume(System.getProperty("os.name") == "Linux") val createInfo = Map( @@ -147,7 +164,7 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { "parentPath" -> "", "driver" -> "GTiff" ) - var result = MosaicRasterGDAL(ds, createInfo, -1).filter(5, "avg").flushCache() + var result = MosaicRasterGDAL(ds, createInfo).filter(5, "avg").flushCache() var resultValues = result.getBand(1).values @@ -174,7 +191,7 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { // mode - result = MosaicRasterGDAL(ds, createInfo, -1).filter(5, "mode").flushCache() + result = MosaicRasterGDAL(ds, createInfo).filter(5, "mode").flushCache() resultValues = result.getBand(1).values @@ -227,7 +244,7 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { // median - result = MosaicRasterGDAL(ds, createInfo, -1).filter(5, "median").flushCache() + result = MosaicRasterGDAL(ds, createInfo).filter(5, "median").flushCache() resultValues = result.getBand(1).values @@ -266,7 +283,7 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { // min filter - result = MosaicRasterGDAL(ds, createInfo, -1).filter(5, "min").flushCache() + result = MosaicRasterGDAL(ds, createInfo).filter(5, "min").flushCache() resultValues = result.getBand(1).values @@ -305,7 +322,7 @@ class TestRasterGDAL extends SharedSparkSessionGDAL { // max filter - result = MosaicRasterGDAL(ds, createInfo, -1).filter(5, "max").flushCache() + result = MosaicRasterGDAL(ds, createInfo).filter(5, "max").flushCache() resultValues = result.getBand(1).values From a540a5e6741aa630fe932cc64fa7a36a8a2afdc4 Mon Sep 17 00:00:00 2001 From: sllynn Date: Wed, 25 Sep 2024 16:07:24 +0200 Subject: [PATCH 12/12] formatter run on the python bindings --- python/mosaic/api/raster.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/mosaic/api/raster.py b/python/mosaic/api/raster.py index a4a051179..ef3c42e2c 100644 --- a/python/mosaic/api/raster.py +++ b/python/mosaic/api/raster.py @@ -147,7 +147,9 @@ def rst_boundingbox(raster_tile: ColumnOrName) -> Column: ) -def rst_clip(raster_tile: ColumnOrName, geometry: ColumnOrName, cutline_all_touched: Any = True) -> Column: +def rst_clip( + raster_tile: ColumnOrName, geometry: ColumnOrName, cutline_all_touched: Any = True +) -> Column: """ Clips `raster_tile` to the given supported `geometry` (WKT, WKB, GeoJSON). The result is a Mosaic raster tile representing the clipped raster. @@ -175,7 +177,7 @@ def rst_clip(raster_tile: ColumnOrName, geometry: ColumnOrName, cutline_all_touc "rst_clip", pyspark_to_java_column(raster_tile), pyspark_to_java_column(geometry), - pyspark_to_java_column(cutline_all_touched) + pyspark_to_java_column(cutline_all_touched), )