Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Raster api tweaks #578

Merged
merged 13 commits into from
Sep 25, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ test_that("scalar raster functions behave as intended", {
sdf <- withColumn(sdf, "rst_worldtorastercoordx", rst_worldtorastercoordx(column("tile"), lit(0.0), lit(0.0)))
sdf <- withColumn(sdf, "rst_worldtorastercoordy", rst_worldtorastercoordy(column("tile"), lit(0.0), lit(0.0)))
sdf <- withColumn(sdf, "rst_worldtorastercoord", rst_worldtorastercoord(column("tile"), lit(0.0), lit(0.0)))
sdf <- withColumn(sdf, "rst_write", rst_write(column("tile"), lit("/tmp/mosaic_tmp/")))

expect_no_error(write.df(sdf, source = "noop", mode = "overwrite"))
})
Expand All @@ -67,7 +68,7 @@ test_that("raster flatmap functions behave as intended", {
expect_equal(nrow(tessellate_sdf), 63)

overlap_sdf <- generate_singleband_raster_df()
overlap_sdf <- withColumn(overlap_sdf, "rst_to_overlapping_tiles", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L)))
overlap_sdf <- withColumn(overlap_sdf, "rst_tooverlappingtiles", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L)))

expect_no_error(write.df(overlap_sdf, source = "noop", mode = "overwrite"))
expect_equal(nrow(overlap_sdf), 87)
Expand All @@ -76,7 +77,7 @@ test_that("raster flatmap functions behave as intended", {
test_that("raster aggregation functions behave as intended", {
collection_sdf <- generate_singleband_raster_df()
collection_sdf <- withColumn(collection_sdf, "extent", st_astext(rst_boundingbox(column("tile"))))
collection_sdf <- withColumn(collection_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(200L), lit(200L), lit(10L)))
collection_sdf <- withColumn(collection_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(200L), lit(200L), lit(10L)))

merge_sdf <- summarize(
groupBy(collection_sdf, "path"),
Expand Down Expand Up @@ -127,7 +128,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", {
raster_sdf <- withColumn(raster_sdf, "timestep", element_at(rst_metadata(column("tile")), "NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX"))
raster_sdf <- where(raster_sdf, "timestep = 21")
raster_sdf <- withColumn(raster_sdf, "tile", rst_setsrid(column("tile"), lit(4326L)))
raster_sdf <- withColumn(raster_sdf, "tile", rst_to_overlapping_tiles(column("tile"), lit(20L), lit(20L), lit(10L)))
raster_sdf <- withColumn(raster_sdf, "tile", rst_tooverlappingtiles(column("tile"), lit(20L), lit(20L), lit(10L)))
raster_sdf <- withColumn(raster_sdf, "tile", rst_tessellate(column("tile"), lit(target_resolution)))

clipped_sdf <- join(raster_sdf, census_sdf, raster_sdf$tile.index_id == census_sdf$index_id)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ test_that("scalar raster functions behave as intended", {
mutate(rst_width = rst_width(tile)) %>%
mutate(rst_worldtorastercoordx = rst_worldtorastercoordx(tile, as.double(0.0), as.double(0.0))) %>%
mutate(rst_worldtorastercoordy = rst_worldtorastercoordy(tile, as.double(0.0), as.double(0.0))) %>%
mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0)))
mutate(rst_worldtorastercoord = rst_worldtorastercoord(tile, as.double(0.0), as.double(0.0))) %>%
mutate(rst_write = rst_write(tile, "/tmp/mosaic_tmp"))

expect_no_error(spark_write_source(sdf, "noop", mode = "overwrite"))
})
Expand All @@ -95,7 +96,7 @@ test_that("raster flatmap functions behave as intended", {
expect_equal(sdf_nrow(tessellate_sdf), 63)

overlap_sdf <- generate_singleband_raster_df() %>%
mutate(rst_to_overlapping_tiles = rst_to_overlapping_tiles(tile, 200L, 200L, 10L))
mutate(rst_tooverlappingtiles = rst_tooverlappingtiles(tile, 200L, 200L, 10L))

expect_no_error(spark_write_source(overlap_sdf, "noop", mode = "overwrite"))
expect_equal(sdf_nrow(overlap_sdf), 87)
Expand All @@ -105,7 +106,7 @@ test_that("raster flatmap functions behave as intended", {
test_that("raster aggregation functions behave as intended", {
collection_sdf <- generate_singleband_raster_df() %>%
mutate(extent = st_astext(rst_boundingbox(tile))) %>%
mutate(tile = rst_to_overlapping_tiles(tile, 200L, 200L, 10L))
mutate(tile = rst_tooverlappingtiles(tile, 200L, 200L, 10L))

merge_sdf <- collection_sdf %>%
group_by(path) %>%
Expand Down Expand Up @@ -167,7 +168,7 @@ test_that("the tessellate-join-clip-merge flow works on NetCDF files", {
indexed_raster_sdf <- sdf_sql(sc, "SELECT tile, element_at(rst_metadata(tile), 'NC_GLOBAL#GDAL_MOSAIC_BAND_INDEX') as timestep FROM raster") %>%
filter(timestep == 21L) %>%
mutate(tile = rst_setsrid(tile, 4326L)) %>%
mutate(tile = rst_to_overlapping_tiles(tile, 20L, 20L, 10L)) %>%
mutate(tile = rst_tooverlappingtiles(tile, 20L, 20L, 10L)) %>%
mutate(tile = rst_tessellate(tile, target_resolution))

clipped_sdf <- indexed_raster_sdf %>%
Expand Down
168 changes: 151 additions & 17 deletions docs/source/api/raster-functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ rst_avg

Returns an array containing mean values for each band.

:param tile: A column containing the raster tile.
:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:rtype: Column: ArrayType(DoubleType)

Expand Down Expand Up @@ -92,7 +92,7 @@ rst_bandmetadata
Extract the metadata describing the raster band.
Metadata is return as a map of key value pairs.

:param tile: A column containing the raster tile.
:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param band: The band number to extract metadata for.
:type band: Column (IntegerType)
Expand Down Expand Up @@ -193,29 +193,43 @@ rst_boundingbox
rst_clip
********

.. function:: rst_clip(tile, geometry)
.. function:: rst_clip(tile, geometry, cutline_all_touched)

Clips :code:`tile` with :code:`geometry`, provided in a supported encoding (WKB, WKT or GeoJSON).

:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param geometry: A column containing the geometry to clip the raster to.
:type geometry: Column (GeometryType)
:param cutline_all_touched: A column to specify pixels boundary behavior.
:type cutline_all_touched: Column (BooleanType)
:rtype: Column: RasterTileType

.. note::
Notes
**Notes**
Geometry input
The :code:`geometry` parameter is expected to be a polygon or a multipolygon.

:code:`geometry` is expected to be:
- in the same coordinate reference system as the raster.
- a polygon or a multipolygon.
Cutline handling
The :code:`cutline_all_touched` parameter:

The output raster tiles will have:
- the same extent as the input geometry.
- the same number of bands as the input raster.
- the same pixel data type as the input raster.
- the same pixel size as the input raster.
- the same coordinate reference system as the input raster.
- Optional: default is true. This is a GDAL warp config for the operation.
- If set to true, the pixels touching the geometry are included in the clip,
regardless of half-in or half-out; this is important for tessellation behaviors.
- If set to false, only at least half-in pixels are included in the clip.
- More information can be found `here <https://gdal.org/en/latest/api/gdalwarp_cpp.html>`__

The actual GDAL command employed to perform the clipping operation is as follows:
:code:`"gdalwarp -wo CUTLINE_ALL_TOUCHED=<TRUE|FALSE> -cutline <GEOMETRY> -crop_to_cutline"`

Output
Output raster tiles will have:

- the same extent as the input geometry.
- the same number of bands as the input raster.
- the same pixel data type as the input raster.
- the same pixel size as the input raster.
- the same coordinate reference system as the input raster.
..

:example:
Expand Down Expand Up @@ -1737,16 +1751,36 @@ rst_numbands
+---------------------+

rst_pixelcount
***************
**************

.. function:: rst_pixelcount(tile)
.. function:: rst_pixelcount(tile, count_nodata, count_all)

Returns an array containing valid pixel count values for each band.
Returns an array containing pixel count values for each band; default excludes mask and nodata pixels.

:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param count_nodata: A column to specify whether to count nodata pixels.
:type count_nodata: Column (BooleanType)
:param count_all: A column to specify whether to count all pixels.
:type count_all: Column (BooleanType)
:rtype: Column: ArrayType(LongType)

.. note::

Notes:

If pixel value is noData or mask value is 0.0, the pixel is not counted by default.

:code:`count_nodata`
- This is an optional param.
- if specified as true, include the noData (not mask) pixels in the count (default is false).

:code:`count_all`
- This is an optional param; as a positional arg, must also pass :code:`count_nodata`
(value of :code:`count_nodata` is ignored).
- if specified as true, simply return bandX * bandY in the count (default is false).
..

:example:

.. tabs::
Expand Down Expand Up @@ -2616,7 +2650,7 @@ rst_separatebands
+--------------------------------------------------------------------------------------------------------------------------------+

rst_setnodata
**********************
*************

.. function:: rst_setnodata(tile, nodata)

Expand Down Expand Up @@ -2668,6 +2702,49 @@ rst_setnodata
| {index_id: 593308294097928192, raster: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

rst_setsrid
***********

.. function:: rst_setsrid(tile, srid)

Set the SRID of the raster tile as an EPSG code.

:param tile: A column containing the raster tile.
:type tile: Column (RasterTileType)
:param srid: The SRID to set
:type srid: Column (IntegerType)
:rtype: Column: (RasterTileType)

:example:

.. tabs::
.. code-tab:: py

df.select(mos.rst_setsrid('tile', F.lit(9122))).display()
+------------------------------------------------------------------------------------------------------------------+
| rst_setsrid(tile, 9122) |
+------------------------------------------------------------------------------------------------------------------+
| {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

.. code-tab:: scala

df.select(rst_setsrid(col("tile"), lit(9122))).show
+------------------------------------------------------------------------------------------------------------------+
| rst_setsrid(tile, 9122) |
+------------------------------------------------------------------------------------------------------------------+
| {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

.. code-tab:: sql

SELECT rst_setsrid(tile, 9122) FROM table
+------------------------------------------------------------------------------------------------------------------+
| rst_setsrid(tile, 9122) |
+------------------------------------------------------------------------------------------------------------------+
| {index_id: 593308294097928191, tile: [00 01 10 ... 00], parentPath: "dbfs:/path_to_file", driver: "GTiff" } |
+------------------------------------------------------------------------------------------------------------------+

rst_skewx
*********

Expand Down Expand Up @@ -3517,3 +3594,60 @@ rst_worldtorastercoordy
+------------------------------------------------------------------------------------------------------------------+
| 997 |
+------------------------------------------------------------------------------------------------------------------+

rst_write
*********

.. function:: rst_write(input, dir)

Writes raster tiles from the input column to a specified directory.

:param input: A column containing the raster tile.
:type input: Column
:param dir: The directory, e.g. fuse, to write the tile's raster as file.
:type dir: Column(StringType)
:rtype: Column: RasterTileType

.. note::
**Notes**
- Use :code:`RST_Write` to save a 'tile' column to a specified directory (e.g. fuse) location using its
already populated GDAL driver and tile information.
- Useful for formalizing the tile 'path' when writing a Lakehouse table. An example might be to turn on checkpointing
for internal data pipeline phase operations in which multiple interim tiles are populated, but at the end of the phase
use this function to set the final path to be used in the phase's persisted table. Then, you are free to delete
the internal tiles that accumulated in the configured checkpointing directory.
..

:example:

.. tabs::
.. code-tab:: py

df.select(rst_write("tile", <write_dir>).alias("tile")).limit(1).display()
+------------------------------------------------------------------------+
| tile |
+------------------------------------------------------------------------+
| {"index_id":null,"tile":"<write_path>","metadata":{ |
| "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+------------------------------------------------------------------------+

.. code-tab:: scala

df.select(rst_write(col("tile", <write_dir>)).as("tile)).limit(1).show
+------------------------------------------------------------------------+
| tile |
+------------------------------------------------------------------------+
| {"index_id":null,"tile":"<write_path>","metadata":{ |
| "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+------------------------------------------------------------------------+

.. code-tab:: sql

SELECT rst_write(tile, <write_dir>) as tile FROM table LIMIT 1
+------------------------------------------------------------------------+
| tile |
+------------------------------------------------------------------------+
| {"index_id":null,"tile":"<write_path>","metadata":{ |
| "parentPath":"no_path","driver":"GTiff","path":"...","last_error":""}} |
+------------------------------------------------------------------------+

Loading
Loading