diff --git a/rioxarray/_io.py b/rioxarray/_io.py index 505327b4..bc87e7a8 100644 --- a/rioxarray/_io.py +++ b/rioxarray/_io.py @@ -13,6 +13,8 @@ from distutils.version import LooseVersion import numpy as np +import rasterio +from rasterio.vrt import WarpedVRT from xarray import DataArray, Dataset from xarray.backends.common import BackendArray from xarray.backends.file_manager import CachingFileManager @@ -233,6 +235,144 @@ def build_subdataset_filter(group_names, variable_names): ) +def _rio_transform(riods): + """ + Get the transform from a rasterio dataset + reguardless of rasterio version. + """ + try: + return riods.transform + except AttributeError: + return riods.affine # rasterio < 1.0 + + +def _get_rasterio_attrs(riods, masked): + """ + Get rasterio specific attributes/encoding + """ + # Add rasterio attributes + attrs = _parse_tags(riods.tags(1)) + encoding = dict() + # Affine transformation matrix (always available) + # This describes coefficients mapping pixel coordinates to CRS + # For serialization store as tuple of 6 floats, the last row being + # always (0, 0, 1) per definition (see + # https://github.com/sgillies/affine) + attrs["transform"] = tuple(_rio_transform(riods))[:6] + if hasattr(riods, "nodata") and riods.nodata is not None: + # The nodata values for the raster bands + if masked: + encoding["_FillValue"] = riods.nodata + else: + attrs["_FillValue"] = riods.nodata + if hasattr(riods, "scales"): + # The scale values for the raster bands + attrs["scales"] = riods.scales + if hasattr(riods, "offsets"): + # The offset values for the raster bands + attrs["offsets"] = riods.offsets + if hasattr(riods, "descriptions") and any(riods.descriptions): + # Descriptions for each dataset band + attrs["descriptions"] = riods.descriptions + if hasattr(riods, "units") and any(riods.units): + # A list of units string for each dataset band + attrs["units"] = riods.units + return attrs, encoding + + +def _parse_driver_tags(riods, attrs, coords): + # Parse extra metadata from tags, if supported + parsers = {"ENVI": _parse_envi} + + driver = riods.driver + if driver in parsers: + meta = parsers[driver](riods.tags(ns=driver)) + + for k, v in meta.items(): + # Add values as coordinates if they match the band count, + # as attributes otherwise + if isinstance(v, (list, np.ndarray)) and len(v) == riods.count: + coords[k] = ("band", np.asarray(v)) + else: + attrs[k] = v + + +def _load_subdatasets( + riods, group, variable, parse_coordinates, chunks, cache, lock, masked +): + """ + Load in rasterio subdatasets + """ + base_tags = _parse_tags(riods.tags()) + dim_groups = {} + subdataset_filter = None + if any((group, variable)): + subdataset_filter = build_subdataset_filter(group, variable) + for iii, subdataset in enumerate(riods.subdatasets): + if subdataset_filter is not None and not subdataset_filter.match(subdataset): + continue + with rasterio.open(subdataset) as rds: + shape = rds.shape + rioda = open_rasterio( + subdataset, + parse_coordinates=shape not in dim_groups and parse_coordinates, + chunks=chunks, + cache=cache, + lock=lock, + masked=masked, + default_name=subdataset.split(":")[-1].lstrip("/").replace("/", "_"), + ) + if shape not in dim_groups: + dim_groups[shape] = {rioda.name: rioda} + else: + dim_groups[shape][rioda.name] = rioda + + if len(dim_groups) > 1: + dataset = [ + Dataset(dim_group, attrs=base_tags) for dim_group in dim_groups.values() + ] + elif not dim_groups: + dataset = Dataset(attrs=base_tags) + else: + dataset = Dataset(list(dim_groups.values())[0], attrs=base_tags) + return dataset + + +def _prepare_dask(result, riods, filename, chunks): + """ + Prepare the data for dask computations + """ + from dask.base import tokenize + + # augment the token with the file modification time + try: + mtime = os.path.getmtime(filename) + except OSError: + # the filename is probably an s3 bucket rather than a regular file + mtime = None + + if chunks in (True, "auto"): + from dask.array.core import normalize_chunks + import dask + + if LooseVersion(dask.__version__) < LooseVersion("0.18.0"): + msg = ( + "Automatic chunking requires dask.__version__ >= 0.18.0 . " + "You currently have version %s" % dask.__version__ + ) + raise NotImplementedError(msg) + block_shape = (1,) + riods.block_shapes[0] + chunks = normalize_chunks( + chunks=(1, "auto", "auto"), + shape=(riods.count, riods.height, riods.width), + dtype=riods.dtypes[0], + previous_chunks=tuple((c,) for c in block_shape), + ) + token = tokenize(filename, mtime, chunks) + name_prefix = "open_rasterio-%s" % token + return result.chunk(chunks, name_prefix=name_prefix, token=token) + + def open_rasterio( filename, parse_coordinates=None, @@ -306,10 +446,6 @@ def open_rasterio( The newly created DataArray. """ parse_coordinates = True if parse_coordinates is None else parse_coordinates - - import rasterio - from rasterio.vrt import WarpedVRT - vrt_params = None if isinstance(filename, rasterio.io.DatasetReader): filename = filename.name @@ -338,29 +474,18 @@ def open_rasterio( rasterio.open, filename, lock=lock, mode="r", kwargs=open_kwargs ) riods = manager.acquire() - # open the subdatasets if they exist if riods.subdatasets: - subdataset_filter = None - if any((group, variable)): - subdataset_filter = build_subdataset_filter(group, variable) - data_arrays = {} - for iii, subdataset in enumerate(riods.subdatasets): - if subdataset_filter is not None and not subdataset_filter.match( - subdataset - ): - continue - rioda = open_rasterio( - subdataset, - parse_coordinates=iii == 0 and parse_coordinates, - chunks=chunks, - cache=cache, - lock=lock, - masked=masked, - default_name=subdataset.split(":")[-1].lstrip("/").replace("/", "_"), - ) - data_arrays[rioda.name] = rioda - return Dataset(data_arrays) + return _load_subdatasets( + riods=riods, + group=group, + variable=variable, + parse_coordinates=parse_coordinates, + chunks=chunks, + cache=cache, + lock=lock, + masked=masked, + ) if vrt_params is not None: riods = WarpedVRT(riods, **vrt_params) @@ -368,20 +493,19 @@ def open_rasterio( if cache is None: cache = chunks is None - coords = OrderedDict() - # Get bands if riods.count < 1: raise ValueError("Unknown dims") + coords = OrderedDict() coords["band"] = np.asarray(riods.indexes) - # Get coordinates - if LooseVersion(rasterio.__version__) < LooseVersion("1.0"): - transform = riods.affine - else: - transform = riods.transform + # parse tags + attrs, encoding = _get_rasterio_attrs(riods=riods, masked=masked) + _parse_driver_tags(riods=riods, attrs=attrs, coords=coords) - if transform.is_rectilinear and parse_coordinates: + # Get geospatial coordinates + transform = _rio_transform(riods) + if parse_coordinates and transform.is_rectilinear: # 1d coordinates coords.update(affine_to_coords(riods.transform, riods.width, riods.height)) elif parse_coordinates: @@ -395,49 +519,6 @@ def open_rasterio( stacklevel=3, ) - # Attributes - attrs = _parse_tags(riods.tags(1)) - encoding = dict() - # Affine transformation matrix (always available) - # This describes coefficients mapping pixel coordinates to CRS - # For serialization store as tuple of 6 floats, the last row being - # always (0, 0, 1) per definition (see - # https://github.com/sgillies/affine) - attrs["transform"] = tuple(transform)[:6] - if hasattr(riods, "nodata") and riods.nodata is not None: - # The nodata values for the raster bands - if masked: - encoding["_FillValue"] = riods.nodata - else: - attrs["_FillValue"] = riods.nodata - if hasattr(riods, "scales"): - # The scale values for the raster bands - attrs["scales"] = riods.scales - if hasattr(riods, "offsets"): - # The offset values for the raster bands - attrs["offsets"] = riods.offsets - if hasattr(riods, "descriptions") and any(riods.descriptions): - # Descriptions for each dataset band - attrs["descriptions"] = riods.descriptions - if hasattr(riods, "units") and any(riods.units): - # A list of units string for each dataset band - attrs["units"] = riods.units - - # Parse extra metadata from tags, if supported - parsers = {"ENVI": _parse_envi} - - driver = riods.driver - if driver in parsers: - meta = parsers[driver](riods.tags(ns=driver)) - - for k, v in meta.items(): - # Add values as coordinates if they match the band count, - # as attributes otherwise - if isinstance(v, (list, np.ndarray)) and len(v) == riods.count: - coords[k] = ("band", np.asarray(v)) - else: - attrs[k] = v - data = indexing.LazilyOuterIndexedArray( RasterioArrayWrapper(manager, lock, vrt_params, masked=masked) ) @@ -447,6 +528,7 @@ def open_rasterio( if cache and chunks is None: data = indexing.MemoryCachedArray(data) + # create the output data array da_name = attrs.pop("NETCDF_VARNAME", default_name) result = DataArray( data=data, dims=("band", "y", "x"), coords=coords, attrs=attrs, name=da_name @@ -457,35 +539,7 @@ def open_rasterio( result.rio.write_crs(riods.crs, inplace=True) if chunks is not None: - from dask.base import tokenize - - # augment the token with the file modification time - try: - mtime = os.path.getmtime(filename) - except OSError: - # the filename is probably an s3 bucket rather than a regular file - mtime = None - - if chunks in (True, "auto"): - from dask.array.core import normalize_chunks - import dask - - if LooseVersion(dask.__version__) < LooseVersion("0.18.0"): - msg = ( - "Automatic chunking requires dask.__version__ >= 0.18.0 . " - "You currently have version %s" % dask.__version__ - ) - raise NotImplementedError(msg) - block_shape = (1,) + riods.block_shapes[0] - chunks = normalize_chunks( - chunks=(1, "auto", "auto"), - shape=(riods.count, riods.height, riods.width), - dtype=riods.dtypes[0], - previous_chunks=tuple((c,) for c in block_shape), - ) - token = tokenize(filename, mtime, chunks) - name_prefix = "open_rasterio-%s" % token - result = result.chunk(chunks, name_prefix=name_prefix, token=token) + result = _prepare_dask(result, riods, filename, chunks) # Make the file closeable result._file_obj = manager diff --git a/sphinx/history.rst b/sphinx/history.rst index 3eb43ce7..c9a2359f 100644 --- a/sphinx/history.rst +++ b/sphinx/history.rst @@ -1,6 +1,10 @@ History ======= +0.0.16 +------ +- Add support for netcdf/hdf groups with different shapes (pull #62) + 0.0.15 ------ - Added `variable` and `group` kwargs to `rioxarray.open_rasterio()` to allow filtering of subdatasets (pull #57) diff --git a/test/integration/test_integration__io.py b/test/integration/test_integration__io.py index 40d4c243..720a8c63 100644 --- a/test/integration/test_integration__io.py +++ b/test/integration/test_integration__io.py @@ -57,91 +57,106 @@ False, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf":MODIS_Grid_2D:sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + ":MODIS_Grid_2D:sur_refl_b01_1", ["sur_refl_b01_1"], None, True, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf":MODIS_Grid_2D:sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + ":MODIS_Grid_2D:sur_refl_b01_1", None, ["MODIS_Grid_2D"], True, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf":MODIS_Grid_2D:sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + ":MODIS_Grid_2D:sur_refl_b01_1", ("sur_refl_b01_1",), ("MODIS_Grid_2D",), True, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf":MODIS_Grid_2D:sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + ":MODIS_Grid_2D:sur_refl_b01_1", "blue", "gr", False, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf":MODIS_Grid_2D:sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + ":MODIS_Grid_2D:sur_refl_b01_1", "sur_refl_b01_1", "gr", False, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf":MODIS_Grid_2D:sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + ":MODIS_Grid_2D:sur_refl_b01_1", None, "gr", False, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"://MODIS_Grid_2D://sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + "://MODIS_Grid_2D://sur_refl_b01_1", "sur_refl_b01_1", None, True, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"://MODIS_Grid_2D://sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + "://MODIS_Grid_2D://sur_refl_b01_1", None, "MODIS_Grid_2D", True, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"://MODIS_Grid_2D://sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + "://MODIS_Grid_2D://sur_refl_b01_1", "sur_refl_b01_1", "MODIS_Grid_2D", True, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"://MODIS_Grid_2D://sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + "://MODIS_Grid_2D://sur_refl_b01_1", "blue", "gr", False, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"://MODIS_Grid_2D://sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + "://MODIS_Grid_2D://sur_refl_b01_1", "sur_refl_b01_1", "gr", False, ), ( - 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"://MODIS_Grid_2D://sur_refl_b01_1', + 'HDF4_EOS:EOS_GRID:"./modis/MOD09GQ.A2017290.h11v04.006.NRT.hdf"' + "://MODIS_Grid_2D://sur_refl_b01_1", None, "gr", False, ), ( - "netcdf:S5P_NRTI_L2__NO2____20190513T181819_20190513T182319_08191_01_010301_20190513T185033.nc:/PRODUCT/tm5_constant_a", + "netcdf:S5P_NRTI_L2__NO2____20190513T181819_20190513T182319_08191_" + "01_010301_20190513T185033.nc:/PRODUCT/tm5_constant_a", None, "PRODUCT", True, ), ( - "netcdf:S5P_NRTI_L2__NO2____20190513T181819_20190513T182319_08191_01_010301_20190513T185033.nc:/PRODUCT/tm5_constant_a", + "netcdf:S5P_NRTI_L2__NO2____20190513T181819_20190513T182319_08191_" + "01_010301_20190513T185033.nc:/PRODUCT/tm5_constant_a", "tm5_constant_a", "PRODUCT", True, ), ( - "netcdf:S5P_NRTI_L2__NO2____20190513T181819_20190513T182319_08191_01_010301_20190513T185033.nc:/PRODUCT/tm5_constant_a", + "netcdf:S5P_NRTI_L2__NO2____20190513T181819_20190513T182319_08191_" + "01_010301_20190513T185033.nc:/PRODUCT/tm5_constant_a", "tm5_constant_a", "/PRODUCT", True, @@ -161,7 +176,7 @@ def test_open_variable_filter(): assert list(rds.data_vars) == ["blue"] -def test_open_group_filter(): +def test_open_group_filter__missing(): with rioxarray.open_rasterio( os.path.join(TEST_INPUT_DATA_DIR, "PLANET_SCOPE_3D.nc"), variable="blue", @@ -170,6 +185,42 @@ def test_open_group_filter(): assert list(rds.data_vars) == [] +def test_open_multiple_resolution(): + rds_list = rioxarray.open_rasterio( + os.path.join( + TEST_INPUT_DATA_DIR, "MOD09GA.A2008296.h14v17.006.2015181011753.hdf" + ) + ) + assert isinstance(rds_list, list) + assert len(rds_list) == 2 + for rds in rds_list: + assert rds.attrs["SHORTNAME"] == "MOD09GA" + assert rds_list[0].dims == {"y": 1200, "x": 1200, "band": 1} + assert rds_list[1].dims == {"y": 2400, "x": 2400, "band": 1} + + +def test_open_group_filter(): + with rioxarray.open_rasterio( + os.path.join( + TEST_INPUT_DATA_DIR, "MOD09GA.A2008296.h14v17.006.2015181011753.hdf" + ), + group="MODIS_Grid_500m_2D", + ) as rds: + assert sorted(rds.data_vars) == [ + "QC_500m_1", + "iobs_res_1", + "num_observations_500m", + "obscov_500m_1", + "sur_refl_b01_1", + "sur_refl_b02_1", + "sur_refl_b03_1", + "sur_refl_b04_1", + "sur_refl_b05_1", + "sur_refl_b06_1", + "sur_refl_b07_1", + ] + + def test_open_rasterio_mask_chunk_clip(): with rioxarray.open_rasterio( os.path.join(TEST_COMPARE_DATA_DIR, "small_dem_3m_merged.tif"), diff --git a/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf b/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf new file mode 100644 index 00000000..9e5415ef Binary files /dev/null and b/test/test_data/input/MOD09GA.A2008296.h14v17.006.2015181011753.hdf differ