Add ability to process stacked data

AlexeyPechnikov · Jul 5, 2024 · d927200 · d927200
1 parent a02fecb
commit d927200
Show file tree

Hide file tree

Showing 8 changed files with 225 additions and 68 deletions.
diff --git a/pygmtsar/pygmtsar/IO.py b/pygmtsar/pygmtsar/IO.py
@@ -347,10 +347,6 @@ def open_cube(self, name):
 
         # Workaround: open the dataset without chunking
         data = xr.open_dataset(filename, engine=self.netcdf_engine)
-        # Determine the proper chunk sizes
-        chunks = {dim: 1 if dim in ['pair', 'date'] else self.chunksize for dim in data.dims}
-        # Re-chunk the dataset using the chunks dictionary
-        data = data.chunk(chunks)
 
         if 'stack' in data.dims:
             if 'y' in data.coords and 'x' in data.coords:
@@ -359,6 +355,13 @@ def open_cube(self, name):
                 multi_index_names = ['lat', 'lon']
             multi_index = pd.MultiIndex.from_arrays([data.y.values, data.x.values], names=multi_index_names)
             data = data.assign_coords(stack=multi_index)
+            chunksize = self.chunksize1d
+        else:
+            chunksize = self.chunksize
+
+        # set the proper chunk sizes
+        chunks = {dim: 1 if dim in ['pair', 'date'] else chunksize for dim in data.dims}
+        data = data.chunk(chunks)
 
         # attributes are empty when dataarray is prezented as dataset
         # revert dataarray converted to dataset
@@ -457,9 +460,12 @@ def save_cube(self, data, name=None, caption='Saving NetCDF 2D/3D Dataset'):
         elif name is None:
             raise ValueError('Specify name for the output NetCDF file')
 
+        chunksize = None
         if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
             # replace multiindex by sequential numbers 0,1,...
             data = data.reset_index('stack')
+            # single-dimensional data compression required
+            chunksize = self.netcdf_chunksize1d
 
         for dim in ['y', 'x', 'lat', 'lon']:
             if dim in data.dims:
@@ -477,7 +483,8 @@ def save_cube(self, data, name=None, caption='Saving NetCDF 2D/3D Dataset'):
             data = data.to_dataset().assign_attrs({'dataarray': data.name})
 
         is_dask = isinstance(data[list(data.data_vars)[0]].data, dask.array.Array)
-        encoding = {varname: self._compression(data[varname].shape) for varname in data.data_vars}
+        encoding = {varname: self._compression(data[varname].shape, chunksize=chunksize) for varname in data.data_vars}
+        #print ('save_cube encoding', encoding)
         #print ('is_dask', is_dask, 'encoding', encoding)
 
         # save to NetCDF file
@@ -550,18 +557,22 @@ def open_stack(self, name, stack=None):
             concat_dim='stackvar',
             combine='nested'
         )
-        # Determine the proper chunk sizes
-        chunks = {dim: 1 if dim in ['pair', 'date'] else self.chunksize for dim in data.dims}
-        data = data.chunk(chunks)
-
+
         if 'stack' in data.dims:
             if 'y' in data.coords and 'x' in data.coords:
                 multi_index_names = ['y', 'x']
             elif 'lat' in data.coords and 'lon' in data.coords:
                 multi_index_names = ['lat', 'lon']
             multi_index = pd.MultiIndex.from_arrays([data.y.values, data.x.values], names=multi_index_names)
             data = data.assign_coords(stack=multi_index)
-
+            chunksize = self.chunksize1d
+        else:
+            chunksize = self.chunksize
+
+        # set the proper chunk sizes
+        chunks = {dim: 1 if dim in ['stackvar'] else chunksize for dim in data.dims}
+        data = data.chunk(chunks)
+
         # revert dataarray converted to dataset
         data_vars = list(data.data_vars)
         if len(data_vars) == 1 and 'dataarray' in data.attrs:
@@ -734,7 +745,7 @@ def save_stack(self, data, name, caption='Saving 2D Stack', queue=None, timeout=
         if isinstance(data, xr.DataArray):
             data = data.to_dataset().assign_attrs({'dataarray': data.name})
         encoding = {varname: self._compression(data[varname].shape[1:]) for varname in data.data_vars}
-        #print ('encoding', encoding)
+        #print ('save_stack encoding', encoding)
 
         # Applying iterative processing to prevent Dask scheduler deadlocks.
         counter = 0

diff --git a/pygmtsar/pygmtsar/Stack_detrend.py b/pygmtsar/pygmtsar/Stack_detrend.py
@@ -329,6 +329,18 @@ def polyfit(self, data, weight=None, degree=0, days=None, count=None):
         warnings.filterwarnings('ignore', module='dask')
         warnings.filterwarnings('ignore', module='dask.core')
 
+        multi_index = None
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            multi_index = data['stack']
+            data = data.reset_index('stack')
+            if weight is not None:
+                if not ('stack' in weight.dims and isinstance(weight.coords['stack'].to_index(), pd.MultiIndex)):
+                    raise ValueError('ERROR: "weight", if provided, must be stacked consistently with "data".')
+                data = data.reset_index('stack')
+        else:
+            if 'stack' in weight.dims and isinstance(weight.coords['stack'].to_index(), pd.MultiIndex):
+                raise ValueError('ERROR: "weight", if provided, must be stacked consistently with "data".')
+
         pairs, dates = self.get_pairs(data, dates=True)
 
         models = []
@@ -362,8 +374,10 @@ def polyfit(self, data, weight=None, degree=0, days=None, count=None):
 
         out = xr.concat([(model.sel(date=ref).drop('date') - model.sel(date=rep).drop('date'))\
                                  .assign_coords(pair=str(ref.date()) + ' ' + str(rep.date()), ref=ref, rep=rep) \
-                          for ref, rep in zip(pairs['ref'], pairs['rep'])], dim='pair')
-        return out.rename(data.name)
+                          for ref, rep in zip(pairs['ref'], pairs['rep'])], dim='pair').rename(data.name)
+        if multi_index is not None:
+            return out.assign_coords(stack=multi_index)
+        return out
 
 #     def polyfit(self, data, weight=None, degree=0, variable=None, count=None):
 #         import xarray as xr
@@ -872,6 +886,8 @@ def turbulence(self, phase, weight=None):
         import xarray as xr
         import pandas as pd
 
+        print ('NOTE: this function is deprecated, use instead Stack.polyfit()')
+
         pairs, dates = self.get_pairs(phase, dates=True)
 
         turbos = []
@@ -904,14 +920,22 @@ def turbulence(self, phase, weight=None):
         return phase_turbo.rename('turbulence')
 
     def velocity(self, data):
+        import pandas as pd
         import numpy as np
         #years = ((data.date.max() - data.date.min()).dt.days/365.25).item()
         #nanoseconds = (data.date.max().astype(int) - data.date.min().astype(int)).item()
         #print ('years', np.round(years, 3), 'nanoseconds', nanoseconds)
+        multi_index = None
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            multi_index = data.coords['stack']
+            # replace multiindex by sequential numbers 0,1,...
+            data = data.reset_index('stack')
         #velocity = nanoseconds*data.polyfit('date', 1).polyfit_coefficients.sel(degree=1)/years
         nanoseconds_per_year = 365.25*24*60*60*1e9
         # calculate slope per year
         velocity = nanoseconds_per_year*data.polyfit('date', 1).polyfit_coefficients.sel(degree=1).astype(np.float32).rename('trend')
+        if multi_index is not None:
+            return velocity.assign_coords(stack=multi_index)
         return velocity
 
 #     def trend(self, data, deg=1):
@@ -927,7 +951,12 @@ def trend(self, data, dim='auto', deg=1):
         import xarray as xr
         import pandas as pd
         import numpy as np
-
+
+        multi_index = None
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            multi_index = data['stack']
+            data = data.reset_index('stack')
+
         stackdim = [_dim for _dim in ['date', 'pair'] if _dim in data.dims]
         if len(stackdim) != 1:
             raise ValueError("The 'data' argument must include a 'date' or 'pair' dimension to detect trends.")
@@ -951,5 +980,7 @@ def trend(self, data, dim='auto', deg=1):
         # fit existing coordinate values
         fit_coeff = data.polyfit(dim, deg).polyfit_coefficients.astype(np.float32)
         fit = xr.polyval(data[dim], fit_coeff).astype(np.float32).rename('trend')
-        return xr.merge([fit, fit_coeff]).rename(polyfit_coefficients='coefficients')
-
+        out = xr.merge([fit, fit_coeff]).rename(polyfit_coefficients='coefficients')
+        if multi_index is not None:
+            return out.assign_coords(stack=multi_index)
+        return out
diff --git a/pygmtsar/pygmtsar/Stack_phasediff.py b/pygmtsar/pygmtsar/Stack_phasediff.py
@@ -848,8 +848,12 @@ def plot_phase(self, data, caption='Phase, [rad]',
                    quantile=None, vmin=None, vmax=None, symmetrical=False,
                    cmap='turbo', aspect=None, **kwargs):
         import numpy as np
+        import pandas as pd
         import matplotlib.pyplot as plt
 
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
+
         if quantile is not None:
             assert vmin is None and vmax is None, "ERROR: arguments 'quantile' and 'vmin', 'vmax' cannot be used together"
 
@@ -873,8 +877,12 @@ def plot_phase(self, data, caption='Phase, [rad]',
     def plot_phases(self, data, caption='Phase, [rad]', cols=4, size=4, nbins=5, aspect=1.2, y=1.05,
                     quantile=None, vmin=None, vmax=None, symmetrical=False, **kwargs):
         import numpy as np
+        import pandas as pd
         import matplotlib.pyplot as plt
 
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
+
         if quantile is not None:
             assert vmin is None and vmax is None, "ERROR: arguments 'quantile' and 'vmin', 'vmax' cannot be used together"
 
@@ -903,8 +911,12 @@ def plot_phases(self, data, caption='Phase, [rad]', cols=4, size=4, nbins=5, asp
 
     def plot_interferogram(self, data, caption='Phase, [rad]', cmap='gist_rainbow_r', aspect=None, **kwargs):
         import numpy as np
+        import pandas as pd
         import matplotlib.pyplot as plt
 
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
+
         plt.figure()
         data.plot.imshow(vmin=-np.pi, vmax=np.pi, cmap=cmap)
         self.plot_AOI(**kwargs)
@@ -915,8 +927,12 @@ def plot_interferogram(self, data, caption='Phase, [rad]', cmap='gist_rainbow_r'
 
     def plot_interferograms(self, data, caption='Phase, [rad]', cols=4, size=4, nbins=5, aspect=1.2, y=1.05, **kwargs):
         import numpy as np
+        import pandas as pd
         import matplotlib.pyplot as plt
 
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
+
         # multi-plots ineffective for linked lazy data
         fg = data.plot.imshow(
             col='pair',
@@ -932,8 +948,12 @@ def plot_interferograms(self, data, caption='Phase, [rad]', cols=4, size=4, nbin
         self.plots_POI(fg, **kwargs)
 
     def plot_correlation(self, data, caption='Correlation', cmap='gray', aspect=None, **kwargs):
+        import pandas as pd
         import matplotlib.pyplot as plt
 
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
+
         plt.figure()
         data.plot.imshow(vmin=0, vmax=1, cmap=cmap)
         self.plot_AOI(**kwargs)
@@ -943,9 +963,13 @@ def plot_correlation(self, data, caption='Correlation', cmap='gray', aspect=None
         plt.title(caption)
 
     def plot_correlations(self, data, caption='Correlation', cmap='auto', cols=4, size=4, nbins=5, aspect=1.2, y=1.05, **kwargs):
+        import pandas as pd
         import matplotlib.pyplot as plt
         import matplotlib.colors as mcolors
 
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
+
         if isinstance(cmap, str) and cmap == 'auto':
             cmap = mcolors.LinearSegmentedColormap.from_list(
                 name='custom_gray', 
@@ -968,9 +992,13 @@ def plot_correlations(self, data, caption='Correlation', cmap='auto', cols=4, si
 
     def plot_correlation_stack(self, data, threshold=None, caption='Correlation Stack', bins=100, cmap='auto', **kwargs):
         import numpy as np
+        import pandas as pd
         import matplotlib.pyplot as plt
         import matplotlib.colors as mcolors
-
+
+        if 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
+
         if isinstance(cmap, str) and cmap == 'auto':
             cmap = mcolors.LinearSegmentedColormap.from_list(
                 name='custom_gray', 

diff --git a/pygmtsar/pygmtsar/Stack_ps.py b/pygmtsar/pygmtsar/Stack_ps.py
@@ -69,10 +69,13 @@ def psfunction(self, ps='auto', name='ps'):
 
     def plot_psfunction(self, data='auto', caption='PS Function', cmap='gray', quantile=None, vmin=None, vmax=None, **kwargs):
         import numpy as np
+        import pandas as pd
         import matplotlib.pyplot as plt
 
         if isinstance(data, str) and data == 'auto':
             data = self.psfunction()
+        elif 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
 
         if quantile is not None:
             assert vmin is None and vmax is None, "ERROR: arguments 'quantile' and 'vmin', 'vmax' cannot be used together"
@@ -155,6 +158,7 @@ def plot_amplitudes(self, dates=None, data='auto', norm='auto', func=None, inten
                        caption='auto', cmap='gray', cols=4, size=4, nbins=5, aspect=1.2, y=1.05,
                        quantile=None, vmin=None, vmax=None, symmetrical=False, **kwargs):
         import numpy as np
+        import pandas as pd
         import matplotlib.pyplot as plt
         import types
 
@@ -166,6 +170,8 @@ def plot_amplitudes(self, dates=None, data='auto', norm='auto', func=None, inten
             data = np.abs(self.open_data(dates=dates, scale=scale))
             if intensity:
                 data = np.square(data)
+        elif 'stack' in data.dims and isinstance(data.coords['stack'].to_index(), pd.MultiIndex):
+            data = data.unstack('stack')
 
         if func is not None:
             data = func(data)