CSV Files (#108)
* Added in base class for downloading flat files

* Added in snowex and csas flat files

* Added a test for checking valid links

* Added in test datasets

* Rearranged the base validation to avoid complexity flake issues. Converted from assertions to exceptions!

* Broke out validation checks. Added associated tests
micahjohnson150 authored Jul 3, 2024
1 parent 43d578d commit 58dc522
Showing 28 changed files with 5,877 additions and 22 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -111,3 +111,4 @@ ENV/
# scratch dir
scratch/
**/.ipynb_checkpoints/*
**/cache/**
5 changes: 3 additions & 2 deletions README.rst
@@ -19,8 +19,7 @@ metloom
 Location Oriented Observed Meteorology
 
 metloom is a python library created with the goal of consistent, simple sampling of
-meteorology and snow related point measurments from a variety of datasources across the
-Western US. metloom is developed by `M3 Works <https://m3works.io>`_ as a tool for validating
+meteorology and snow related point measurments from a variety of datasources is developed by `M3 Works <https://m3works.io>`_ as a tool for validating
 computational hydrology model results. Contributions welcome!
 
 Warning - This software is provided as is (see the license), so use at your own risk.
@@ -45,6 +44,8 @@ Features
* `GEOSPHERE AUSTRIA <https://data.hub.geosphere.at/dataset/>`_
* `UCSB CUES <https://snow.ucsb.edu/#>`_
* `MET NORWAY <https://frost.met.no/index.html>`_
* `SNOWEX MET STATIONS <https://nsidc.org/data/snex_met/versions/1>`_
* `CENTER FOR SNOW AND AVALANCHE STUDIES (CSAS) <https://snowstudies.org/csas-facilities/>`_

Requirements
------------
140 changes: 140 additions & 0 deletions docs/gallery/csas_example.ipynb

Large diffs are not rendered by default.

51 changes: 51 additions & 0 deletions docs/usage.rst
@@ -80,7 +80,58 @@ To pull stations using Mesowest::
    )
    print(df)

Center for Snow and Avalanche Studies (CSAS)
--------------------------------------------
CSAS maintains four stations of interest: the Senator Beck Study Plot, the Swamp Angel
Study Plot, the Senator Beck Stream Gauge, and the Putney Study Plot. These four stations
contain a wealth of data useful for studying and validating snow processes. The data are
published as flat CSV files, so a request simply downloads the file, interprets the
datetime index, and crops the result to the requested period. Because the source is a CSV,
the file is stored in a local cache in the directory from which you ran your code, which
reduces download times on subsequent requests.

Additionally, the CSAS data is not available in real time (at least as of June 2024).
Data is updated annually and stored on the website. metloom will try to stay as up to
date as possible as the files are updated; please feel free to submit a PR if you know
the data has been updated. Check out the
`facilities page <https://snowstudies.org/csas-facilities/>`_ on the CSAS site to see more about the stations.

To pull stations using CSAS::

    from metloom.pointdata import CSASMet
    from metloom.variables import CSASVariables
    from datetime import datetime

    start = datetime(2023, 1, 1)
    end = datetime(2023, 6, 1)
    sbsp = CSASMet('SBSP')
    df_sbsp = sbsp.get_daily_data(start, end, [CSASVariables.SNOWDEPTH])

If you use these data, please use the `appropriate citations <https://snowstudies.org/data-use-policy/>`_ and give credit to the
institution.
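
Since the CSAS stations report hourly (the source files are 1-hour CSVs), the same pattern
works with the hourly accessor. The snippet below is a minimal sketch assuming the standard
metloom ``get_hourly_data`` interface applies to ``CSASMet`` as well::

    from datetime import datetime
    from metloom.pointdata import CSASMet
    from metloom.variables import CSASVariables

    start = datetime(2023, 1, 1)
    end = datetime(2023, 1, 7)
    sasp = CSASMet('SASP')
    # The underlying CSV is downloaded once and reused from the local cache
    df_hourly = sasp.get_hourly_data(start, end, [CSASVariables.SNOWDEPTH])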

SnowEx
------
During the `NASA SnowEx campaign <https://snow.nasa.gov/campaigns/snowex>`_
a handful of met stations were deployed, and their data are now published at the
`NSIDC <https://nsidc.org/data/snex_met/versions/1>`_. These stations have been
mapped into metloom to make these data more convenient to use. The SnowEx data is
in a CSV file format, so any query downloads the appropriate files to a local
cache to reduce download times. For this to work you need a ``.netrc`` file and
an account with the NSIDC. See the
`access guide <https://nsidc.org/data/user-resources/help-center/programmatic-data-access-guide>`_
for more help.
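
For reference, NSIDC downloads authenticate with Earthdata credentials, so a typical
``~/.netrc`` entry looks roughly like the sketch below (the username and password are
placeholders; see the access guide above for the authoritative setup)::

    machine urs.earthdata.nasa.gov
        login your_earthdata_username
        password your_earthdata_password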

To pull stations using SnowEx::

    from metloom.pointdata import SnowExMet
    from metloom.variables import SnowExVariables
    from datetime import datetime

    start = datetime(2020, 1, 1)
    end = datetime(2020, 6, 1)

    # Grand Mesa Study Plot
    gmsp = SnowExMet('GMSP')
    df_gmsp = gmsp.get_daily_data(start, end, [SnowExVariables.SNOWDEPTH])
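
If you are unsure which station IDs are available, the stations are enumerated on the
class. This is a small sketch assuming ``SnowExMet`` exposes an ``ALLOWED_STATIONS``
enum the same way ``CSASMet`` does in this commit::

    from metloom.pointdata import SnowExMet

    # Each entry holds the station name, id, location and elevation
    for station in SnowExMet.ALLOWED_STATIONS:
        print(station.name, station.value)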

My variables aren't here
------------------------
6 changes: 5 additions & 1 deletion metloom/pointdata/__init__.py
@@ -6,9 +6,13 @@
 from .geosphere_austria import GeoSphereHistPointData, GeoSphereCurrentPointData
 from .norway import MetNorwayPointData
 from .cues import CuesLevel1
+from .files import CSVPointData, StationInfo
+from .snowex import SnowExMet
+from .csas import CSASMet
 
 __all__ = [
     "PointData", "PointDataCollection", "CDECPointData", "SnotelPointData",
     "MesowestPointData", "USGSPointData", "GeoSphereHistPointData",
-    "GeoSphereCurrentPointData", "CuesLevel1", "MetNorwayPointData"
+    "GeoSphereCurrentPointData", "CuesLevel1", "MetNorwayPointData",
+    "CSVPointData", "StationInfo", "SnowExMet", "CSASMet"
 ]
58 changes: 42 additions & 16 deletions metloom/pointdata/base.py
@@ -242,42 +242,68 @@ def points_from_geometry(
         raise NotImplementedError("points_from_geometry not implemented")
 
     @classmethod
-    def validate_sensor_df(cls, gdf: gpd.GeoDataFrame):
-        """
-        Validate that the GeoDataFrame returned is formatted correctly.
-        The goal of this method is to ensure base classes are returning a
-        consistent format of dataframe
-        """
-        if gdf is None:
-            return
-        assert isinstance(gdf, gpd.GeoDataFrame)
-        columns = gdf.columns
-        index_names = gdf.index.names
-        # check for required indexes
+    def _validate_geodataframe(cls, gdf: gpd.GeoDataFrame):
+        if not isinstance(gdf, gpd.GeoDataFrame):
+            raise DataValidationError('Returned DataFrame must be a GeoDataframe')
+
+    @classmethod
+    def _validate_df_indicies(cls, gdf: gpd.GeoDataFrame):
+        """ Confirm the df is indexed properly"""
         for ei in cls.EXPECTED_INDICES:
-            if ei not in index_names:
+            if ei not in gdf.index.names:
                 raise DataValidationError(
                     f"{ei} was expected, but not found as an"
                     f" index of the final dataframe"
                 )
 
+    @classmethod
+    def _validate_df_columns(cls, gdf: gpd.GeoDataFrame, expected_columns: List[str]):
         # check for expected columns - avoid modifying at class level
-        expected_columns = copy.deepcopy(cls.EXPECTED_COLUMNS)
         possible_extras = ["measurementDate", "quality_code"]
+        columns = gdf.columns
         for pe in possible_extras:
             if pe in columns:
                 expected_columns += [pe]
 
         for column in expected_columns:
             if column not in columns:
                 raise DataValidationError(
                     f"{column} was expected, but not found as a"
                     f" column of the final dataframe"
                 )
 
-        remaining_columns = [c for c in columns if c not in expected_columns]
+    @classmethod
+    def _validate_df_units(cls, gdf: gpd.GeoDataFrame, expected_columns: List[str]):
+        """
+        Check the variables requested have units associated
+        """
+        remaining_columns = [c for c in gdf.columns if c not in expected_columns]
         # make sure all variables have a units column as well
         for rc in remaining_columns:
             if "_units" not in rc:
-                assert f"{rc}_units" in remaining_columns
+                if f"{rc}_units" not in remaining_columns:
+                    raise DataValidationError(f'Missing units column for {rc}')
 
+    @classmethod
+    def validate_sensor_df(cls, gdf: gpd.GeoDataFrame):
+        """
+        Validate that the GeoDataFrame returned is formatted correctly.
+        The goal of this method is to ensure base classes are returning a
+        consistent format of dataframe
+        """
+        if gdf is None:
+            return
+
+        expected_columns = copy.deepcopy(cls.EXPECTED_COLUMNS)
+
+        # Confirm the dataframe is a geodataframe
+        cls._validate_geodataframe(gdf)
+        # Confirm the df is indexed properly
+        cls._validate_df_indicies(gdf)
+        # Confirm the columns are correct
+        cls._validate_df_columns(gdf, expected_columns)
+        # Confirm that any columns from variables have associated units
+        cls._validate_df_units(gdf, expected_columns)
 
     def __repr__(self):
         return f"{self.__class__.__name__}({self.id!r}, {self.name!r})"
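
With the assertions converted to ``DataValidationError`` exceptions, the associated tests
can assert on the exception directly. Below is a minimal sketch of that kind of check,
assuming ``DataValidationError`` is importable from ``metloom.pointdata.base`` (the import
path is an assumption here)::

    import pandas as pd
    import pytest

    from metloom.pointdata import CDECPointData
    # Assumed import location for this sketch; adjust to wherever the error is defined
    from metloom.pointdata.base import DataValidationError

    def test_validate_sensor_df_requires_geodataframe():
        # A plain DataFrame should now raise a clear exception instead of
        # tripping a bare assert
        with pytest.raises(DataValidationError):
            CDECPointData.validate_sensor_df(pd.DataFrame({"SNOWDEPTH": [1.0]}))
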
80 changes: 80 additions & 0 deletions metloom/pointdata/csas.py
@@ -0,0 +1,80 @@
"""
Data reader for the Center for Snow and Avalanche Studies
"""
from metloom.pointdata import CSVPointData, StationInfo
from metloom.variables import CSASVariables
import os
from datetime import datetime, timedelta


class InvalidDateRange(Exception):
    """
    Exception to indicate there is no known data for the available date range
    """


class CSASStationInfo(StationInfo):
    # Name, id, lat, long, elevation, http path
    SENATOR_BECK = ("Senator Beck Study Plot", "SBSP", 37.90688, -107.72627, 12186,
                    "2023/11/SBSP_1hr_2003-2009.csv")
    SWAMP_ANGEL = ("Swamp Angel Study Plot", "SASP", 37.90691, -107.71132, 11060,
                   "2023/11/SASP_1hr_2003-2009.csv")
    PUTNEY = ("Putney Study Plot", "PTSP", 37.89233, -107.69577, 12323,
              "2023/11/PTSP_1hr.csv")
    SENATOR_BECK_STREAM_GAUGE = ("Senator Beck Stream Gauge", "SBSG", 37.90678,
                                 -107.70943, 11030, "2023/11/SBSG_1hr.csv")


class CSASMet(CSVPointData):
    """
    """
    ALLOWED_VARIABLES = CSASVariables
    ALLOWED_STATIONS = CSASStationInfo

    # Data is in Mountain time
    UTC_OFFSET_HOURS = -7

    URL = "https://snowstudies.org/wp-content/uploads/"
    DATASOURCE = "CSAS"
    DOI = ""

    def _file_urls(self, station_id, start, end):
        """
        Navigate the system using dates. Data for SASP and SBSP is stored in
        two csvs: 2003-2009 and 2010-2023. Not sure what happens when the
        next year is made available. This function will grab the necessary urls
        depending on the requested data
        """
        urls = []

        if station_id in ['SASP', 'SBSP']:
            current_available_year = datetime.today().year - 1

            if start.year <= 2009:
                urls.append(os.path.join(self.URL, self._station_info.path))

            # Account for later file use or even straddling the data
            if start.year > 2009 or end.year > 2009:  # TODO: add to the info enum?
                partial = str(self._station_info.path).replace("2003", "2010")

                filename = partial.replace('2009', str(current_available_year))
                urls.append(os.path.join(self.URL, filename))

            if start.year < 2003 or end.year > current_available_year:
                raise InvalidDateRange(f"CSAS data is only available from 2003-"
                                       f"{current_available_year}")
        else:
            urls.append(os.path.join(self.URL, self._station_info.path))

        return urls

    @staticmethod
    def _parse_datetime(row):
        # Julian day is not zero based: Jan 1 == DOY 1
        dt = timedelta(days=int(row['DOY']) - 1, hours=int(row['Hour'] / 100))
        return datetime(int(row['Year']), 1, 1) + dt

    def _assign_datetime(self, resp_df):
        resp_df['datetime'] = resp_df.apply(lambda row: self._parse_datetime(row),
                                            axis=1)
        return resp_df.set_index('datetime')
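
For reference, the DOY/Hour parsing above treats day of year as one-based and the hour as a
military-style integer, so a row with ``Year=2023, DOY=32, Hour=1300`` resolves to 1 PM on
February 1st, 2023. A quick sketch calling the private helper directly just to illustrate
the arithmetic::

    from metloom.pointdata import CSASMet

    row = {"Year": 2023, "DOY": 32, "Hour": 1300}
    # timedelta(days=31, hours=13) added to Jan 1 2023
    print(CSASMet._parse_datetime(row))  # -> 2023-02-01 13:00:00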