From d18c9049cb335c1ac1ef233704544747449d469b Mon Sep 17 00:00:00 2001 From: ouyangwenyu Date: Mon, 25 Mar 2024 14:56:55 +0800 Subject: [PATCH] add data format checker for own data --- README.md | 51 ++- env-dev.yml | 2 +- hydromodel/datasets/__init__.py | 44 +++ hydromodel/datasets/camels_format_data.py | 392 ---------------------- hydromodel/datasets/data_preprocess.py | 313 ++++++++++++----- requirements.txt | 2 +- requirements_dev.txt | 2 +- scripts/check_data_format.py | 27 ++ test/test_data.py | 7 +- test/test_data_preprocess.py | 234 +++++++++++++ 10 files changed, 560 insertions(+), 514 deletions(-) delete mode 100644 hydromodel/datasets/camels_format_data.py create mode 100644 scripts/check_data_format.py create mode 100644 test/test_data_preprocess.py diff --git a/README.md b/README.md index d97dc05..34d0a15 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,26 @@ $ python -m ipykernel install --user --name xaj --display-name "xaj" ### Prepare data -To use your own data to run the model, we set a data interface, here is the convention: +To use your own data to run the model, you can prepare the data in the required format: + +For one basin (We only support one basin now), the data is put in one csv/txt file. +There are three necessary columns: "time", "prcp", "pet", and "flow". "time" is the time series, "prcp" is the precipitation, "pet" is the potential evapotranspiration, and "flow" is the observed streamflow. +The time series should be continuous (NaN values are allowed), and the time step should be the same for all columns. The time format should be "YYYY-MM-DD HH:MM:SS". The data should be sorted by time. + +You can run a checker function to see if the data is in the right format: + +```Shell +$ cd hydromodel/scripts +$ python check_data_format.py --data_file +``` + +Then, you can use the data_preprocess module to transform the data to the required format: + +```Shell +$ python datapreprocess4calibrate.py --data --exp +``` + +The data will be transformed in data interface, here is the convention: - All input data for models are three-dimensional NumPy array: [time, basin, variable], which means "time" series data for "variables" in "basins" @@ -90,36 +109,6 @@ More details about the analysis could be seen in show_results.ipynb file. It is Now we only provide some simple statistics calculations. -### How to make the sample data - -In this part, we simply introduce how we prepare the sample data. - -Here We provide an example for some basins in [the CAMELS dataset](https://ral.ucar.edu/solutions/products/camels), a very common used dataset for hydrological model evaluation. - -You can download CAMELS according to this [instruction](https://github.com/OuyangWenyu/hydrodataset). - -Check if you have successfully downloaded and put it in the right place. - -```Shell -$ conda activate xaj -$ python ->>> import os ->>> from hydrodataset.camels import Camels ->>> camels = Camels(data_path=os.path.join("camels", "camels_us"), download=False, region="US") -``` - -if any error is raised, please see this [instruction](https://github.com/OuyangWenyu/hydrodataset) again. 
- -Then, we provide a script to transform data organized like CAMELS to the required format, you can use it like this: - -```Shell -$ cd hydromodel/app -$ python datapreprocess4calibrate.py --camels_dir --exp --calibrate_period --test_period --basin_id -# such as: python datapreprocess4calibrate.py --camels_name camels_us --exp xxx --calibrate_period 1990-10-01 2000-10-01 --test_period 2000-10-01 2010-10-01 --basin_id 01439500 06885500 08104900 09510200 -``` - -Then you can see some files in hydromodel/example/xxx directory. - ## Why does hydro-model-xaj exist When we want to learn about the rainfall-runoff process and make forecasts for floods, etc. We often use classic hydrological diff --git a/env-dev.yml b/env-dev.yml index 2da99b2..2fea46a 100644 --- a/env-dev.yml +++ b/env-dev.yml @@ -26,4 +26,4 @@ dependencies: - twine - bump2version - muskingumcunge - - hydrodataset + - hydrodata diff --git a/hydromodel/datasets/__init__.py b/hydromodel/datasets/__init__.py index e69de29..f513940 100644 --- a/hydromodel/datasets/__init__.py +++ b/hydromodel/datasets/__init__.py @@ -0,0 +1,44 @@ +PRCP_NAME = "prcp(mm/day)" +PET_NAME = "pet(mm/day)" +ET_NAME = "et(mm/day)" +FLOW_NAME = "flow(m^3/s)" +NODE_FLOW_NAME = "node1_flow(m^3/s)" +AREA_NAME = "area(km^2)" +TIME_NAME = "time" +TIME_FORMAT = "%Y-%m-%d %H:%M:%S" +ID_NAME = "id" +NAME_NAME = "name" + + +def remove_unit_from_name(name_with_unit): + """ + Remove the unit from a variable name. + + Parameters + ---------- + name_with_unit : str + The name of the variable including its unit, e.g., "prcp(mm/day)". + + Returns + ------- + str + The name of the variable without the unit, e.g., "prcp". + """ + return name_with_unit.split("(")[0] + + +def get_unit_from_name(name_with_unit): + """ + Extract the unit from a variable name. + + Parameters + ---------- + name_with_unit : str + The name of the variable including its unit, e.g., "prcp(mm/day)". + + Returns + ------- + str + The unit of the variable, e.g., "mm/day". + """ + return name_with_unit.split("(")[1].strip(")") if "(" in name_with_unit else "" diff --git a/hydromodel/datasets/camels_format_data.py b/hydromodel/datasets/camels_format_data.py deleted file mode 100644 index 9b92714..0000000 --- a/hydromodel/datasets/camels_format_data.py +++ /dev/null @@ -1,392 +0,0 @@ -import collections -import os -from typing import Union -import pandas as pd -import numpy as np -from pandas.core.dtypes.common import is_string_dtype, is_numeric_dtype -from tqdm import tqdm - -from hydroutils import hydro_time -import hydrodataset - - -class MyCamels(hydrodataset.Camels): - def __init__(self, data_path, download=False, region: str = "CC"): - """ - Initialization for my own CAMELS format dataset - - Parameters - ---------- - data_path - where we put the dataset - download - if true, download - region - remember the name of your own region - """ - hydrodataset.camels.CAMELS_REGIONS = hydrodataset.camels.CAMELS_REGIONS + [ - region - ] - super().__init__(data_path, download, region) - - def set_data_source_describe(self) -> collections.OrderedDict: - """ - Introduce the files in the dataset and list their location in the file system - - Returns - ------- - collections.OrderedDict - the description for a CAMELS dataset - """ - camels_db = self.data_source_dir - # shp files of basins - camels_shp_files_dir = os.path.join(camels_db, "basin_boudaries") - # attr, flow and forcing data are all in the same dir. each basin has one dir. 
- flow_dir = os.path.join(camels_db, "streamflow") - sm_dir = os.path.join(camels_db, "soil_moisture") - et_dir = os.path.join(camels_db, "evapotranspiration") - forcing_dir = os.path.join(camels_db, "basin_mean_forcing") - attr_dir = os.path.join(camels_db, "attribute") - # no gauge id file for CAMELS_CC, just read from any attribute file - gauge_id_file = os.path.join(camels_db, "gage_points.csv") - attr_key_lst = [ - "climate", - "geology", - "land_cover", - "permeability_porosity", - "root_depth", - "soil", - "topo_elev_slope", - "topo_shape_factors", - ] - return collections.OrderedDict( - CAMELS_DIR=camels_db, - CAMELS_FLOW_DIR=flow_dir, - CAMELS_SM_DIR=sm_dir, - CAMELS_ET_DIR=et_dir, - CAMELS_FORCING_DIR=forcing_dir, - CAMELS_ATTR_DIR=attr_dir, - CAMELS_ATTR_KEY_LST=attr_key_lst, - CAMELS_GAUGE_FILE=gauge_id_file, - CAMELS_BASINS_SHP_DIR=camels_shp_files_dir, - ) - - def read_site_info(self) -> pd.DataFrame: - """ - Read the basic information of gages in a CAMELS dataset - - Returns - ------- - pd.DataFrame - basic info of gages - """ - camels_file = self.data_source_description["CAMELS_GAUGE_FILE"] - data = pd.read_csv(camels_file, sep=",", dtype={"gage_id": str}) - return data - - def get_constant_cols(self) -> np.array: - """ - all readable attrs in CAMELS - - Returns - ------- - np.array - attribute types - """ - data_folder = self.data_source_description["CAMELS_ATTR_DIR"] - files = np.sort(os.listdir(data_folder)) - attr_types = [] - for file_ in files: - file = os.path.join(data_folder, file_) - attr_tmp = pd.read_csv(file, sep=",", dtype={"gage_id": str}) - attr_types = attr_types + attr_tmp.columns[1:].values.tolist() - return np.array(attr_types) - - def get_relevant_cols(self) -> np.array: - """ - all readable forcing types - - Returns - ------- - np.array - forcing types - """ - forcing_dir = self.data_source_description["CAMELS_FORCING_DIR"] - forcing_file = os.path.join(forcing_dir, os.listdir(forcing_dir)[0]) - forcing_tmp = pd.read_csv(forcing_file, sep="\s+", dtype={"gage_id": str}) - return forcing_tmp.columns.values - - def get_target_cols(self) -> np.array: - """ - For CAMELS, the target vars are streamflows - - Returns - ------- - np.array - streamflow types - """ - # ssm is the surface soil moisture - return np.array(["Q", "ssm", "ET"]) - - def read_object_ids(self, **kwargs) -> np.array: - """ - read station ids - - Parameters - ---------- - **kwargs - optional params if needed - - Returns - ------- - np.array - gage/station ids - """ - return self.camels_sites["gage_id"].values - - def read_target_cols( - self, - gage_id_lst: Union[list, np.array] = None, - t_range: list = None, - target_cols: Union[list, np.array] = None, - **kwargs - ) -> np.array: - """ - read target values; for all CAMELS, they are streamflows except for CAMELS-CC (inlcude soil moisture) - - default target_cols is an one-value list - Notice: the unit of target outputs in different regions are not totally same - - Parameters - ---------- - gage_id_lst - station ids - t_range - the time range, for example, ["1990-01-01", "2000-01-01"] - target_cols - the default is None, but we neea at least one default target. 
- kwargs - some other params if needed - - Returns - ------- - np.array - streamflow data, 3-dim [station, time, streamflow], unit is m3/s - """ - if target_cols is None: - return np.array([]) - else: - nf = len(target_cols) - t_range_list = hydro_time.t_range_days(t_range) - nt = t_range_list.shape[0] - y = np.full([len(gage_id_lst), nt, nf], np.nan) - for j in tqdm(range(len(target_cols)), desc="Read Q/SSM/ET data of CAMELS-CC"): - for k in tqdm(range(len(gage_id_lst))): - if target_cols[j] == "ssm": - sm_file = os.path.join( - self.data_source_description["CAMELS_SM_DIR"], - gage_id_lst[k] + "_lump_nasa_usda_smap.txt", - ) - sm_data = pd.read_csv(sm_file, sep=",") - df_date = sm_data[["Year", "Mnth", "Day"]] - df_date.columns = ["year", "month", "day"] - date = pd.to_datetime(df_date).values.astype("datetime64[D]") - [c, ind1, ind2] = np.intersect1d( - date, t_range_list, return_indices=True - ) - y[k, ind2, j] = sm_data["ssm(mm)"].values[ind1] - elif target_cols[j] == "ET": - et_file = os.path.join( - self.data_source_description["CAMELS_ET_DIR"], - gage_id_lst[k] + "_lump_modis16a2v006_et.txt", - ) - et_data = pd.read_csv(et_file, sep=",") - df_date = et_data[["Year", "Mnth", "Day"]] - df_date.columns = ["year", "month", "day"] - # all dates in a table - date = pd.to_datetime(df_date).values.astype("datetime64[D]") - if ( - np.datetime64(str(date[-1].astype(object).year) + "-12-31") - > date[-1] - > np.datetime64(str(date[-1].astype(object).year) + "-12-24") - ): - # the final date in all dates, if it is a date in the end of a year, its internal is 5 or 6 - final_date = np.datetime64( - str(date[-1].astype(object).year + 1) + "-01-01" - ) - else: - final_date = date[-1] + np.timedelta64(8, "D") - date_all = hydro_time.t_range_days( - hydro_time.t_days_lst2range([date[0], final_date]) - ) - t_range_final = np.intersect1d(date_all, t_range_list) - [_, ind3, ind4] = np.intersect1d( - date, t_range_final, return_indices=True - ) - - days_interval = [y - x for x, y in zip(ind4, ind4[1:])] - # get the final range - if ( - t_range_final[-1].item().month == 12 - and t_range_final[-1].item().day == 31 - ): - final_timedelta = ( - t_range_final[-1].item() - t_range_final[ind4[-1]].item() - ) - final_day_interval = [final_timedelta.days] - else: - final_day_interval = [8] - days_interval = np.array(days_interval + final_day_interval) - # there may be some missing data, so that some interval will be larger than 8 - days_interval[np.where(days_interval > 8)] = 8 - # we use mean value rather than sum, because less error when predicting for every day - # for example, mean: [1, x, x, 2, x, x, 3] is obs, [1, 1, 1, 2, 2, 2, 3] is pred, - # sum: [3, x, x, 6, x, x, 9] is obs, [1, 1, 1, 2, 2, 2, 3] is pred - # the final day's error is significant when using sum - # although a better way is to extend [1, 1, 1, 2, 2, 2, 3] to [1, 1, 1, 2, 2, 2, 3, 3, 3] - y[k, ind4, j] = ( - et_data["ET(kg/m^2/8day)"][ind3] / days_interval - ) - # More notice: it is only for unified process to divide by 35.314666721489 - # notice the value's unit is kg/m2/8d and has a scale factor 0.1 - # more information can be seen here: https://www.ntsg.umt.edu/project/modis/mod16.php - # it says: "The users should multiply 0.1 to get the real ET/PET values in mm/8day or mm/month" - else: - # only one streamflow type: Q - flow_file = os.path.join( - self.data_source_description["CAMELS_FLOW_DIR"], - gage_id_lst[k] + ".csv", - ) - flow_data = pd.read_csv(flow_file, sep=",") - date = pd.to_datetime(flow_data["DATE"]).values.astype( - 
"datetime64[D]" - ) - [c, ind1, ind2] = np.intersect1d( - date, t_range_list, return_indices=True - ) - y[k, ind2, j] = flow_data["Q"].values[ind1] - return y - - def read_relevant_cols( - self, - gage_id_lst: list = None, - t_range: list = None, - var_lst: list = None, - forcing_type="daymet", - ) -> np.array: - """ - Read forcing data - - Parameters - ---------- - gage_id_lst - station ids - t_range - the time range, for example, ["1990-01-01", "2000-01-01"] - var_lst - forcing variable types - forcing_type - only for CAMELS-US, don't care it - Returns - ------- - np.array - forcing data - """ - t_range_list = hydro_time.t_range_days(t_range) - nt = t_range_list.shape[0] - x = np.full([len(gage_id_lst), nt, len(var_lst)], np.nan) - for k in tqdm(range(len(gage_id_lst)), desc="Read forcing data of CAMELS-CC"): - forcing_file = os.path.join( - self.data_source_description["CAMELS_FORCING_DIR"], - gage_id_lst[k] + "_lump_era5_land_forcing.txt", - ) - forcing_data = pd.read_csv(forcing_file, sep=" ") - df_date = forcing_data[["Year", "Mnth", "Day"]] - df_date.columns = ["year", "month", "day"] - date = pd.to_datetime(df_date).values.astype("datetime64[D]") - - [c, ind1, ind2] = np.intersect1d(date, t_range_list, return_indices=True) - for j in range(len(var_lst)): - if "evaporation" in var_lst[j]: - # evaporation value are all negative (maybe upward flux is marked as negative) - x[k, ind2, j] = forcing_data[var_lst[j]].values[ind1] * -1 * 1e3 - # unit of prep and pet is m, tran them to mm - elif "precipitation" in var_lst[j]: - prcp = forcing_data[var_lst[j]].values - # there are a few negative values for prcp, set them 0 - prcp[prcp < 0] = 0.0 - x[k, ind2, j] = prcp[ind1] * 1e3 - else: - x[k, ind2, j] = forcing_data[var_lst[j]].values[ind1] - return x - - def read_attr_all(self): - data_folder = self.data_source_description["CAMELS_ATTR_DIR"] - key_lst = self.data_source_description["CAMELS_ATTR_KEY_LST"] - f_dict = dict() # factorize dict - var_dict = dict() - var_lst = list() - out_lst = list() - gage_dict = self.camels_sites - camels_str = "" - sep_ = "," - for key in key_lst: - data_file = os.path.join(data_folder, key + ".csv") - data_temp = pd.read_csv(data_file, sep=sep_) - var_lst_temp = list(data_temp.columns[1:]) - var_dict[key] = var_lst_temp - var_lst.extend(var_lst_temp) - k = 0 - gage_id_key = "gage_id" - n_gage = len(gage_dict[gage_id_key].values) - out_temp = np.full([n_gage, len(var_lst_temp)], np.nan) - for field in var_lst_temp: - if is_string_dtype(data_temp[field]): - value, ref = pd.factorize(data_temp[field], sort=True) - out_temp[:, k] = value - f_dict[field] = ref.tolist() - elif is_numeric_dtype(data_temp[field]): - out_temp[:, k] = data_temp[field].values - k = k + 1 - out_lst.append(out_temp) - out = np.concatenate(out_lst, 1) - return out, var_lst, var_dict, f_dict - - def read_constant_cols( - self, gage_id_lst=None, var_lst=None, is_return_dict=False - ) -> Union[tuple, np.array]: - """ - Read Attributes data - - Parameters - ---------- - gage_id_lst - station ids - var_lst - attribute variable types - is_return_dict - if true, return var_dict and f_dict for CAMELS_US - Returns - ------- - Union[tuple, np.array] - if attr var type is str, return factorized data. 
- When we need to know what a factorized value represents, we need return a tuple; - otherwise just return an array - """ - attr_all, var_lst_all, var_dict, f_dict = self.read_attr_all() - ind_var = [var_lst_all.index(var) for var in var_lst] - id_lst_all = self.read_object_ids() - # Notice the sequence of station ids ! Some id_lst_all are not sorted, so don't use np.intersect1d - ind_grid = [id_lst_all.tolist().index(tmp) for tmp in gage_id_lst] - temp = attr_all[ind_grid, :] - out = temp[:, ind_var] - if is_return_dict: - return out, var_dict, f_dict - else: - return out - - def read_area(self, object_ids) -> np.array: - return self.read_constant_cols(object_ids, ["Area"], is_return_dict=False) - - def read_mean_prep(self, object_ids) -> np.array: - return self.read_constant_cols(object_ids, ["p_mean"], is_return_dict=False) diff --git a/hydromodel/datasets/data_preprocess.py b/hydromodel/datasets/data_preprocess.py index 0b46221..e29ec4b 100644 --- a/hydromodel/datasets/data_preprocess.py +++ b/hydromodel/datasets/data_preprocess.py @@ -1,110 +1,257 @@ """ Author: Wenyu Ouyang Date: 2022-10-25 21:16:22 -LastEditTime: 2024-03-21 18:36:25 +LastEditTime: 2024-03-25 14:50:32 LastEditors: Wenyu Ouyang Description: preprocess data for models in hydro-model-xaj -FilePath: \hydro-model-xaj\hydromodel\data\data_preprocess.py +FilePath: \hydro-model-xaj\hydromodel\datasets\data_preprocess.py Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. """ +import os +import re import numpy as np import pandas as pd from sklearn.model_selection import KFold -import sys -import os -from pathlib import Path from collections import OrderedDict +import xarray as xr -import hydrodataset from hydroutils import hydro_time, hydro_file -sys.path.append(os.path.dirname(Path(os.path.abspath(__file__)).parent.parent)) -from hydromodel.datasets import camels_format_data +from hydromodel import CACHE_DIR +from hydromodel.datasets import * -def trans_camels_format_to_xaj_format( - camels_data_dir, basin_ids: list, t_range: list, json_file, npy_file -): - """tranform data with camels format to hydro-model-xaj format +def check_tsdata_format(file_path): + """ + Checks the time-series data for required and optional columns + used in hydrological modeling. + + Parameters + ---------- + file_path : str + Path to the hydrological data file. + + Returns + ------- + bool + True if the data file format is correct, False otherwise. + """ + # prcp means precipitation, pet means potential evapotranspiration, flow means streamflow + required_columns = [ + TIME_NAME, + PRCP_NAME, + PET_NAME, + FLOW_NAME, + ] + # et means evapotranspiration, node_flow means upstream streamflow + # node1 means the first upstream node, node2 means the second upstream node, etc. + # these nodes are the nearest upstream nodes of the target node + # meaning: if node1_flow, node2_flow, and more upstream nodes are parellel. + # No serial relationship + optional_columns = [ET_NAME, NODE_FLOW_NAME] + + try: + data = pd.read_csv(file_path) + + # Check required columns + if any(column not in data.columns for column in required_columns): + print(f"Missing required columns in file: {file_path}") + return False + + # Check optional columns + for column in optional_columns: + if column not in data.columns: + print( + f"Optional column '{column}' not found in file: {file_path}, but it's okay." 
+ ) + # Check node_flow columns (flexible number of nodes) + node_flow_columns = [ + col for col in data.columns if re.match(r"node\d+_flow", col) + ] + if not node_flow_columns: + print(f"No 'node_flow' columns found in file: {file_path}, but it's okay.") + + # Check time format and sorting + try: + data["time"] = pd.to_datetime(data["time"], format=TIME_FORMAT) + except ValueError: + print(f"Time format is incorrect in file: {file_path}") + return False + + if not data["time"].is_monotonic_increasing: + print(f"Data is not sorted by time in file: {file_path}") + return False + + # Check for consistent time intervals + time_differences = ( + data["time"].diff().dropna() + ) # Calculate differences and remove NaN + if not all(time_differences == time_differences.iloc[0]): + print(f"Time series is not at consistent intervals in file: {file_path}") + return False + + return True - CAMELS format could be seen here: https://gdex.ucar.edu/dataset/camels/file.html - download basin_timeseries_v1p2_metForcing_obsFlow.zip and unzip it, you will see the format of data + except Exception as e: + print(f"Error reading file {file_path}: {e}") + return False - hydro-model-xaj format: see README.md file -- https://github.com/OuyangWenyu/hydro-model-xaj + +def check_basin_attr_format(file_path): + """ + Checks the basin attributes data for required columns. Parameters ---------- - camels_data_dir : str - the directory of your CAMELS format data - basin_ids : list - a list of basins' ids which you choose for modeling - t_range: list - for example, ["2014-10-01", "2021-10-01"] - json_file: str - where to save the json file - npy_file: str - where to save the npy file + file_path : str + Path to the basin attributes data file. + + Returns + ------- + bool + True if the basin attributes file format is correct, False otherwise. 
""" - if camels_data_dir.stem == "camels_cc": - # this is for the author's own data format, for camels we don't need this - camels = camels_format_data.MyCamels(camels_data_dir) - p_pet = camels.read_relevant_cols( - gage_id_lst=basin_ids, - t_range=t_range, - var_lst=["total_precipitation", "potential_evaporation"], - ) - q = camels.read_target_cols( - gage_id_lst=basin_ids, t_range=t_range, target_cols=["Q"] - ) - else: - region = camels_data_dir.stem.split("_")[-1].upper() - camels = hydrodataset.Camels(camels_data_dir, region=region) - flow_tag = camels.get_target_cols() - ft3persec2m3persec = 1 / 35.314666721489 - if region == "US": - pet = camels.read_camels_us_model_output_data( - basin_ids, t_range, var_lst=["PET"] - ) - p = camels.read_relevant_cols( - gage_id_lst=basin_ids, - t_range=t_range, - var_lst=["prcp"], + required_columns = [ID_NAME, NAME_NAME, AREA_NAME] + + try: + data = pd.read_csv(file_path) + + if missing_required_columns := [ + col for col in required_columns if col not in data.columns + ]: + print( + f"Missing required columns in basin attributes file: {file_path}: {missing_required_columns}" ) - p_pet = np.concatenate([p, pet], axis=2) - else: - raise NotImplementedError("Only CAMELS-US is supported now.") - q = camels.read_target_cols( - gage_id_lst=basin_ids, t_range=t_range, target_cols=flow_tag - ) - # TODO: camels's streamflow data is in ft3/s, need refactor to unify the unit - q = q * ft3persec2m3persec - # generally streamflow's unit is m3/s, we transform it to mm/day - # basin areas also should be saved, - # we will use it to transform streamflow's unit to m3/s after we finished predicting - basin_area = camels.read_area(basin_ids) - # 1 km2 = 10^6 m2 - km2tom2 = 1e6 - # 1 m = 1000 mm - mtomm = 1000 - # 1 day = 24 * 3600 s - daytos = 24 * 3600 - temparea = np.tile(basin_area, (1, q.shape[1])) - q = np.expand_dims(q[:, :, 0] / (temparea * km2tom2) * mtomm * daytos, axis=2) - - date_lst = [str(t)[:10] for t in hydro_time.t_range_days(t_range)] - data_info = OrderedDict( - { - "time": date_lst, - "basin": basin_ids, - "variable": ["prcp(mm/day)", "pet(mm/day)", "streamflow(mm/day)"], - "area": basin_area.flatten().tolist(), - } - ) - hydro_file.serialize_json(data_info, json_file) - hydro_file.serialize_numpy( - np.swapaxes(np.concatenate((p_pet, q), axis=2), 0, 1), npy_file - ) + return False + + # Additional checks (e.g., datatype checks, non-empty rows) can be added here + + return True + + except Exception as e: + print(f"Error reading basin attributes file {file_path}: {e}") + return False + + +def check_folder_contents(folder_path, basin_attr_file="basin_attributes.csv"): + """ + Checks all time series data files in a folder and a single basin attributes file. + + Parameters + ---------- + folder_path : str + Path to the folder containing the time series data files. + basin_attr_file : str + Filename of the basin attributes file, default is "basin_attributes.csv". + + Returns + ------- + bool + True if all files in the folder and the basin attributes file are correct, False otherwise. 
+ """ + # 检查流域属性文件 + if not check_basin_attr_format(os.path.join(folder_path, basin_attr_file)): + return False + + # 获取流域ID列表 + basin_ids = pd.read_csv(os.path.join(folder_path, basin_attr_file))["id"].tolist() + + # 检查每个流域的时序文件 + for basin_id in basin_ids: + file_name = f"basin_{basin_id}.csv" + file_path = os.path.join(folder_path, file_name) + + if not os.path.exists(file_path): + print(f"Missing time series data file for basin {basin_id}: {file_path}") + return False + + if not check_tsdata_format(file_path): + print(f"Time series data format check failed for file: {file_path}") + return False + + return True + + +def process_and_save_data_as_nc( + folder_path, + save_folder=CACHE_DIR, + nc_attrs_file="attributes.nc", + nc_ts_file="timeseries.nc", +): + # 验证文件夹内容 + if not check_folder_contents(folder_path): + print("Folder contents validation failed.") + return False + + # 读取流域属性 + basin_attr_file = os.path.join(folder_path, "basin_attributes.csv") + basin_attrs = pd.read_csv(basin_attr_file) + + # 创建属性数据集 + ds_attrs = xr.Dataset.from_dataframe(basin_attrs.set_index(ID_NAME)) + new_column_names = {} + units = {} + + for col in basin_attrs.columns: + new_name = remove_unit_from_name(col) + unit = get_unit_from_name(col) + new_column_names[col] = new_name + if unit: + units[new_name] = unit + + basin_attrs.rename(columns=new_column_names, inplace=True) + + # 创建不带单位的数据集 + ds_attrs = xr.Dataset.from_dataframe(basin_attrs.set_index(ID_NAME)) + + # 为有单位的变量添加单位属性 + for var_name, unit in units.items(): + ds_attrs[var_name].attrs["units"] = unit + # 初始化时序数据集 + ds_ts = xr.Dataset() + + # 初始化用于保存单位的字典 + units = {} + + # 获取流域ID列表 + basin_ids = basin_attrs[ID_NAME].tolist() + + # 为每个流域读取并处理时序数据 + for i, basin_id in enumerate(basin_ids): + file_name = f"basin_{basin_id}.csv" + file_path = os.path.join(folder_path, file_name) + data = pd.read_csv(file_path) + data[TIME_NAME] = pd.to_datetime(data[TIME_NAME]) + + # 在处理第一个流域时构建单位字典 + if i == 0: + for col in data.columns: + new_name = remove_unit_from_name(col) + if unit := get_unit_from_name(col): + units[new_name] = unit + + # 修改列名以移除单位 + renamed_columns = {col: remove_unit_from_name(col) for col in data.columns} + data.rename(columns=renamed_columns, inplace=True) + + # 将 DataFrame 转换为 xarray Dataset + ds_basin = xr.Dataset.from_dataframe(data.set_index(TIME_NAME)) + + # 为每个变量设置单位属性 + for var in ds_basin.data_vars: + if var in units: + ds_basin[var].attrs["units"] = units[var] + # 添加 basin 坐标 + ds_basin = ds_basin.expand_dims({"basin": [basin_id]}) + # 合并到主数据集 + ds_ts = xr.merge([ds_ts, ds_basin], compat="no_conflicts") + + # 保存为 NetCDF 文件 + ds_attrs.to_netcdf(os.path.join(save_folder, nc_attrs_file)) + ds_ts.to_netcdf(os.path.join(save_folder, nc_ts_file)) + + return True def split_train_test(json_file, npy_file, train_period, test_period): diff --git a/requirements.txt b/requirements.txt index f0de31a..c031e65 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ requests muskingumcunge -hydrodataset +hydrodata diff --git a/requirements_dev.txt b/requirements_dev.txt index 522f725..d9c0900 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -26,4 +26,4 @@ requests muskingumcunge -hydrodataset +hydrodata diff --git a/scripts/check_data_format.py b/scripts/check_data_format.py new file mode 100644 index 0000000..4827ac4 --- /dev/null +++ b/scripts/check_data_format.py @@ -0,0 +1,27 @@ +import argparse + +from hydromodel.datasets.data_preprocess import check_tsdata_format + + +def main(): + parser = 
argparse.ArgumentParser( + description="Check the format of hydrological data." + ) + parser.add_argument( + "--data_file", + type=str, + required=True, + help="Path to the hydrological data file", + ) + + args = parser.parse_args() + file_path = args.data_file + + if check_tsdata_format(file_path): + print("Data format is correct.") + else: + print("Data format is incorrect.") + + +if __name__ == "__main__": + main() diff --git a/test/test_data.py b/test/test_data.py index c4c8342..b6c6efc 100644 --- a/test/test_data.py +++ b/test/test_data.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2022-10-25 21:16:22 -LastEditTime: 2024-03-22 09:54:30 +LastEditTime: 2024-03-25 11:29:10 LastEditors: Wenyu Ouyang Description: Test for data preprocess FilePath: \hydro-model-xaj\test\test_data.py @@ -9,6 +9,7 @@ """ import os + from hydrodataset import Camels from hydromodel import SETTING @@ -21,7 +22,3 @@ def test_load_dataset(): ["01013500"], ["2010-01-01", "2014-01-01"], ["streamflow"] ) print(data) - - -def test_read_your_own_data(): - pass diff --git a/test/test_data_preprocess.py b/test/test_data_preprocess.py new file mode 100644 index 0000000..99becc0 --- /dev/null +++ b/test/test_data_preprocess.py @@ -0,0 +1,234 @@ +import pytest +import os +import pandas as pd +import xarray as xr + +from hydromodel.datasets import * +from hydromodel.datasets.data_preprocess import process_and_save_data_as_nc +from hydromodel.datasets.data_preprocess import check_tsdata_format +from hydromodel.datasets.data_preprocess import check_basin_attr_format +from hydromodel.datasets.data_preprocess import check_folder_contents + + +@pytest.fixture() +def basin_attrs_file(tmp_path): + # Create a temporary CSV file with required columns + file_path = tmp_path / "basin_attributes.csv" + data = pd.DataFrame( + { + ID_NAME: [1, 2, 3], + NAME_NAME: ["Basin A", "Basin B", "Basin C"], + AREA_NAME: [100, 200, 300], + } + ) + data.to_csv(file_path, index=False) + return str(file_path) + + +@pytest.fixture() +def all_data_dir(tmp_path): + # Create time series data files for basins 1, 2, and 3 + for basin_id in [1, 2, 3]: + file_name = f"basin_{basin_id}.csv" + file_path = tmp_path / file_name + + data = pd.DataFrame( + { + TIME_NAME: [ + "2022-01-01 00:00:00", + "2022-01-02 00:00:00", + "2022-01-03 00:00:00", + ], + PET_NAME: [1, 2, 3], + PRCP_NAME: [4, 5, 6], + FLOW_NAME: [7, 8, 9], + ET_NAME: [10, 11, 12], + NODE_FLOW_NAME: [13, 14, 15], + } + ) + data.to_csv(file_path, index=False) + + return str(tmp_path) + + +def test_check_basin_attributes_format_with_valid_file(basin_attrs_file): + assert check_basin_attr_format(basin_attrs_file) == True + + +def test_check_basin_attributes_format_with_missing_columns(basin_attrs_file): + # Remove the 'name' column from the file + data = pd.read_csv(basin_attrs_file) + data.drop(columns=["name"], inplace=True) + data.to_csv(basin_attrs_file, index=False) + + assert check_basin_attr_format(basin_attrs_file) == False + + +def test_check_basin_attributes_format_with_invalid_file(basin_attrs_file): + # Write invalid data to the file + with open(basin_attrs_file, "w") as f: + f.write("Invalid data") + + assert check_basin_attr_format(basin_attrs_file) == False + + +def test_check_your_own_data(all_data_dir): + """ + Test to check the format of hydrological modeling data. 
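+
+    The checked file, basin_1.csv, is created by the all_data_dir fixture
+    above, so check_tsdata_format is expected to return True; the file is
+    removed again at the end of the test.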
+ """ + # Define a sample file path + file_path = os.path.join(all_data_dir, "basin_1.csv") + + # Check the format of hydrological modeling data + result = check_tsdata_format(file_path) + + # Assert that the result is True + assert result + + # Clean up the sample file + os.remove(file_path) + + +def test_check_your_own_data_missing_required_columns(tmpdir): + """ + Test to check the format of hydrological modeling data with missing required columns. + """ + # Define a sample file path + file_path = os.path.join(str(tmpdir), "hydro_data.csv") + + # Create a sample file with missing required columns + sample_data = pd.DataFrame( + { + PRCP_NAME: [4, 5, 6], + FLOW_NAME: [7, 8, 9], + ET_NAME: [10, 11, 12], + NODE_FLOW_NAME: [13, 14, 15], + } + ) + sample_data.to_csv(file_path, index=False) + + # Check the format of hydrological modeling data + result = check_tsdata_format(file_path) + + # Assert that the result is False + assert not result + + # Clean up the sample file + os.remove(file_path) + + +def test_check_your_own_data_missing_optional_columns(tmpdir): + """ + Test to check the format of hydrological modeling data with missing optional columns. + """ + # Define a sample file path + file_path = os.path.join(str(tmpdir), "hydro_data.csv") + + # Create a sample file with missing optional columns + sample_data = pd.DataFrame( + { + TIME_NAME: [ + "2022-01-01 00:00:00", + "2022-01-02 00:00:00", + "2022-01-03 00:00:00", + ], + PET_NAME: [1, 2, 3], + PRCP_NAME: [4, 5, 6], + FLOW_NAME: [7, 8, 9], + } + ) + sample_data.to_csv(file_path, index=False) + + # Check the format of hydrological modeling data + result = check_tsdata_format(file_path) + + # Assert that the result is True + assert result + + # Clean up the sample file + os.remove(file_path) + + +def test_check_your_own_data_invalid_file(tmpdir): + """ + Test to check the format of an invalid hydrological modeling data file. 
+ """ + # Define a sample file path + file_path = os.path.join(str(tmpdir), "hydro_data.csv") + + # Create an invalid file (not a CSV) + with open(file_path, "w") as f: + f.write("This is not a valid CSV file.") + + # Check the format of hydrological modeling data + result = check_tsdata_format(file_path) + + # Assert that the result is False + assert not result + + # Clean up the sample file + os.remove(file_path) + + +def test_check_folder_contents_with_valid_files(basin_attrs_file, all_data_dir): + assert check_folder_contents(all_data_dir, basin_attrs_file) == True + + +def test_check_folder_contents_with_missing_time_series_data_file( + basin_attrs_file, tmp_path +): + # 创建一个模拟的时序数据文件,然后删除它,模拟缺失的文件场景 + file_path = tmp_path / "basin_2.csv" + with open(file_path, "w") as f: + f.write("Dummy data") + os.remove(file_path) + + # 调用检查函数,确保它正确地返回 False + assert check_folder_contents(tmp_path, basin_attrs_file) == False + + +def test_check_folder_contents_with_invalid_time_series_data_file( + basin_attrs_file, tmp_path +): + # Create an invalid time series data file for basin 1 + file_path = tmp_path / "basin_1.csv" + with open(file_path, "w") as f: + f.write("Invalid data") + + assert check_folder_contents(tmp_path, basin_attrs_file) == False + + +def test_check_folder_contents_with_missing_basin_attributes_file( + tmp_path, basin_attrs_file +): + # Remove basin attributes file + os.remove(basin_attrs_file) + + assert check_folder_contents(tmp_path) == False + + +def test_check_folder_contents_with_missing_basin_attributes_column( + basin_attrs_file, all_data_dir +): + # Remove 'name' column from basin attributes file + data = pd.read_csv(basin_attrs_file) + data.drop(columns=["name"], inplace=True) + data.to_csv(basin_attrs_file, index=False) + + assert check_folder_contents(all_data_dir, basin_attrs_file) == False + + +def test_process_and_save_data_as_nc_with_valid_data(all_data_dir, basin_attrs_file): + # Create a temporary folder for testing + folder_path = os.path.join(all_data_dir, "test_folder") + os.makedirs(folder_path) + + # Call the function to process and save the data as NetCDF files + result = process_and_save_data_as_nc(all_data_dir, folder_path) + + # Assert that the function returns True + assert result + + # Assert that the NetCDF files are created + assert os.path.exists(os.path.join(folder_path, "attributes.nc")) + assert os.path.exists(os.path.join(folder_path, "timeseries.nc"))