From 206ba2b2952f4265f77279ef3f4e9aa84521e9c0 Mon Sep 17 00:00:00 2001 From: "ouyang,wenyu" Date: Tue, 26 Mar 2024 21:57:44 +0800 Subject: [PATCH] refactor all output interface; add evaluate script for sceua calibrated xaj --- hydromodel/datasets/data_postprocess.py | 563 +++++++++++------- hydromodel/datasets/data_preprocess.py | 41 +- hydromodel/models/gr4j.py | 3 +- hydromodel/models/model_dict.py | 2 +- hydromodel/trainers/calibrate_ga.py | 3 +- hydromodel/trainers/evaluate.py | 258 ++++++++ hydromodel/trainers/train_utils.py | 399 ------------- scripts/calibrate_xaj.py | 33 +- scripts/evaluate_xaj.py | 176 ++---- ...stprocess4calibrate.py => post_process.py} | 8 +- test/test_data_postprocess.py | 2 +- test/test_show_results.py | 12 +- 12 files changed, 728 insertions(+), 772 deletions(-) create mode 100644 hydromodel/trainers/evaluate.py delete mode 100644 hydromodel/trainers/train_utils.py rename scripts/{datapostprocess4calibrate.py => post_process.py} (95%) diff --git a/hydromodel/datasets/data_postprocess.py b/hydromodel/datasets/data_postprocess.py index cd7dd76..00412d2 100644 --- a/hydromodel/datasets/data_postprocess.py +++ b/hydromodel/datasets/data_postprocess.py @@ -1,261 +1,376 @@ +"""Show results of calibration and validation.""" import os +from matplotlib import pyplot as plt import numpy as np import pandas as pd -import pathlib import spotpy -from hydroutils import hydro_file +from hydroutils import hydro_file, hydro_stat -from hydromodel.models.model_config import MODEL_PARAM_DICT -from hydromodel.models.xaj import xaj +def plot_sim_and_obs( + date, + sim, + obs, + save_fig, + xlabel="Date", + ylabel=None, +): + # matplotlib.use("Agg") + fig = plt.figure(figsize=(9, 6)) + ax = fig.subplots() + ax.plot( + date, + sim, + color="black", + linestyle="solid", + label="Simulation", + ) + ax.plot( + date, + obs, + "r.", + markersize=3, + label="Observation", + ) + ax.set_xlabel(xlabel) + ax.set_ylabel(ylabel) + plt.legend(loc="upper right") + plt.tight_layout() + plt.savefig(save_fig, bbox_inches="tight") + # plt.cla() + plt.close() -def read_save_sceua_calibrated_params(basin_id, save_dir, sceua_calibrated_file_name): - """ - read the parameters' file generated by spotpy SCE-UA when finishing calibration - - We also save the parameters of the best model run to a file - - Parameters - ---------- - basin_id - id of a basin - save_dir - the directory where we save params - sceua_calibrated_file_name - the parameters' file generated by spotpy SCE-UA when finishing calibration - - Returns - ------- - """ - results = spotpy.analyser.load_csv_results(sceua_calibrated_file_name) - bestindex, bestobjf = spotpy.analyser.get_minlikeindex( - results - ) # 结果数组中具有最小目标函数的位置的索引 - best_model_run = results[bestindex] - fields = [word for word in best_model_run.dtype.names if word.startswith("par")] - best_calibrate_params = pd.DataFrame(list(best_model_run[fields])) - save_file = os.path.join(save_dir, basin_id + "_calibrate_params.txt") - best_calibrate_params.to_csv(save_file, sep=",", index=False, header=True) - return np.array(best_calibrate_params).reshape(1, -1) # 返回一列最佳的结果 +def plot_train_iteration(likelihood, save_fig): + # matplotlib.use("Agg") + fig = plt.figure(figsize=(9, 6)) + ax = fig.subplots() + ax.plot(likelihood) + ax.set_ylabel("RMSE") + ax.set_xlabel("Iteration") + plt.savefig(save_fig, bbox_inches="tight") + # plt.cla() + plt.close() -def summarize_parameters(result_dir, model_info: dict): +def show_sceua_cali_result( + sceua_calibrated_file, + warmup_length, + save_dir, + basin_id, + train_period, + result_unit="mm/hour", + basin_area=None, + prcp=None, +): """ - output parameters of all basins to one file + Plot all year result to see the effect of optimized parameters Parameters ---------- - result_dir - the directory where we save results - model_name - the name of the model + sceua_calibrated_file + the result file saved after optimizing + basin_id + id of the basin + train_period + the period of training data + result_unit + the unit of result, default is mm/day, we will convert it to m3/s + basin_area + the area of the basin, its unit must be km2 Returns ------- - + None """ - path = pathlib.Path(result_dir) - all_basins_dirs = [file for file in path.iterdir() if file.is_dir()] - params = [] - basin_ids = [] - for basin_dir in all_basins_dirs: - basin_id = basin_dir.stem - columns = MODEL_PARAM_DICT[model_info["name"]]["param_name"] - params_txt = pd.read_csv( - os.path.join(basin_dir, basin_id + "_calibrate_params.txt") - ) - params_df = pd.DataFrame(params_txt.values.T, columns=columns) - params.append(params_df) - basin_ids.append(basin_id) - params_dfs = pd.concat(params, axis=0) - params_dfs.index = basin_ids - print(params_dfs) - params_dfs_ = params_dfs.transpose() - params_csv_file = os.path.join(result_dir, "basins_params.csv") - params_dfs_.to_csv(params_csv_file, sep=",", index=True, header=True) + # Load the results gained with the sceua sampler, stored in SCEUA_xaj.csv + # results = [] + # for chunk in pd.read_csv(sceua_calibrated_file, chunksize=100000 ): + # results.append(chunk) + # results = pd.concat(results) + results = spotpy.analyser.load_csv_results(sceua_calibrated_file) # 读取结果 + # Plot how the objective function was minimized during sampling + if not os.path.exists(save_dir): # 绘制采样过程中目标函数的最小化情况 + os.makedirs(save_dir) + plot_train_iteration( + results["like1"], + os.path.join(save_dir, "train_iteration.png"), # 绘制迭代中的RMSE + ) + # Plot the best model run + # Find the run_id with the minimal objective function value + bestindex, bestobjf = spotpy.analyser.get_minlikeindex( + results + ) # 绘制最佳模型图并找到run—id + # Select best model run + best_model_run = results[bestindex] # 选择最佳模型结果 + # Filter results for simulation results #最佳模型模拟结果 + fields = [word for word in best_model_run.dtype.names if word.startswith("sim")] + best_simulation = list(best_model_run[fields]) + convert_unit_sim = units.convert_unit( + np.array(best_simulation).reshape(1, -1), + # np.array(list(map(float, best_simulation)), dtype=float).reshape(1, -1), + result_unit, + units.unit["streamflow"], + basin_area=basin_area, + ) + convert_unit_obs = units.convert_unit( + np.array(spot_setup.evaluation()).reshape(1, -1), + result_unit, + units.unit["streamflow"], + basin_area=basin_area, + ) + # save calibrated results of calibration period #保存率定的结果 + train_result_file = os.path.join( + save_dir, + "train_qsim_" + spot_setup.model["name"] + "_" + str(basin_id) + ".csv", + ) + pd.DataFrame(convert_unit_sim.reshape(-1, 1)).to_csv( + train_result_file, + sep=",", + index=False, + header=False, + ) + # calculation rmse、nashsutcliffe and bias for training period + stat_error = hydro_stat.stat_error( + convert_unit_obs, + convert_unit_sim, + ) + print("Training Metrics:", basin_id, stat_error) + hydro_file.serialize_json_np( + stat_error, os.path.join(save_dir, "train_metrics.json") + ) -def renormalize_params(result_dir, model_info: dict): - path = pathlib.Path(result_dir) - all_basins_files = [file for file in path.iterdir() if file.is_dir()] - renormalization_params = [] - basin_ids = [] - for basin_dir in all_basins_files: - basin_id = basin_dir.stem - basin_ids.append(basin_id) - params = np.loadtxt( - os.path.join(basin_dir, basin_id + "_calibrate_params.txt") - )[1:].reshape(1, -1) - param_ranges = MODEL_PARAM_DICT[model_info["name"]]["param_range"] - xaj_params = [ - (value[1] - value[0]) * params[:, i] + value[0] - for i, (key, value) in enumerate(param_ranges.items()) - ] - xaj_params_ = np.array([x for j in xaj_params for x in j]) - params_df = pd.DataFrame(xaj_params_.T) - renormalization_params.append(params_df) - renormalization_params_dfs = pd.concat(renormalization_params, axis=1) - renormalization_params_dfs.index = MODEL_PARAM_DICT[model_info["name"]][ - "param_name" + # 循还画图 + time = pd.read_excel( + "D:/研究生/毕业论文/new毕业论文/预答辩/碧流河水库/站点信息/洪水率定时间.xlsx" + ) + calibrate_starttime = pd.to_datetime("2012-06-10 0:00:00") + calibrate_endtime = pd.to_datetime("2019-12-31 23:00:00") + basin_area = float(basin_area) + best_simulation = [ + x * (basin_area * 1000000 / 1000 / 3600) for x in best_simulation ] - renormalization_params_dfs.columns = basin_ids - print(renormalization_params_dfs) - params_csv_file = os.path.join(result_dir, "basins_renormalization_params.csv") - renormalization_params_dfs.to_csv(params_csv_file, sep=",", index=True, header=True) - - -def summarize_metrics(result_dir, model_info: dict): - """ - output all results' metrics of all basins to one file + obs = [x * (basin_area * 1000000 / 1000 / 3600) for x in spot_setup.evaluation()] + time["starttime"] = pd.to_datetime(time["starttime"]) + time["endtime"] = pd.to_datetime(time["endtime"]) + Prcp_list = [] + W_obs_list = [] + W_sim_list = [] + W_bias_abs_list = [] + W_bias_rela_list = [] + Q_max_obs_list = [] + Q_max_sim_list = [] + Q_bias_rela_list = [] + time_bias_list = [] + DC_list = [] + ID_list = [] + for i, row in time.iterrows(): + # for i in range(len(time)): + if row["starttime"] < calibrate_endtime: + # if(time["starttime",0]= 0: - numerator += pow(abs(mol[i]) - obs[i], 2) - meangauge += obs[i] - count += 1 - meangauge = meangauge / count - for i in range(len(obs)): - if obs[i] >= 0: - denominator += pow(obs[i] - meangauge, 2) - return 1 - numerator / denominator diff --git a/scripts/calibrate_xaj.py b/scripts/calibrate_xaj.py index 1367c4d..b5599a0 100644 --- a/scripts/calibrate_xaj.py +++ b/scripts/calibrate_xaj.py @@ -1,7 +1,7 @@ """ Author: Wenyu Ouyang Date: 2022-11-19 17:27:05 -LastEditTime: 2024-03-26 18:55:25 +LastEditTime: 2024-03-26 19:53:51 LastEditors: Wenyu Ouyang Description: the script to calibrate a model for CAMELS basin FilePath: \hydro-model-xaj\scripts\calibrate_xaj.py @@ -9,7 +9,6 @@ """ import json -import numpy as np import argparse import sys import os @@ -19,11 +18,9 @@ repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent) sys.path.append(repo_path) +from datasets.data_preprocess import cross_val_split_tsdata from hydromodel.datasets import * from hydromodel.datasets.data_preprocess import ( - cross_valid_data, - split_train_test, - get_ts_from_diffsource, get_pe_q_from_ts, ) from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua @@ -41,22 +38,22 @@ def calibrate(args): basin_ids = args.basin_id model_info = args.model algo_info = args.algorithm - loss = args.loss - ts_data = get_ts_from_diffsource(data_type, data_dir, periods, basin_ids) + loss_info = args.loss where_save = Path(os.path.join(repo_path, "result", exp)) if os.path.exists(where_save) is False: os.makedirs(where_save) - if cv_fold <= 1: - # no cross validation - periods = np.sort( - [train_period[0], train_period[1], test_period[0], test_period[1]] - ) - train_and_test_data = split_train_test(ts_data, train_period, test_period) - else: - # cross validation - train_and_test_data = cross_valid_data(ts_data, periods, warmup, cv_fold) + train_and_test_data = cross_val_split_tsdata( + data_type, + data_dir, + cv_fold, + train_period, + test_period, + periods, + warmup, + basin_ids, + ) print("Start to calibrate the model") @@ -70,7 +67,7 @@ def calibrate(args): warmup, model=model_info, algorithm=algo_info, - loss=loss, + loss=loss_info, ) else: for i in range(cv_fold): @@ -84,7 +81,7 @@ def calibrate(args): warmup, model=model_info, algorithm=algo_info, - loss=loss, + loss=loss_info, ) # Convert the arguments to a dictionary args_dict = vars(args) diff --git a/scripts/evaluate_xaj.py b/scripts/evaluate_xaj.py index f529e9c..02cd25a 100644 --- a/scripts/evaluate_xaj.py +++ b/scripts/evaluate_xaj.py @@ -1,28 +1,38 @@ +""" +Author: Wenyu Ouyang +Date: 2024-03-26 12:00:12 +LastEditTime: 2024-03-26 21:50:14 +LastEditors: Wenyu Ouyang +Description: +FilePath: \hydro-model-xaj\scripts\evaluate_xaj.py +Copyright (c) 2023-2024 Wenyu Ouyang. All rights reserved. +""" + import argparse -import socket -import fnmatch -from datetime import datetime -import numpy as np -import pandas as pd import yaml import os import sys from pathlib import Path -from hydroutils import hydro_file + repo_path = os.path.dirname(Path(os.path.abspath(__file__)).parent) sys.path.append(repo_path) -from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua -from hydromodel.datasets.data_postprocess import ( +from hydromodel.models.model_dict import MODEL_DICT +from hydromodel.datasets import * +from hydromodel.datasets.data_preprocess import cross_val_split_tsdata, get_pe_q_from_ts +from trainers.evaluate import ( + save_evaluate_results, +) +from hydromodel.trainers.evaluate import ( + read_all_basin_params, +) +from hydromodel.trainers.calibrate_ga import calibrate_by_ga, show_ga_result +from hydromodel.trainers.evaluate import ( + convert_streamflow_units, renormalize_params, - read_save_sceua_calibrated_params, - save_streamflow, summarize_metrics, summarize_parameters, ) -from hydromodel.trainers.train_utils import show_calibrate_result, show_test_result -from hydromodel.models.xaj import xaj -from hydromodel.trainers.calibrate_ga import calibrate_by_ga, show_ga_result def read_yaml_config(file_path): @@ -33,95 +43,46 @@ def read_yaml_config(file_path): def evaluate(args): exp = args.exp - warmup = args.warmup_length cali_dir = Path(os.path.join(repo_path, "result", exp)) cali_config = read_yaml_config(os.path.join(cali_dir, "config.yaml")) - kfold = np.sort(kfold) - for fold in kfold: - print(f"Start to calibrate the {fold}-th fold") - current_time = datetime.now().strftime("%b%d_%H-%M-%S") - save_dir = os.path.join( - cali_dir, - current_time - + "_" - + socket.gethostname() - + "_fold" - + str(fold) - ) - # 读输入文件 - if os.path.exists(save_dir) is False: - os.makedirs(save_dir) - hydro_file.serialize_json(vars(args), os.path.join(save_dir, "args.json")) + kfold = cali_config["cv_fold"] + algo_info = cali_config["algorithm"] + basins = cali_config["basin_id"] + warmup = cali_config["warmup"] + data_type = cali_config["data_type"] + data_dir = cali_config["data_dir"] + train_period = cali_config["calibrate_period"] + test_period = cali_config["test_period"] + periods = cali_config["period"] + model_info = cali_config["model"] + train_and_test_data = cross_val_split_tsdata( + data_type, + data_dir, + kfold, + train_period, + test_period, + periods, + warmup, + basins, + ) + for fold in range(kfold): + print(f"Start to evaluate the {fold+1}-th fold") + save_dir = os.path.join(cali_dir, f"sceua_xaj_cv{fold+1}") if algo_info["name"] == "SCE_UA": - for i in range(len(data_info_train["basin"])): - basin_id = data_info_train["basin"][i] - basin_area = data_info_train["area"][i] - # one directory for one model + one hyperparam setting and one basin - spotpy_db_dir = os.path.join( # 一个模型一个文件夹 - save_dir, - basin_id, - ) - if not os.path.exists(spotpy_db_dir): - os.makedirs(spotpy_db_dir) - db_name = os.path.join(spotpy_db_dir, "SCEUA_" + model_info["name"]) - show_calibrate_result( # 展示率定结果 - sampler.setup, - db_name, - warmup_length=warmup, - save_dir=spotpy_db_dir, - basin_id=basin_id, - train_period=data_info_train["time"], - basin_area=basin_area, - prcp=data_train[365:, i : i + 1, 0:1].flatten(), - ) - - params = read_save_sceua_calibrated_params( # 保存率定的参数文件 - basin_id, spotpy_db_dir, db_name - ) - # _ is et which we didn't use here - qsim, _ = xaj( # 计算模拟结果 - data_test[:, i : i + 1, 0:2], - params, - warmup_length=0, - **model_info, - ) - - qsim = units.convert_unit( - qsim, - # TODO: to unify "mm/hour" - unit_now="mm/day", - unit_final=units.unit["streamflow"], - basin_area=basin_area, - ) - qobs = units.convert_unit( - data_test[warmup:, i : i + 1, -1:], - # TODO: to unify "mm/hour" - unit_now="mm/day", - unit_final=units.unit["streamflow"], - basin_area=basin_area, - ) - test_result_file = os.path.join( - spotpy_db_dir, - "test_qsim_" + model_info["name"] + "_" + str(basin_id) + ".csv", - ) - pd.DataFrame(qsim.reshape(-1, 1)).to_csv( - test_result_file, - sep=",", - index=False, - header=False, - ) - test_date = pd.to_datetime(data_info_test["time"][:]).values.astype( - "datetime64[h]" - ) - show_test_result( - basin_id, - test_date, - qsim, - qobs, - save_dir=spotpy_db_dir, - warmup_length=warmup, - prcp=data_test[365:, i : i + 1, 0:1].flatten(), - ) + # evaluate both train and test period for all basins + test_data = train_and_test_data[fold][1] + test_p_and_e, _ = get_pe_q_from_ts(test_data) + params = read_all_basin_params(basins, save_dir) + # 计算模拟结果 + qsim, _ = MODEL_DICT[model_info["name"]]( + test_p_and_e, + params, + # we set the warmup_length=0 but later we get results from warmup_length to the end to evaluate + warmup_length=0, + **model_info, + ) + # 创建 DataArray + qsim, qobs = convert_streamflow_units(test_data, qsim, data_type, data_dir) elif algo_info["name"] == "GA": for i in range(len(data_info_train["basin"])): basin_id = data_info_train["basin"][i] @@ -155,7 +116,7 @@ def evaluate(args): deap_db_dir, warmup_length=warmup, basin_id=basin_id, - the_data=data_test[:, i : i + 1, :], + the_data=test_p_and_e[:, i : i + 1, :], the_period=data_info_test["time"], basin_area=basin_area, model_info=model_info, @@ -165,10 +126,10 @@ def evaluate(args): raise NotImplementedError( "We don't provide this calibrate method! Choose from 'SCE_UA' or 'GA'!" ) - summarize_parameters(save_dir, model_info) - renormalize_params(save_dir, model_info) - summarize_metrics(save_dir, model_info) - save_streamflow(save_dir, model_info, fold=fold) + summarize_parameters(save_dir, model_info["name"], basins) + renormalize_params(save_dir, model_info["name"], basins) + # summarize_metrics(save_dir, model_info) + save_evaluate_results(save_dir, model_info["name"], qsim, qobs, test_data) print(f"Finish calibrating the {fold}-th fold") @@ -183,12 +144,5 @@ def evaluate(args): default="expcamels001", type=str, ) - parser.add_argument( - "--warmup_length", - dest="warmup_length", - help="the length of warmup period for hydro model", - default=365, - type=int, - ) the_args = parser.parse_args() evaluate(the_args) diff --git a/scripts/datapostprocess4calibrate.py b/scripts/post_process.py similarity index 95% rename from scripts/datapostprocess4calibrate.py rename to scripts/post_process.py index 52615a3..8d3d2c4 100644 --- a/scripts/datapostprocess4calibrate.py +++ b/scripts/post_process.py @@ -1,10 +1,10 @@ """ Author: Wenyu Ouyang Date: 2022-11-19 17:27:05 -LastEditTime: 2024-03-26 15:43:08 +LastEditTime: 2024-03-26 21:32:54 LastEditors: Wenyu Ouyang -Description: the script to postprocess calibrated models in hydro-model-xaj -FilePath: \hydro-model-xaj\scripts\datapostprocess4calibrate.py +Description: the script to postprocess results +FilePath: \hydro-model-xaj\scripts\post_process.py Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. """ @@ -19,7 +19,7 @@ repo_dir = os.path.dirname(Path(os.path.abspath(__file__)).parent) sys.path.append(repo_dir) -from hydromodel.datasets.data_postprocess import read_and_save_et_ouputs +from trainers.evaluate import read_and_save_et_ouputs def statistics(args): diff --git a/test/test_data_postprocess.py b/test/test_data_postprocess.py index 9d0ffed..720e00c 100644 --- a/test/test_data_postprocess.py +++ b/test/test_data_postprocess.py @@ -14,7 +14,7 @@ import matplotlib.pyplot as plt import spotpy from spotpy.examples.spot_setup_hymod_python import spot_setup as hymod_setup -from hydromodel.datasets.data_postprocess import read_save_sceua_calibrated_params +from trainers.evaluate import read_save_sceua_calibrated_params def test_run_hymod_calibration(): diff --git a/test/test_show_results.py b/test/test_show_results.py index 09b9e89..be75cba 100644 --- a/test/test_show_results.py +++ b/test/test_show_results.py @@ -1,21 +1,25 @@ """ Author: Wenyu Ouyang Date: 2022-12-08 09:24:54 -LastEditTime: 2024-03-22 20:58:24 +LastEditTime: 2024-03-26 21:29:39 LastEditors: Wenyu Ouyang Description: some util funcs for hydro model FilePath: \hydro-model-xaj\test\test_show_results.py Copyright (c) 2021-2022 Wenyu Ouyang. All rights reserved. """ -from hydromodel.datasets.data_postprocess import read_save_sceua_calibrated_params +from hydromodel.datasets.data_postprocess import ( + show_sceua_cali_result, +) from hydromodel.models.xaj import xaj from hydromodel.trainers.calibrate_sceua import calibrate_by_sceua -from hydromodel.trainers.train_utils import show_calibrate_result, show_test_result +from datasets.data_postprocess import show_test_result from hydroutils import hydro_time +from trainers.evaluate import read_save_sceua_calibrated_params + def test_show_calibrate_sceua_result(p_and_e, qobs, warmup_length, db_name, basin_area): sampler = calibrate_by_sceua( @@ -39,7 +43,7 @@ def test_show_calibrate_sceua_result(p_and_e, qobs, warmup_length, db_name, basi }, ) train_period = hydro_time.t_range_days(["2012-01-01", "2017-01-01"]) - show_calibrate_result( + show_sceua_cali_result( sampler.setup, db_name, warmup_length=warmup_length,