From 759c92d1bd22bb108801dc1f1874ee76d548d2d8 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Tue, 16 Apr 2024 18:59:10 +0100 Subject: [PATCH 1/3] first draft of new to old data converter --- validphys2/src/validphys/utils.py | 105 ++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py index 0c2956daa..ca4f7fb7b 100644 --- a/validphys2/src/validphys/utils.py +++ b/validphys2/src/validphys/utils.py @@ -12,6 +12,7 @@ import numpy as np from validobj import parse_input, ValidationError +from reportengine.compat import yaml def parse_yaml_inp(inp, spec, path): @@ -219,3 +220,107 @@ def scale_from_grid(grid): Returns ``'linear'`` if the scale of the grid object is linear, and otherwise ``' log'``.""" return 'linear' if grid.scale == 'linear' else 'log' + + +def uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, path_systype=None, write_to_file=True): + """ + Convert the new style uncertainty yaml file to the old style systype. + Writes + + Parameters + ---------- + path_uncertainty_yaml : str, or Path + Path to the new style uncertainty yaml file to be converted + + path_systype : str, or Path, optional + path to the output systype file + + Returns + ------- + n_sys : int + Number of systematics in the systype file + """ + # open the uncertainty yaml file + with open(path_uncertainty_yaml) as f: + uncertainty = yaml.safe_load(f) + + # get uncertainty definitions + uncertainty_definitions = uncertainty['definitions'] + + # check whether path_systype is provided else save it in the same directory in which the uncertainty yaml file is + if path_systype is None: + if isinstance(path_uncertainty_yaml, str): + path_uncertainty_yaml = pathlib.Path(path_uncertainty_yaml) + path_systype = path_uncertainty_yaml.parent / f"SYSTYPE_{name_dataset}_DEFAULT.dat" + else: + path_systype = pathlib.Path(path_systype) / f"SYSTYPE_{name_dataset}_DEFAULT.dat" + + # get number of sys (note: stat is not included in the sys) + if 'stat' in uncertainty_definitions.keys(): + n_sys = len(uncertainty_definitions.keys()) - 1 + else: + n_sys = len(uncertainty_definitions.keys()) + + if write_to_file: + # open the systype file for writing + with open(path_systype, 'w') as stream: + + # header: number of sys + stream.write(f"{n_sys}\n") + + # write the systype treatments + + # remove stat from the uncertainty definitions + uncertainty_definitions.pop('stat', None) + + for i, (_, sys_dict) in enumerate(uncertainty_definitions.items()): + # four spaces seems to be the standard format (has to be checked for other datasets than CMS_1JET_8TEV) + stream.write(f"{i+1} {sys_dict['treatment']} {sys_dict['type']}\n") + + return n_sys + + +def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, name_dataset, path_DATA=None): + """ + Convert the new data format into the old data format + """ + + # open the data yaml file + with open(path_data_yaml) as f: + data = yaml.safe_load(f) + + # open the uncertainty yaml file + with open(path_uncertainty_yaml) as f: + uncertainty = yaml.safe_load(f) + + # get uncertainty definitions and values + uncertainty_definitions = uncertainty['definitions'] + uncertainty_values = uncertainty['bins'] + n_sys = uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, write_to_file=False) + + # get data values + data_values = data['data_central'] + + # check whether path_DATA is provided else save it in the same directory in which the uncertainty yaml file is + if path_DATA is None: + if isinstance(path_uncertainty_yaml, str): + path_uncertainty_yaml = pathlib.Path(path_uncertainty_yaml) + path_DATA = path_uncertainty_yaml.parent / f"DATA_{name_dataset}.dat" + else: + path_DATA = pathlib.Path(path_DATA) / f"DATA_{name_dataset}.dat" + + # open the DATA file for writing + with open(path_DATA, 'w') as stream: + + # write the header: Dataset name, number of sys errors, and number of data points, whitespace separated + stream.write(f"{name_dataset} {n_sys} {len(data_values)}\n") + + # TODO write the rest of the lines of the DATA file + + + +if __name__ == '__main__': + path_unc_file = "/Users/markcostantini/codes/nnpdfgit/nnpdf/nnpdf_data/nnpdf_data/new_commondata/CMS_1JET_8TEV/uncertainties_legacy_PTY.yaml" + path_data_yaml = "/Users/markcostantini/codes/nnpdfgit/nnpdf/nnpdf_data/nnpdf_data/new_commondata/CMS_1JET_8TEV/data.yaml" + uncertainty_yaml_to_systype(path_unc_file, name_dataset="CMS_1JET_8TEV") + convert_new_data_to_old(path_data_yaml, path_unc_file, name_dataset="CMS_1JET_8TEV", path_DATA=None) \ No newline at end of file From db610a8eb76864ee3cf0eeedae80964548ee63f3 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Sat, 27 Apr 2024 19:57:59 +0100 Subject: [PATCH 2/3] added convertion from new data to old DATA_... format --- validphys2/src/validphys/utils.py | 43 ++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py index ca4f7fb7b..9e1787cda 100644 --- a/validphys2/src/validphys/utils.py +++ b/validphys2/src/validphys/utils.py @@ -280,11 +280,15 @@ def uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, path_systyp return n_sys -def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, name_dataset, path_DATA=None): +def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, path_kinematics, path_metadata, name_dataset, path_DATA=None): """ Convert the new data format into the old data format """ + # open the metadata yaml file + with open(path_metadata) as f: + metadata = yaml.safe_load(f) + # open the data yaml file with open(path_data_yaml) as f: data = yaml.safe_load(f) @@ -292,11 +296,16 @@ def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, name_dataset, # open the uncertainty yaml file with open(path_uncertainty_yaml) as f: uncertainty = yaml.safe_load(f) + + # open the kinematics yaml file + with open(path_kinematics) as f: + kinematics = yaml.safe_load(f) # get uncertainty definitions and values uncertainty_definitions = uncertainty['definitions'] uncertainty_values = uncertainty['bins'] n_sys = uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, write_to_file=False) + stats = np.array([entr['stat'] for entr in uncertainty_values]) # get data values data_values = data['data_central'] @@ -309,18 +318,44 @@ def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, name_dataset, else: path_DATA = pathlib.Path(path_DATA) / f"DATA_{name_dataset}.dat" + kin_names = list(kinematics['bins'][0].keys()) # open the DATA file for writing with open(path_DATA, 'w') as stream: # write the header: Dataset name, number of sys errors, and number of data points, whitespace separated stream.write(f"{name_dataset} {n_sys} {len(data_values)}\n") - - # TODO write the rest of the lines of the DATA file + + + for i, (data_value, stat) in enumerate(zip(data_values, stats)): + cd_line = f"{i+1}\t {metadata['implemented_observables'][0]['process_type']}\t {kin_names[0]}\t {kin_names[1]}\t {kin_names[2]}\t {data_value}\t {stat}\t" + + for j, sys in enumerate(uncertainty_values): + + for k, (sys_name, sys_val) in enumerate(sys.items()): + if sys_name == 'stat': + continue + + if uncertainty_definitions[sys_name]['treatment'] == "ADD": + add_sys = sys_val + mult_sys = add_sys * 100.0 / data_value if data_value != 0.0 else 0.0 + + elif uncertainty_definitions[sys_name]['treatment'] == "MULT": + mult_sys = sys_val + add_sys = mult_sys * data_value / 100.0 + + if k == len(sys)-1: + cd_line += f"{add_sys}\t {mult_sys}\n" + else: + cd_line += f"{add_sys}\t {mult_sys}\t" + + stream.write(cd_line) if __name__ == '__main__': path_unc_file = "/Users/markcostantini/codes/nnpdfgit/nnpdf/nnpdf_data/nnpdf_data/new_commondata/CMS_1JET_8TEV/uncertainties_legacy_PTY.yaml" path_data_yaml = "/Users/markcostantini/codes/nnpdfgit/nnpdf/nnpdf_data/nnpdf_data/new_commondata/CMS_1JET_8TEV/data.yaml" + path_kin = "/Users/markcostantini/codes/nnpdfgit/nnpdf/nnpdf_data/nnpdf_data/new_commondata/CMS_1JET_8TEV/kinematics.yaml" + path_metadata = "/Users/markcostantini/codes/nnpdfgit/nnpdf/nnpdf_data/nnpdf_data/new_commondata/CMS_1JET_8TEV/metadata.yaml" uncertainty_yaml_to_systype(path_unc_file, name_dataset="CMS_1JET_8TEV") - convert_new_data_to_old(path_data_yaml, path_unc_file, name_dataset="CMS_1JET_8TEV", path_DATA=None) \ No newline at end of file + convert_new_data_to_old(path_data_yaml, path_unc_file, path_kin, path_metadata, name_dataset="CMS_1JET_8TEV", path_DATA=None) \ No newline at end of file From 302437ba14ba8a600876368155381b819f441777 Mon Sep 17 00:00:00 2001 From: Mark Nestor Costantini Date: Mon, 29 Apr 2024 14:28:21 +0200 Subject: [PATCH 3/3] kinematics written as numbers --- validphys2/src/validphys/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/validphys2/src/validphys/utils.py b/validphys2/src/validphys/utils.py index 9e1787cda..e37d7eaf6 100644 --- a/validphys2/src/validphys/utils.py +++ b/validphys2/src/validphys/utils.py @@ -319,16 +319,15 @@ def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, path_kinemati path_DATA = pathlib.Path(path_DATA) / f"DATA_{name_dataset}.dat" kin_names = list(kinematics['bins'][0].keys()) + kin_values = kinematics['bins'] # open the DATA file for writing with open(path_DATA, 'w') as stream: # write the header: Dataset name, number of sys errors, and number of data points, whitespace separated stream.write(f"{name_dataset} {n_sys} {len(data_values)}\n") - - - for i, (data_value, stat) in enumerate(zip(data_values, stats)): - cd_line = f"{i+1}\t {metadata['implemented_observables'][0]['process_type']}\t {kin_names[0]}\t {kin_names[1]}\t {kin_names[2]}\t {data_value}\t {stat}\t" + for i, (data_value, stat) in enumerate(zip(data_values, stats)): + cd_line = f"{i+1}\t {metadata['implemented_observables'][0]['process_type']}\t {kin_values[i][kin_names[2]]['mid']}\t {kin_values[i][kin_names[1]]['mid']}\t {kin_values[i][kin_names[0]]['mid']}\t {data_value}\t {stat}\t" for j, sys in enumerate(uncertainty_values): for k, (sys_name, sys_val) in enumerate(sys.items()):