Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New to old data converter #64

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 139 additions & 0 deletions validphys2/src/validphys/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import numpy as np

from validobj import parse_input, ValidationError
from reportengine.compat import yaml


def parse_yaml_inp(inp, spec, path):
Expand Down Expand Up @@ -219,3 +220,141 @@ def scale_from_grid(grid):
Returns ``'linear'`` if the scale of the grid object is linear,
and otherwise ``' log'``."""
return 'linear' if grid.scale == 'linear' else 'log'


def uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, path_systype=None, write_to_file=True):
    """
    Convert the new style uncertainty yaml file to the old style systype.

    The systematics are the entries listed under the ``definitions`` key of
    the uncertainty file; an entry called ``stat`` is not counted as a
    systematic. When ``write_to_file`` is true, a file named
    ``SYSTYPE_<name_dataset>_DEFAULT.dat`` is written. Its first line is the
    number of systematics and every following line holds
    ``<index> <treatment> <type>`` for one systematic, with the index
    starting at 1 and following the order of the ``definitions`` mapping.

    Parameters
    ----------
    path_uncertainty_yaml : str, or Path
        Path to the new style uncertainty yaml file to be converted

    name_dataset : str
        Name of the dataset, used to build the name of the systype file

    path_systype : str, or Path, optional
        Directory in which the systype file is created. If not given, the
        file is created in the directory of the uncertainty yaml file.

    write_to_file : bool, optional
        If False nothing is written and only the number of systematics is
        returned. Default is True.

    Returns
    -------
    n_sys : int
        Number of systematics in the systype file
    """
    with open(path_uncertainty_yaml, encoding="utf-8") as f:
        uncertainty = yaml.safe_load(f)

    # Keep everything except the statistical uncertainty. A filtered copy is
    # built so that the loaded mapping itself is left untouched.
    systematics = {
        name: definition
        for name, definition in uncertainty['definitions'].items()
        if name != 'stat'
    }
    n_sys = len(systematics)

    if not write_to_file:
        return n_sys

    # pathlib.Path accepts both str and Path, so no type check is required.
    if path_systype is None:
        output_dir = pathlib.Path(path_uncertainty_yaml).parent
    else:
        output_dir = pathlib.Path(path_systype)
    systype_file = output_dir / f"SYSTYPE_{name_dataset}_DEFAULT.dat"

    with open(systype_file, 'w', encoding="utf-8") as stream:
        # header: number of sys
        stream.write(f"{n_sys}\n")

        # NOTE(review): an earlier comment on this line said four spaces is
        # the usual column separator (checked only for CMS_1JET_8TEV);
        # confirm the separator expected by the reader of the legacy files.
        for index, definition in enumerate(systematics.values(), start=1):
            stream.write(f"{index} {definition['treatment']} {definition['type']}\n")

    return n_sys


def convert_new_data_to_old(path_data_yaml, path_uncertainty_yaml, path_kinematics, path_metadata, name_dataset, path_DATA=None):
    """
    Convert the new data format into the old data format.

    Reads the data, uncertainty, kinematics and metadata yaml files of a
    dataset and writes a file named ``DATA_<name_dataset>.dat``. The first
    line of that file holds the dataset name, the number of systematics and
    the number of data points. Each following line describes one data point:
    its index (starting at 1), the ``process_type`` of the first entry of
    ``implemented_observables`` in the metadata, the ``mid`` value of the
    third, second and first kinematic variable (in that order, with the
    variable names taken from the first kinematics bin), the central value,
    the statistical uncertainty and, for every systematic of that data
    point, the additive value followed by the multiplicative value.

    Parameters
    ----------
    path_data_yaml : str, or Path
        Path to the new style data yaml file (read key: ``data_central``)

    path_uncertainty_yaml : str, or Path
        Path to the new style uncertainty yaml file (read key: ``bins``;
        ``definitions`` is read by ``uncertainty_yaml_to_systype``)

    path_kinematics : str, or Path
        Path to the new style kinematics yaml file (read key: ``bins``)

    path_metadata : str, or Path
        Path to the metadata yaml file

    name_dataset : str
        Name of the dataset, used in the header and in the output file name

    path_DATA : str, or Path, optional
        Directory in which the DATA file is created. If not given, the file
        is created in the directory of the uncertainty yaml file.

    Raises
    ------
    ValueError
        If the data, uncertainty and kinematics files do not describe the
        same number of data points.
    """
    with open(path_metadata, encoding="utf-8") as f:
        metadata = yaml.safe_load(f)

    with open(path_data_yaml, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    with open(path_uncertainty_yaml, encoding="utf-8") as f:
        uncertainty = yaml.safe_load(f)

    with open(path_kinematics, encoding="utf-8") as f:
        kinematics = yaml.safe_load(f)

    data_values = data['data_central']
    uncertainty_values = uncertainty['bins']
    kin_values = kinematics['bins']

    # The header announces len(data_values) rows, so the three inputs must
    # agree; otherwise the written file would be silently inconsistent.
    if not (len(data_values) == len(uncertainty_values) == len(kin_values)):
        raise ValueError(
            f"Inconsistent number of data points for {name_dataset}: "
            f"{len(data_values)} central values, {len(uncertainty_values)} uncertainty bins "
            f"and {len(kin_values)} kinematics bins"
        )

    # Only the number of systematics is needed here, nothing is written.
    n_sys = uncertainty_yaml_to_systype(path_uncertainty_yaml, name_dataset, write_to_file=False)
    stats = np.array([entr['stat'] for entr in uncertainty_values])

    kin_names = list(kin_values[0].keys()) if kin_values else []
    process_type = metadata['implemented_observables'][0]['process_type']

    # pathlib.Path accepts both str and Path, so no type check is required.
    if path_DATA is None:
        output_dir = pathlib.Path(path_uncertainty_yaml).parent
    else:
        output_dir = pathlib.Path(path_DATA)
    data_file = output_dir / f"DATA_{name_dataset}.dat"

    with open(data_file, 'w', encoding="utf-8") as stream:

        # write the header: Dataset name, number of sys errors, and number of data points, whitespace separated
        stream.write(f"{name_dataset} {n_sys} {len(data_values)}\n")

        for i, (data_value, stat) in enumerate(zip(data_values, stats)):
            kin_bin = kin_values[i]
            head = (
                f"{i+1}\t {process_type}\t {kin_bin[kin_names[2]]['mid']}\t "
                f"{kin_bin[kin_names[1]]['mid']}\t {kin_bin[kin_names[0]]['mid']}\t "
                f"{data_value}\t {stat}"
            )

            # Only the systematics of the i-th bin belong on the i-th line.
            pairs = []
            for sys_name, sys_val in uncertainty_values[i].items():
                if sys_name == 'stat':
                    continue

                # Reviewer note kept from the pull request: the systematics
                # in the new format are all stored as additive, so there is
                # no branching on the treatment. The multiplicative value is
                # the additive one as a percentage of the central value.
                add_sys = sys_val
                mult_sys = add_sys * 100.0 / data_value if data_value != 0.0 else 0.0
                pairs.append(f"{add_sys}\t {mult_sys}")

            # Joining guarantees exactly one line terminator per data point,
            # whatever the position of 'stat' among the keys.
            stream.write("\t".join([head, *pairs]) + "\n")



if __name__ == '__main__':
    # Script entry point: convert one dataset of the new commondata layout
    # into the legacy SYSTYPE and DATA files. The location, dataset name and
    # uncertainty file used to be hard-coded to one developer's machine; they
    # are now command-line arguments whose defaults are the original values,
    # so running the module without arguments makes the same two calls.
    import argparse
    import pathlib

    parser = argparse.ArgumentParser(
        description="Convert a new style commondata folder to the old SYSTYPE/DATA files"
    )
    parser.add_argument(
        "commondata_dir",
        nargs="?",
        default="/Users/markcostantini/codes/nnpdfgit/nnpdf/nnpdf_data/nnpdf_data/new_commondata/CMS_1JET_8TEV",
        help="folder holding data.yaml, kinematics.yaml, metadata.yaml and the uncertainty file",
    )
    parser.add_argument("--dataset", default="CMS_1JET_8TEV", help="name of the dataset")
    parser.add_argument(
        "--uncertainties",
        default="uncertainties_legacy_PTY.yaml",
        help="name of the uncertainty yaml file inside commondata_dir",
    )
    args = parser.parse_args()

    commondata_dir = pathlib.Path(args.commondata_dir)
    path_unc_file = commondata_dir / args.uncertainties
    path_data_yaml = commondata_dir / "data.yaml"
    path_kin = commondata_dir / "kinematics.yaml"
    path_metadata = commondata_dir / "metadata.yaml"

    uncertainty_yaml_to_systype(path_unc_file, name_dataset=args.dataset)
    convert_new_data_to_old(path_data_yaml, path_unc_file, path_kin, path_metadata, name_dataset=args.dataset, path_DATA=None)