From ad16677a525296b641c4ba5a95c70f695919b4c3 Mon Sep 17 00:00:00 2001 From: juacrumar Date: Mon, 13 Jan 2025 12:53:04 +0100 Subject: [PATCH] improve the autodiscovery of 2D distributions pylint --- pyproject.toml | 4 +- .../external/nnlojet/nnpdf_interface.py | 55 +++++++++++-------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 18e394a..88f7abb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,13 +38,13 @@ appdirs = "^1.4.4" tomli = "^2.0.1" yadism = { extras = ["box"], version = "^0.13.5", optional=true, markers = "python_version < '3.13'" } eko = { extras = ["box"], version = "^0.14.2", optional=true, markers = "python_version < '3.13'" } -nnpdf = { git = "https://github.com/NNPDF/nnpdf", optional = true, markers = "python_version < '3.13'" } +nnpdf-data = { version = "*", optional = true} [tool.poetry.extras] dis = ["yadism"] vrap = ["eko"] constraints = ["dis", "vrap"] # integrability + positivity -complete = ["yadism", "eko", "nnpdf"] +complete = ["yadism", "eko", "nnpdf-data"] [tool.poetry.group.docs] optional = true diff --git a/src/pinefarm/external/nnlojet/nnpdf_interface.py b/src/pinefarm/external/nnlojet/nnpdf_interface.py index 7819081..ca7efb4 100755 --- a/src/pinefarm/external/nnlojet/nnpdf_interface.py +++ b/src/pinefarm/external/nnlojet/nnpdf_interface.py @@ -20,6 +20,7 @@ from copy import deepcopy import numpy as np +from nnpdf_data import load_dataset_metadata from ruamel.yaml import YAML, CommentedMap # set-up the yaml reader @@ -32,7 +33,7 @@ def _legacy_nnpdf_translation(df, proc_type): """When reading variables with k1/k2/k3 tries to figure out to which variables it corresponds.""" - from validphys.filters import KIN_LABEL + from validphys.filters import KIN_LABEL # pylint: disable=E0401 new_vars = list(KIN_LABEL[proc_type]) # Reorganize a bit the names to avoid extra problems @@ -61,9 +62,9 @@ def _df_to_bins(dataframe): lpo = [mid_points[-1] + shifts[-1]] return np.concatenate([fpo, bins, lpo]) - bins = dataframe["min"].tolist() - bins.append(dataframe["max"].tolist()[-1]) - return np.array(bins) + # Couple maxs and mins, assuming no overlap between bins... + all_bins = np.concatenate([dataframe["min"], dataframe["max"]]) + return np.unique(all_bins) def _1d_histogram(kin_df, hist_var): @@ -101,6 +102,9 @@ def _nnlojet_observable(observable, process): return "ptw" if observable == "m" and process.upper().startswith("Z"): return "mll" + if observable == "m2": + print("\033[91m [WARNING] \033[0m Changed M2 to M in the selectors") + return "mll" raise ValueError(f"Observable {observable} not recognized for process {process}") @@ -227,7 +231,7 @@ def _generate_nnlojet_pinecard(runname, process, energy, experiment, histograms) def generate_pinecard_from_nnpdf( - nnpdf_dataset, scale="mz", output_path=".", observables=None + nnpdf_dataset, scale="etz", output_path=".", observables=None ): """Generate a NNLOJET pinecard from an NNPDF dataset. @@ -237,11 +241,7 @@ def generate_pinecard_from_nnpdf( If a list of observables is provided, only those in the list will be loaded from the dataframe. """ - # Load the NNPDF dataset - from validphys.api import API - - commondata = API.commondata(dataset_input={"dataset": nnpdf_dataset}) - metadata = commondata.metadata + metadata = load_dataset_metadata(nnpdf_dataset) kin_df = metadata.load_kinematics(drop_minmax=False) if observables is not None: @@ -284,26 +284,37 @@ def generate_pinecard_from_nnpdf( if len(hist_vars) == 1: histograms = [_1d_histogram(kin_df, hist_vars[0])] elif len(hist_vars) == 2: - # Let's (hope) it is in M2 - if "M2" not in hist_vars: + + # Let's see whether we know how to do this 2D distribution + if "M2" in hist_vars: + svar = "M2" + elif "y" in hist_vars: + svar = "y" + else: raise NotImplementedError(f"Don't know how to do this 2D: {hist_vars}") - hist_vars.remove("M2") + hist_vars.remove(svar) + + # 2D distributions can only be done when min-max is available, otherwise it's a mess + bounds_df = kin_df[svar] + if svar.endswith("2"): + bounds_df[["min", "max"]] = bounds_df[["min", "max"]].apply( + lambda x: np.sqrt(x) + ) + bounds = bounds_df.drop_duplicates().values.tolist() + nnlojet_var = _nnlojet_observable(svar, process) another_v = hist_vars[0] - # Get the unique M2 values - unique_m2 = kin_df["M2"]["mid"].unique() - m_name = _nnlojet_observable("M", process) histograms = [] - probable_bounds = np.unique(_1d_histogram(kin_df, "M2")["bins"]).tolist() - for i, val in enumerate(unique_m2): - idx = kin_df["M2"]["mid"] == val + + for i, (bin_min, mid_val, bin_max) in enumerate(bounds): + idx = kin_df[svar]["mid"] == mid_val tmp = _1d_histogram(kin_df[idx], another_v) tmp["name"] = f"{another_v}_bin_{i}" tmp["extra_selectors"] = [ { - "observable": f"{m_name}", - "min": probable_bounds[i], - "max": probable_bounds[i + 1], + "observable": f"{nnlojet_var}", + "min": bin_min, + "max": bin_max, } ] histograms.append(tmp)