From cb66a50f6924382b1d7129aa6363a16d128fcb1c Mon Sep 17 00:00:00 2001 From: Andres Nava Date: Sat, 9 Mar 2024 20:48:11 -0800 Subject: [PATCH 1/5] bug where vbf testing also needs jshifts for the vbf_mass_jj variable --- src/HHbbVV/postprocessing/postprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index 111bb43e..2c719cb8 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -425,7 +425,7 @@ def main(args): bb_masks, nonres_vars=args.vbf or args.control_plots, vbf_vars=args.vbf, - do_jshifts=False, # only need shifts for BDT pre-processing + do_jshifts=args.vbf, # only need shifts for BDT pre-processing ) # args has attr if --control-plots arg was set From 18f3be34c44897e595ff406bb107acfaac0bd53c Mon Sep 17 00:00:00 2001 From: Andres Nava Date: Sun, 10 Mar 2024 23:41:17 -0700 Subject: [PATCH 2/5] bug fix for NuisanceParameter calculations with specified signal --- src/HHbbVV/postprocessing/CreateDatacard.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/HHbbVV/postprocessing/CreateDatacard.py b/src/HHbbVV/postprocessing/CreateDatacard.py index 6df1d809..c1444eef 100644 --- a/src/HHbbVV/postprocessing/CreateDatacard.py +++ b/src/HHbbVV/postprocessing/CreateDatacard.py @@ -323,6 +323,8 @@ if not syst.samples_corr: # separate nuisance param for each affected sample (TODO: propagate below) for sample in syst.samples: + if sample not in mc_samples: + continue shape_systs_dict[f"{skey}_{sample}"] = rl.NuisanceParameter( f"{syst.name}_{mc_samples[sample]}", "shape" ) From 36fd98401471130a49be6b7f91fa13ac87baf3e7 Mon Sep 17 00:00:00 2001 From: rkansal47 Date: Mon, 11 Mar 2024 16:17:01 -0500 Subject: [PATCH 3/5] abcd fix --- setup.py | 22 -------------------- src/HHbbVV/postprocessing/datacardHelpers.py | 2 +- 2 files changed, 1 insertion(+), 23 deletions(-) delete mode 100755 setup.py diff --git a/setup.py b/setup.py deleted file mode 100755 index 4b56c784..00000000 --- a/setup.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python -from __future__ import annotations - -from setuptools import setup - -# packages = find_packages( -# where="src", -# include=["HHbbVV"], # alternatively: `exclude=['additional*']` -# ) - -# package_dir = {"": "src"} - -setup( - name="HHbbVV", - version="0.0.1", - description="HHbbVV analysis code", - author="Raghav Kansal", - # author_email='', - url="https://github.com/rkansal47/HHbbVV", - packages=["HHbbVV"], - package_dir={"": "./src/"}, -) diff --git a/src/HHbbVV/postprocessing/datacardHelpers.py b/src/HHbbVV/postprocessing/datacardHelpers.py index 110a7f3e..8d61b1db 100644 --- a/src/HHbbVV/postprocessing/datacardHelpers.py +++ b/src/HHbbVV/postprocessing/datacardHelpers.py @@ -264,7 +264,7 @@ def get_effect_updown(values_nominal, values_up, values_down, mask, logger, epsi rate $processes_rates ------------------------------ $systematics -single_A rateParam $binA $qcdlabel (@0*@2/@1) single_B,single_C,single_D +single_A rateParam $binA $qcdlabel (@0*@1/@2) single_B,single_C,single_D single_B rateParam $binB $qcdlabel $dataqcdB single_C rateParam $binC $qcdlabel $dataqcdC single_D rateParam $binD $qcdlabel $dataqcdD From 95c7fbe8a4d146416c903ff42be565278055abd3 Mon Sep 17 00:00:00 2001 From: rkansal47 Date: Tue, 12 Mar 2024 11:39:43 -0500 Subject: [PATCH 4/5] script bug fixes + wjets-training arg --- README.md | 21 +++++----- src/HHbbVV/postprocessing/TrainBDT.py | 39 +++++++++++++------ .../bash_scripts/ControlPlots.sh | 20 ++++++---- src/HHbbVV/postprocessing/postprocessing.py | 14 +++---- src/HHbbVV/processors/bbVVSkimmer.py | 1 + 5 files changed, 60 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 7cfd0ff3..41aeebff 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ using the [coffea](https://coffeateam.github.io/coffea/) and - [BDT Pre-Processing](#bdt-pre-processing) - [BDT Trainings](#bdt-trainings) - [Post-Processing](#post-processing-1) - - [Control plots with resonant and nonresonant samples](#control-plots-with-resonant-and-nonresonant-samples) + - [Control plots](#control-plots) - [BDT sculpting plots](#bdt-sculpting-plots) - [Making separate background and signal templates for scan and bias tests (resonant)](#making-separate-background-and-signal-templates-for-scan-and-bias-tests-resonant) - [Create Datacard](#create-datacard) @@ -325,6 +325,9 @@ python TrainBDT.py --data-path "../../../../data/skimmer/Feb24/bdt_data" --year ### Post-Processing +**Important:** If running on a Mac, make sure to install `gnu-getopt` first for +bash scripts, see [here.](#getopt-for-mac) + ```bash python postprocessing.py --templates --year "2017" --template-dir "templates/$TAG/" --plot-dir "../../../plots/PostProcessing/$TAG/" --data-dir "../../../../data/skimmer/Feb24/" (--resonant --signal-data-dir "" --control-plots) ``` @@ -332,22 +335,22 @@ python postprocessing.py --templates --year "2017" --template-dir "templates/$TA All years (non-resonant): ```bash -for year in 2016 2016APV 2017 2018; do python -u postprocessing.py --templates --year $year --template-dir "templates/Jun14" --data-dir "../../../../data/skimmer/Feb24" --signal-data-dir "../../../../data/skimmer/Jun10" --bdt-preds-dir "../../../../data/skimmer/Feb24/23_05_12_multiclass_rem_feats_3/inferences"; done +./bash_scripts/NonresTemplates.sh --tag $TAG # remember to change data_dir as needed! ``` Scan (non-resonant): ```bash -for year in 2016 2016APV 2017 2018; do python -u postprocessing.py --templates --year $year --template-dir "templates/$TAG/" --data-dir "../../../../data/skimmer/Feb24/" --old-processor --nonres-txbb-wp "LP" "MP" "HP" --nonres-bdt-wp 0.995 0.998 0.999 --no-do-jshifts; done +./bash_scripts/NonresTemplatesScan.sh --tag $TAG # remember to change data_dir as needed! ``` -#### Control plots with resonant and nonresonant samples - -Run `postprocessing/bash_scripts/ControlPlots.sh` from inside -`postprocessing folder`. +#### Control plots -**Important:** If running on a Mac, make sure to install `gnu-getopt` first, see -[here.](#getopt-for-mac) +```bash +./bash_scripts/ControlPlot.sh --tag $TAG # w/ resonant and nonresonant samples and all control plot variables in postprocessing.py script by default +./bash_scripts/ControlPlots.sh --tag $TAG --nonresonant --controlplotvars BDTScore --nohem2d # BDT score only +./bash_scripts/MassPlots.sh --tag $TAG # mSD vs mReg plots +``` #### BDT sculpting plots diff --git a/src/HHbbVV/postprocessing/TrainBDT.py b/src/HHbbVV/postprocessing/TrainBDT.py index 33a4ebca..8511e1b4 100644 --- a/src/HHbbVV/postprocessing/TrainBDT.py +++ b/src/HHbbVV/postprocessing/TrainBDT.py @@ -47,12 +47,6 @@ weight_key = "finalWeight" sig_key = "HHbbVV" -bg_keys = ["QCD", "TT", "Z+Jets"] -training_keys = [sig_key] + bg_keys - -# if doing multiclass classification, encode each process separately -label_encoder = LabelEncoder() -label_encoder.fit(training_keys) # only vars used for training, ordered by importance @@ -145,7 +139,9 @@ def get_X( return pd.concat(X, axis=0), mc_vars -def get_Y(data_dict: dict[str, pd.DataFrame], multiclass: bool = False): +def get_Y( + data_dict: dict[str, pd.DataFrame], multiclass: bool = False, label_encoder: LabelEncoder = None +): Y = [] for _year, data in data_dict.items(): if multiclass: @@ -179,7 +175,10 @@ def remove_neg_weights(data: pd.DataFrame): def equalize_weights( - data: pd.DataFrame, equalize_sig_bg: bool = True, equalize_per_process: bool = False + data: pd.DataFrame, + bg_keys: list[str], + equalize_sig_bg: bool = True, + equalize_per_process: bool = False, ): """ If `equalize_sig_bg`: scales signal such that total signal = total background @@ -214,6 +213,15 @@ def load_data(data_path: str, year: str, all_years: bool): def main(args): + bg_keys = ["QCD", "TT", "Z+Jets"] + if args.wjets_training: + bg_keys += ["W+Jets"] + training_keys = [sig_key] + bg_keys + + # if doing multiclass classification, encode each process separately + label_encoder = LabelEncoder() + label_encoder.fit(training_keys) + bdtVars = AllTaggerBDTVars if args.all_tagger_vars else SingleTaggerBDTVars early_stopping_callback = xgb.callback.EarlyStopping( @@ -314,7 +322,9 @@ def main(args): f'{np.sum(data[data["Dataset"] == key][weight_key])}' ) - equalize_weights(data, args.equalize_weights, args.equalize_weights_per_process) + equalize_weights( + data, bg_keys, args.equalize_weights, args.equalize_weights_per_process + ) for key in training_keys: print( @@ -341,8 +351,8 @@ def main(args): model = train_model( get_X(train, bdtVars), get_X(test, bdtVars), - get_Y(train, args.multiclass), - get_Y(test, args.multiclass), + get_Y(train, args.multiclass, label_encoder), + get_Y(test, args.multiclass, label_encoder), get_weights(train, args.absolute_weights), get_weights(test, args.absolute_weights), bdtVars, @@ -358,6 +368,7 @@ def main(args): train, test, args.test_size, + training_keys, args.equalize_weights, bdtVars, multiclass=args.multiclass, @@ -430,6 +441,7 @@ def evaluate_model( train: dict[str, pd.DataFrame], test: dict[str, pd.DataFrame], test_size: float, + training_keys: list[str], equalize_sig_bg: bool, bdtVars: list[str], txbb_threshold: float = 0.98, @@ -677,6 +689,8 @@ def do_inference( type=int, ) + add_bool_arg(parser, "wjets-training", "Include W+Jets in training", default=False) + """ Varying between 0.01 - 1 showed no significant difference https://hhbbvv.nrp-nautilus.io/bdt/24_03_07_new_samples_lr_0.01/ @@ -704,9 +718,10 @@ def do_inference( help="minimum weight required to keep splitting (higher is more conservative)", type=float, ) + # This just needs to be higher than the # rounds needed for early-stopping to kick in parser.add_argument( - "--n-estimators", default=1 - 000, help="max number of trees to keep adding", type=int + "--n-estimators", default=10000, help="max number of trees to keep adding", type=int ) parser.add_argument("--rem-feats", default=0, help="remove N lowest importance feats", type=int) diff --git a/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh b/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh index 8c049254..0de07451 100755 --- a/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh +++ b/src/HHbbVV/postprocessing/bash_scripts/ControlPlots.sh @@ -15,23 +15,30 @@ #################################################################################################### MAIN_DIR="../../.." +data_dir="$MAIN_DIR/../data/skimmer/24Mar6AllYearsBDTVars" TAG="" resonant="--resonant" samples="HHbbVV VBFHHbbVV NMSSM_XToYHTo2W2BTo4Q2B_MX-900_MY-80 NMSSM_XToYHTo2W2BTo4Q2B_MX-1200_MY-190 NMSSM_XToYHTo2W2BTo4Q2B_MX-2000_MY-125 NMSSM_XToYHTo2W2BTo4Q2B_MX-3000_MY-250 NMSSM_XToYHTo2W2BTo4Q2B_MX-4000_MY-150" +# samples="HHbbVV" hem2d="--HEM2d" +controlplotvars="" -options=$(getopt -o "" --long "nonresonant,nohem2d,tag:" -- "$@") +options=$(getopt -o "" --long "nonresonant,nohem2d,controlplotvars:,tag:" -- "$@") eval set -- "$options" while true; do case "$1" in --nonresonant) resonant="" - samples="HHbbVV VBFHHbbVV" + samples="HHbbVV VBFHHbbVV qqHH_CV_1_C2V_0_kl_1_HHbbVV qqHH_CV_1_C2V_2_kl_1_HHbbVV" ;; --nohem2d) hem2d="" ;; + --controlplotvars) + shift + controlplotvars="--control-plot-vars $1" + ;; --tag) shift TAG=$1 @@ -56,12 +63,11 @@ if [[ -z $TAG ]]; then exit 1 fi -# for year in 2016APV 2016 2017 2018 -for year in 2018 +for year in 2016APV 2016 2017 2018 do python -u postprocessing.py --control-plots --year $year ${resonant} ${hem2d} \ - --data-dir "${MAIN_DIR}/../data/skimmer/24Mar5AllYears" \ + --data-dir $data_dir \ --sig-samples $samples \ - --bdt-preds-dir "${MAIN_DIR}/../data/skimmer/Feb24/23_05_12_multiclass_rem_feats_3/inferences" \ - --plot-dir "${MAIN_DIR}/plots/PostProcessing/$TAG" + --bdt-preds-dir "$MAIN_DIR/../data/skimmer/24Mar6AllYearsBDTVars/24_03_07_new_samples_max_depth_5/inferences" \ + --plot-dir "${MAIN_DIR}/plots/PostProcessing/$TAG" ${controlplotvars} done diff --git a/src/HHbbVV/postprocessing/postprocessing.py b/src/HHbbVV/postprocessing/postprocessing.py index 2c719cb8..1526ffce 100644 --- a/src/HHbbVV/postprocessing/postprocessing.py +++ b/src/HHbbVV/postprocessing/postprocessing.py @@ -83,7 +83,7 @@ class Region: control_plot_vars = [ ShapeVar(var="MET_pt", label=r"$p^{miss}_T$ (GeV)", bins=[20, 0, 300]), # ShapeVar(var="DijetEta", label=r"$\eta^{jj}$", bins=[20, -8, 8]), - ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[20, 0, 750]), + # ShapeVar(var="DijetPt", label=r"$p_T^{jj}$ (GeV)", bins=[20, 0, 750]), ShapeVar(var="DijetMass", label=r"$m^{jj}$ (GeV)", bins=[20, 600, 4000]), ShapeVar(var="bbFatJetEta", label=r"$\eta^{bb}$", bins=[20, -2.4, 2.4]), ShapeVar( @@ -114,12 +114,12 @@ class Region: # ShapeVar(var="VVFatJetParTMD_probT", label=r"Prob(Top) (Mass-Decorrelated)", bins=[20, 0, 1]), ShapeVar(var="VVFatJetParTMD_THWWvsT", label=r"$T^{VV}_{HWW}$", bins=[20, 0, 1]), # ShapeVar(var="bbFatJetPtOverDijetPt", label=r"$p^{bb}_T / p_T^{jj}$", bins=[20, 0, 40]), - ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[20, 0, 40]), - ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[20, 0.4, 2.0]), + # ShapeVar(var="VVFatJetPtOverDijetPt", label=r"$p^{VV}_T / p_T^{jj}$", bins=[20, 0, 40]), + # ShapeVar(var="VVFatJetPtOverbbFatJetPt", label=r"$p^{VV}_T / p^{bb}_T$", bins=[20, 0.4, 2.0]), ShapeVar(var="nGoodMuonsHbb", label=r"# of Muons", bins=[3, 0, 3]), - ShapeVar(var="nGoodMuonsHH", label=r"# of Muons", bins=[3, 0, 3]), + # ShapeVar(var="nGoodMuonsHH", label=r"# of Muons", bins=[3, 0, 3]), ShapeVar(var="nGoodElectronsHbb", label=r"# of Electrons", bins=[3, 0, 3]), - ShapeVar(var="nGoodElectronsHH", label=r"# of Electrons", bins=[3, 0, 3]), + # ShapeVar(var="nGoodElectronsHH", label=r"# of Electrons", bins=[3, 0, 3]), # removed if not ggF nonresonant - needs to be the last variable! ShapeVar(var="BDTScore", label=r"BDT Score", bins=[50, 0, 1]), ] @@ -415,7 +415,6 @@ def main(args): # only need to worry about variations if making templates events_dict = _load_samples(args, bg_samples, sig_samples, cutflow, variations=args.templates) bb_masks = bb_VV_assignment(events_dict) - # QCD xsec normalization for plots qcd_sf(events_dict, cutflow) @@ -424,6 +423,7 @@ def main(args): events_dict, bb_masks, nonres_vars=args.vbf or args.control_plots, + # nonres_vars=args.vbf, vbf_vars=args.vbf, do_jshifts=args.vbf, # only need shifts for BDT pre-processing ) @@ -622,7 +622,7 @@ def _add_nonres_columns(df, bb_mask, vbf_vars=False, ptlabel="", mlabel=""): VVJet = utils.make_vector(df, "VVFatJet", bb_mask, ptlabel=ptlabel, mlabel=mlabel) Dijet = bbJet + VVJet - if f"DijetPt{ptlabel}{mlabel}" not in df.columns: + if f"DijetMass{ptlabel}{mlabel}" not in df.columns: df[f"DijetMass{ptlabel}{mlabel}"] = Dijet.mass df[f"DijetPt{ptlabel}{mlabel}"] = Dijet.pt df[f"VVFatJetPtOverbbFatJetPt{ptlabel}{mlabel}"] = VVJet.pt / bbJet.pt diff --git a/src/HHbbVV/processors/bbVVSkimmer.py b/src/HHbbVV/processors/bbVVSkimmer.py index d40acea3..ce6b4551 100644 --- a/src/HHbbVV/processors/bbVVSkimmer.py +++ b/src/HHbbVV/processors/bbVVSkimmer.py @@ -123,6 +123,7 @@ class bbVVSkimmer(SkimmerABC): "VBFJetPt", "VBFJetPhi", "VBFJetMass", + "DijetMass", "nGoodVBFJets", "ak8FatJetHbb", "ak8FatJetHVV", From 611940f20d292b713e361e669697e9f0a6a2956e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 12 Mar 2024 19:51:03 +0000 Subject: [PATCH 5/5] chore: update pre-commit hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/pre-commit/mirrors-prettier: v3.1.0 → v4.0.0-alpha.8](https://github.com/pre-commit/mirrors-prettier/compare/v3.1.0...v4.0.0-alpha.8) - [github.com/astral-sh/ruff-pre-commit: v0.1.14 → v0.3.2](https://github.com/astral-sh/ruff-pre-commit/compare/v0.1.14...v0.3.2) - [github.com/python-jsonschema/check-jsonschema: 0.27.3 → 0.28.0](https://github.com/python-jsonschema/check-jsonschema/compare/0.27.3...0.28.0) --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f2a9ca80..1dba932f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -40,14 +40,14 @@ repos: - id: rst-inline-touching-normal - repo: https://github.com/pre-commit/mirrors-prettier - rev: "v3.1.0" + rev: "v4.0.0-alpha.8" hooks: - id: prettier types_or: [yaml, markdown, html, css, scss, javascript, json] args: [--prose-wrap=always] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.1.14" + rev: "v0.3.2" hooks: - id: ruff args: ["--fix", "--show-fixes"] @@ -89,7 +89,7 @@ repos: additional_dependencies: ["validate-pyproject-schema-store[all]"] - repo: https://github.com/python-jsonschema/check-jsonschema - rev: "0.27.3" + rev: "0.28.0" hooks: - id: check-dependabot - id: check-github-workflows