Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backwards_ecal: run with dask #120

Merged
merged 1 commit into from
Jan 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions benchmarks/backwards_ecal/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,31 @@ rule backwards_ecal:
script="benchmarks/backwards_ecal/backwards_ecal.py",
output:
directory("results/backwards_ecal/{CAMPAIGN}/")
log:
scheduler=".logs/results/backwards_ecal/{CAMPAIGN}/scheduler.log",
worker=".logs/results/backwards_ecal/{CAMPAIGN}/worker.log",
threads: workflow.cores
shell:
"""
if [[ "{wildcards.CAMPAIGN}" == "local" ]]; then
export PLOT_TITLE="Benchmark simulation"
else
export PLOT_TITLE="\\textbf{{ePIC}} Simulation {wildcards.CAMPAIGN}"
fi

set -m # monitor mode to prevent lingering processes
cleanup() {{
echo Cleaning up
kill $WORKER_PID $SCHEDULER_PID
}}
trap cleanup EXIT

PORT=$RANDOM
dask scheduler --port $PORT 2>{log.scheduler} &
export DASK_SCHEDULER=localhost:$PORT
SCHEDULER_PID=$!
dask worker tcp://$DASK_SCHEDULER --nworkers {threads} --nthreads 1 2>{log.worker} &
WORKER_PID=$!
env \
MATPLOTLIBRC={input.matplotlibrc} \
DETECTOR_CONFIG=""" + DETECTOR_CONFIG + """ \
Expand Down
108 changes: 76 additions & 32 deletions benchmarks/backwards_ecal/backwards_ecal.org
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,32 @@ import os
from pathlib import Path

import awkward as ak
import boost_histogram as bh
import dask_histogram as dh
import numpy as np
import vector
import uproot
from sklearn.metrics import roc_curve
import vector
#+end_src

#+begin_src jupyter-python :results silent
from dask.distributed import Client

client = Client(os.environ.get("DASK_SCHEDULER", "localhost:8786"))
#+end_src

#+begin_src jupyter-python
vector.register_awkward()

from dask.distributed import WorkerPlugin
class VectorImporter(WorkerPlugin):
idempotent=True

def setup(self, worker):
import vector

vector.register_awkward()

client.register_plugin(VectorImporter())
#+end_src

* Plotting setup
Expand Down Expand Up @@ -93,7 +113,7 @@ energies = [
]
filter_name = [
"MCParticles.*",
"*EcalEndcapNClusters*",
"EcalEndcapNClusters.energy",
]

pi_eval = {}
Expand All @@ -105,18 +125,46 @@ def readlist(path):
return paths

for energy in energies:
pi_eval[energy] = filter_pointing(uproot.concatenate(
pi_eval[energy] = uproot.dask(
{path: "events" for path in readlist(INPUT_PATH_FORMAT.format(particle="pi-", energy=energy))},
filter_name=filter_name,
))
e_eval[energy] = filter_pointing(uproot.concatenate(
steps_per_file=1,
open_files=False,
).map_partitions(filter_pointing)
e_eval[energy] = uproot.dask(
{path: "events" for path in readlist(INPUT_PATH_FORMAT.format(particle="e-", energy=energy))},
filter_name=filter_name,
))
steps_per_file=1,
open_files=False,
).map_partitions(filter_pointing)
#+end_src

** Energy resolution

The first step in the analysis is plotting distributions of cluster energies.
Only electron distribution is needed, but we also do pi- for the pion rejection
study.

#+begin_src jupyter-python
e_over_p_hist = {}
e_over_p_axis = bh.axis.Regular(101, 0., 1.01)

for ix, energy in enumerate(energies):
for particle_name, dataset in [("e-", e_eval[energy]), ("pi-", pi_eval[energy])]:
energy_value = float(energy.replace("GeV", "").replace("MeV", "e-3"))
def clf(events):
return ak.drop_none(ak.max(events["EcalEndcapNClusters.energy"], axis=-1)) / energy_value
e_over_p_hist[(particle_name, energy)] = dh.factory(
clf(dataset),
axes=(e_over_p_axis,),
)

e_over_p_hist = client.gather(client.compute(e_over_p_hist))
#+end_src

The next step is to plot the histograms together with fits and produce a plot
for resolution vs energy that summarizes them.

#+begin_src jupyter-python
fig, axs = plt.subplots(2, 4, sharex=True, sharey=True, figsize=(15, 6))

Expand All @@ -125,29 +173,26 @@ axs = np.ravel(np.array(axs))
sigmas_rel_FWHM_cb = {}
fractions_below = {}

for ix, energy in enumerate(energies):
energy_value = float(energy.replace("GeV", "").replace("MeV", "e-3"))
clf_label = PLOT_TITLE
def clf(events):
return ak.drop_none(ak.max(events["EcalEndcapNClusters.energy"], axis=-1)) / energy_value
e_pred = clf(e_eval[energy])
def norm_by_sum(arr):
return arr / np.sum(arr)

for ix, energy in enumerate(energies):
plt.sca(axs[ix])
counts, bins, patches = plt.hist(e_pred, weights=np.full_like(e_pred, 1.0 / ak.num(e_pred, axis=0)), bins=np.linspace(0.01, 1.01, 101), label=rf"$e^-$ {clf_label}")
hist = e_over_p_hist[("e-", energy)]
patch = plt.stairs(norm_by_sum(hist.values()), hist.axes[0].edges, label=rf"$e^-$ {PLOT_TITLE}")
plt.title(f"{energy}")

e_over_p = (bins[1:] + bins[:-1]) / 2
import scipy.stats
def f(x, n, beta, m, loc, scale):
return n * scipy.stats.crystalball.pdf(x, beta, m, loc, scale)
p0 = (np.sum(counts[10:]), 2., 3., 0.95, 0.05)
p0 = (np.sum(hist.values()[10:]), 2., 3., 0.95, 0.05)

try:
import scipy.optimize
par, pcov = scipy.optimize.curve_fit(f, e_over_p[5:], counts[5:], p0=p0, maxfev=10000)
par, pcov = scipy.optimize.curve_fit(f, hist.axes[0].centers[5:], hist.values()[5:], p0=p0, maxfev=10000)
except RuntimeError:
par = None
plt.plot(e_over_p, f(e_over_p, *par), label=rf"Crystal Ball fit", color="tab:green", lw=0.8)
plt.plot(hist.axes[0].centers, f(hist.axes[0].centers, *par), label=rf"Crystal Ball fit", color="tab:green", lw=0.8)

def summarize_fit(par):
_, _, _, loc_cb, scale_cb = par
Expand All @@ -156,19 +201,19 @@ for ix, energy in enumerate(energies):
f_prime = lambda x: f(x, *par) - y_max / 2
x_plus, = scipy.optimize.root(f_prime, loc_cb + scale_cb).x
x_minus, = scipy.optimize.root(f_prime, loc_cb - scale_cb).x
plt.axvline(x_minus, ls="--", lw=0.75, color=patches[0].get_facecolor(), label=r"$\mu - $FWHM")
plt.axvline(x_plus, ls=":", lw=0.75, color=patches[0].get_facecolor(), label=r"$\mu + $FWHM")
plt.axvline(x_minus, ls="--", lw=0.75, color=patch.get_facecolor(), label=r"$\mu - $FWHM")
plt.axvline(x_plus, ls=":", lw=0.75, color=patch.get_facecolor(), label=r"$\mu + $FWHM")
fwhm = (x_plus - x_minus) / loc_cb
sigma_rel_FWHM_cb = fwhm / 2 / np.sqrt(2 * np.log(2))

cutoff_x = loc_cb - fwhm
fraction_below = np.sum(counts[e_over_p < cutoff_x]) / ak.num(e_pred, axis=0)
fraction_below = np.sum(hist.values()[hist.axes[0].centers < cutoff_x]) / np.sum(hist.values())

return sigma_rel_FWHM_cb, fraction_below

sigma_rel_FWHM_cb, fraction_below = summarize_fit(par)
sigmas_rel_FWHM_cb.setdefault(clf_label, {})[energy] = sigma_rel_FWHM_cb
fractions_below.setdefault(clf_label, {})[energy] = fraction_below
sigmas_rel_FWHM_cb.setdefault(PLOT_TITLE, {})[energy] = sigma_rel_FWHM_cb
fractions_below.setdefault(PLOT_TITLE, {})[energy] = fraction_below

plt.legend()
plt.xlabel("$E/p$", loc="right")
Expand Down Expand Up @@ -246,16 +291,15 @@ rocs = {}

for ix, energy in enumerate(energies):
energy_value = float(energy.replace("GeV", "").replace("MeV", "e-3"))
clf_label = PLOT_TITLE
def clf(events):
return ak.drop_none(ak.max(events["EcalEndcapNClusters.energy"], axis=-1)) / energy_value
e_pred = clf(e_eval[energy])
pi_pred = clf(pi_eval[energy])

for do_log, ax in [(False, axs[ix]), (True, axs_log[ix])]:
plt.sca(ax)
plt.hist(e_pred, weights=np.full_like(e_pred, 1.0 / ak.num(e_pred, axis=0)), bins=np.linspace(0., 1.01, 101), label=rf"$e^-$ {clf_label}")
plt.hist(pi_pred, weights=np.full_like(pi_pred, 1.0 / ak.num(pi_pred, axis=0)), bins=np.linspace(0., 1.01, 101), label=rf"$\pi^-$ {clf_label}", histtype="step")
patch = plt.stairs(norm_by_sum(e_over_p_hist[("e-", energy)].values()), e_over_p_hist[("e-", energy)].axes[0].edges, label=rf"$e^-$ {PLOT_TITLE}")
patch = plt.stairs(norm_by_sum(e_over_p_hist[("pi-", energy)].values()), e_over_p_hist[("pi-", energy)].axes[0].edges, label=rf"$\pi^-$ {PLOT_TITLE}")
plt.title(f"{energy}")
plt.legend()
plt.xlabel("Classifier output")
Expand All @@ -264,18 +308,18 @@ for ix, energy in enumerate(energies):
plt.yscale("log")

plt.sca(axs_roc[ix])
fpr, tpr, _ = roc_curve(
np.concatenate([np.ones_like(e_pred), np.zeros_like(pi_pred)]),
np.concatenate([e_pred, pi_pred]),
)
fpr = np.cumsum(e_over_p_hist[("pi-", energy)].values()[::-1])
fpr /= fpr[-1]
tpr = np.cumsum(e_over_p_hist[("e-", energy)].values()[::-1])
tpr /= tpr[-1]
cond = fpr != 0 # avoid infinite rejection (region of large uncertainty)
cond &= tpr != 1 # avoid linear interpolation (region of large uncertainty)
def mk_interp(tpr, fpr):
def interp(eff):
return np.interp(eff, tpr, fpr)
return interp
rocs.setdefault(clf_label, {})[energy] = mk_interp(tpr, fpr)
plt.plot(tpr[cond] * 100, 1 / fpr[cond], label=f"{clf_label}")
rocs.setdefault(PLOT_TITLE, {})[energy] = mk_interp(tpr, fpr)
plt.plot(tpr[cond] * 100, 1 / fpr[cond], label=f"{PLOT_TITLE}")
plt.yscale("log")
plt.title(f"{energy}")
plt.legend(loc="lower left")
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/backwards_ecal/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ bench:backwards_ecal:
script:
- export PYTHONUSERBASE=$LOCAL_DATA_PATH/deps
- pip install -r benchmarks/backwards_ecal/requirements.txt
- snakemake $SNAKEMAKE_FLAGS --cores 1 results/backwards_ecal/local
- snakemake $SNAKEMAKE_FLAGS --cores 5 results/backwards_ecal/local

bench:backwards_ecal_campaigns:
extends: .det_benchmark
Expand All @@ -36,7 +36,7 @@ bench:backwards_ecal_campaigns:
script:
- export PYTHONUSERBASE=$LOCAL_DATA_PATH/deps
- pip install -r benchmarks/backwards_ecal/requirements.txt
- snakemake $SNAKEMAKE_FLAGS --cores 1 results/backwards_ecal/24.10.1
- snakemake $SNAKEMAKE_FLAGS --cores 5 results/backwards_ecal/24.10.1

collect_results:backwards_ecal:
extends: .det_benchmark
Expand Down
6 changes: 5 additions & 1 deletion benchmarks/backwards_ecal/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
awkward >= 2.4.0
scikit-learn
boost_histogram
dask >= 2024
dask_awkward >= 2024
dask_histogram
distributed >= 2024
uproot >= 5.2.0
vector
Loading