From 4fe949d5a3bc5b51652c3d1958285e86b2c86123 Mon Sep 17 00:00:00 2001
From: Dmitry Kalinkin <dmitry.kalinkin@gmail.com>
Date: Wed, 18 Dec 2024 23:57:39 -0500
Subject: [PATCH 1/3] calo_pid: pass input files via a list file

---
 benchmarks/calo_pid/Snakefile    | 23 +++++++++++++----------
 benchmarks/calo_pid/calo_pid.org | 13 +++++++++----
 2 files changed, 22 insertions(+), 14 deletions(-)

diff --git a/benchmarks/calo_pid/Snakefile b/benchmarks/calo_pid/Snakefile
index a93cfc4..31b0298 100644
--- a/benchmarks/calo_pid/Snakefile
+++ b/benchmarks/calo_pid/Snakefile
@@ -64,22 +64,25 @@ exec env DETECTOR_CONFIG={wildcards.DETECTOR_CONFIG} \
 """
 
 
-rule calo_pid:
+rule calo_pid_input_list:
     input:
         electrons=expand(
-            "sim_output/calo_pid/{{DETECTOR_CONFIG}}/{PARTICLE}/{ENERGY}/{PHASE_SPACE}/{PARTICLE}_{ENERGY}_{PHASE_SPACE}.{INDEX:04d}.eicrecon.tree.edm4eic.root",
-            PARTICLE=["e-"],
-            ENERGY=["100MeVto20GeV"],
-            PHASE_SPACE=["130to177deg"],
-            INDEX=range(100),
-        ),
-        pions=expand(
-            "sim_output/calo_pid/{{DETECTOR_CONFIG}}/{PARTICLE}/{ENERGY}/{PHASE_SPACE}/{PARTICLE}_{ENERGY}_{PHASE_SPACE}.{INDEX:04d}.eicrecon.tree.edm4eic.root",
-            PARTICLE=["pi-"],
+            "sim_output/calo_pid/{{DETECTOR_CONFIG}}/{{PARTICLE}}/{ENERGY}/{PHASE_SPACE}/{{PARTICLE}}_{ENERGY}_{PHASE_SPACE}.{INDEX:04d}.eicrecon.tree.edm4eic.root",
             ENERGY=["100MeVto20GeV"],
             PHASE_SPACE=["130to177deg"],
             INDEX=range(100),
         ),
+    output:
+        "listing/calo_pid/{DETECTOR_CONFIG}/{PARTICLE}.lst",
+    run:
+        with open(output[0], "wt") as fp:  # list file consumed by the calo_pid rule
+            fp.writelines(f"{path}\n" for path in input)  # one path per line, each newline-terminated
+
+
+rule calo_pid:
+    input:
+        electrons="listing/calo_pid/{DETECTOR_CONFIG}/e-.lst",
+        pions="listing/calo_pid/{DETECTOR_CONFIG}/pi-.lst",
         matplotlibrc=".matplotlibrc",
         script="benchmarks/calo_pid/calo_pid.py",
     output:
diff --git a/benchmarks/calo_pid/calo_pid.org b/benchmarks/calo_pid/calo_pid.org
index a46aec0..965af14 100644
--- a/benchmarks/calo_pid/calo_pid.org
+++ b/benchmarks/calo_pid/calo_pid.org
@@ -32,8 +32,8 @@ vector.register_awkward()
 #+begin_src jupyter-python :results silent
 DETECTOR_CONFIG=os.environ.get("DETECTOR_CONFIG")
 PLOT_TITLE=os.environ.get("PLOT_TITLE")
-INPUT_PIONS=os.environ.get("INPUT_PIONS", "").split(" ")
-INPUT_ELECTRONS=os.environ.get("INPUT_ELECTRONS", "").split(" ")
+INPUT_PIONS=os.environ.get("INPUT_PIONS")
+INPUT_ELECTRONS=os.environ.get("INPUT_ELECTRONS")
 
 output_dir=Path(os.environ.get("OUTPUT_DIR", "./"))
 output_dir.mkdir(parents=True, exist_ok=True)
@@ -75,8 +75,13 @@ def filter_pointing(events):
     cond = (part_momentum.eta[:,0] > -3.5) & (part_momentum.eta[:,0] < -2.)
     return events[cond]
 
-e = filter_pointing(uproot.concatenate({filename: "events" for filename in INPUT_ELECTRONS}, filter_name=["MCParticles.*", "*EcalEndcapN*"]))
-pi = filter_pointing(uproot.concatenate({filename: "events" for filename in INPUT_PIONS}, filter_name=["MCParticles.*", "*EcalEndcapN*"]))
+def readlist(path):
+    """Return the entries of the list file at *path*: one per line, whitespace-stripped, blank lines skipped."""
+    with open(path, "rt") as fp:
+        return [entry for entry in map(str.strip, fp) if entry]
+
+e = filter_pointing(uproot.concatenate({filename: "events" for filename in readlist(INPUT_ELECTRONS)}, filter_name=["MCParticles.*", "*EcalEndcapN*"]))
+pi = filter_pointing(uproot.concatenate({filename: "events" for filename in readlist(INPUT_PIONS)}, filter_name=["MCParticles.*", "*EcalEndcapN*"]))
 
 e_train = e[:len(pi)//2]
 pi_train = pi[:len(pi)//2]

From fd8629e78223c4a342b2752a1fe50dd72ebfbad2 Mon Sep 17 00:00:00 2001
From: Dmitry Kalinkin <dmitry.kalinkin@gmail.com>
Date: Wed, 18 Dec 2024 23:57:59 -0500
Subject: [PATCH 2/3] calo_pid: produce
 EcalEndcapNParticleIDOutput_probability_tensor

---
 benchmarks/calo_pid/Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/calo_pid/Snakefile b/benchmarks/calo_pid/Snakefile
index 31b0298..d9c48c9 100644
--- a/benchmarks/calo_pid/Snakefile
+++ b/benchmarks/calo_pid/Snakefile
@@ -60,7 +60,7 @@ rule calo_pid_recon:
 set -m # monitor mode to prevent lingering processes
 exec env DETECTOR_CONFIG={wildcards.DETECTOR_CONFIG} \
   eicrecon {input} -Ppodio:output_file={output} \
-  -Ppodio:output_collections=MCParticles,EcalEndcapNRecHits,EcalEndcapNClusters,EcalEndcapNParticleIDInput_features,EcalEndcapNParticleIDTarget
+  -Ppodio:output_collections=MCParticles,EcalEndcapNRecHits,EcalEndcapNClusters,EcalEndcapNParticleIDInput_features,EcalEndcapNParticleIDTarget,EcalEndcapNParticleIDOutput_probability_tensor
 """
 
 

From 2924d820cfdd9c7ec1961c530b540e83b4fcc011 Mon Sep 17 00:00:00 2001
From: Dmitry Kalinkin <dmitry.kalinkin@gmail.com>
Date: Wed, 18 Dec 2024 23:58:15 -0500
Subject: [PATCH 3/3] calo_pid: savefig inferred probability difference

---
 benchmarks/calo_pid/calo_pid.org | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmarks/calo_pid/calo_pid.org b/benchmarks/calo_pid/calo_pid.org
index 965af14..c9ec2fa 100644
--- a/benchmarks/calo_pid/calo_pid.org
+++ b/benchmarks/calo_pid/calo_pid.org
@@ -363,6 +363,7 @@ if "_EcalEndcapNParticleIDOutput_probability_tensor_floatData" in pi_train.field
     eval_proba = ak.concatenate([pi_eval_proba, e_eval_proba])
 
     plt.hist(clf.predict_proba(eval_x.to_numpy())[:,1] - eval_proba[:,1].to_numpy())
+    plt.savefig(output_dir / "proba_diff.pdf", bbox_inches="tight")  # persist the figure before plt.show()
     plt.show()
 else:
     print("EcalEndcapNParticleIDOutput not present")