Merge pull request #209 from JeffersonLab/aaust_comp_mod
Realistic projection for 2023-01, extrapolate to 2024-08, fix 2019-11
Showing 3 changed files with 591 additions and 2 deletions.
@@ -0,0 +1,295 @@
<!--
Spring 2023: GlueX-II
=====================================================================
triggerRate
75kHz
=====================================================================
runningTimeOnFloor
Total number of days: 67
=====================================================================
runningEfficiency
Total running efficiency: 42%
=====================================================================
eventsize
Running hdevio_scan on a raw data file gives:
hdevio_scan /cache/halld/RunPeriod-2023-01/rawdata/Run121207/hd_rawdata_121207_000.evio
No run number given, trying to extract from filename: /cache/halld/RunPeriod-2023-01/rawdata/Run121207/hd_rawdata_121207_000.evio
Processing file 1/1 : /cache/halld/RunPeriod-2023-01/rawdata/Run121207/hd_rawdata_121207_000.evio
Mapping EVIO file ...
EVIO Statistics for /cache/halld/RunPeriod-2023-01/rawdata/Run121207/hd_rawdata_121207_000.evio :
Nblocks: 337
Nevents: 39114
Nerrors: 0
Nbad_blocks: 0
Nbad_events: 0
EVIO file size: 19073 MB
EVIO block map size: 1857 kB
first event: 1
last event: 3128840
block levels = 40
events per block = 1-5,19,127-136,142,162,169,179
Nsync = 0
Nprestart = 1
Ngo = 1
Npause = 0
Nend = 0
Nepics = 0
Nbor = 1
Nphysics = 1564440
Nunknown = 0
blocks with unknown tags = 0
which gives for the avg. event size: 19073/1564440 = 0.0122 MB/event or 12.2kB
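As a quick cross-check, the average can be reproduced from the file size and
physics-event count reported above (a minimal Python sketch; variable names
are just for illustration):
  file_size_MB = 19073        # EVIO file size reported by hdevio_scan
  n_physics    = 1564440      # Nphysics count reported by hdevio_scan
  print(round(file_size_MB / n_physics * 1000.0, 1))   # -> 12.2 kB/event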
=====================================================================
eventsPerRun
Number of events (in millions) in a production run: average 400M
=====================================================================
RESTfraction
This is based on looking at several REST file sizes on the cache disk
e.g.
2867323 /cache/halld/RunPeriod-2017-01/recon/ver02/REST/030739/dana_rest_030739_000.hddm
2950430 /cache/halld/RunPeriod-2017-01/recon/ver02/REST/030749/dana_rest_030749_000.hddm
2857726 /cache/halld/RunPeriod-2017-01/recon/ver02/REST/030769/dana_rest_030769_000.hddm
2915293 /cache/halld/RunPeriod-2017-01/recon/ver02/REST/030787/dana_rest_030787_000.hddm
2857342 /cache/halld/RunPeriod-2017-01/recon/ver02/REST/030788/dana_rest_030788_000.hddm
2796538 /cache/halld/RunPeriod-2017-01/recon/ver02/REST/030823/dana_rest_030823_000.hddm
The raw data files are all very similar in size to: 19570207
thus: 2.86/19.57 = 14.6%
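The ratio can be reproduced from the sizes listed above (a Python sketch;
the units of the listing cancel in the ratio):
  rest_sizes = [2867323, 2950430, 2857726, 2915293, 2857342, 2796538]  # REST file sizes
  raw_size   = 19570207                                                # typical raw data file size
  print(round(sum(rest_sizes) / len(rest_sizes) / raw_size, 3))        # -> 0.147, i.e. ~15% as used below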
=====================================================================
goodRunFraction
This represents the fraction of the full dataset considered good production
runs. We get this from the ratio of the CPU used for the two recon passes
from the record (https://halldweb.jlab.org/data_monitoring/launch_analysis/index.html)
to that calculated assuming all beamtime was used to collect production
data:
(1.5743+1.3669+1.7672+1.5498)/7.4 = 0.85
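The same arithmetic as a Python sketch (interpreting the four terms as the
recorded CPU, in Mhr, of the individual recon batches is an assumption based
on the farm usage numbers further down):
  recon_cpu_Mhr    = [1.5743, 1.3669, 1.7672, 1.5498]  # recorded recon CPU per batch (Mhr)
  all_beamtime_Mhr = 7.4                                # CPU if all beamtime had been production data
  print(round(sum(recon_cpu_Mhr) / all_beamtime_Mhr, 2))   # -> 0.85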
=====================================================================
reconstructionRate
Directly measured on gluons gives something close to 5.2Hz/core. The
5.0 number is from memory of a calculation I did based on some numbers
from one of the launches documented here:
https://halldweb.jlab.org/data_monitoring/launch_analysis/index.html
I assume the discrepancy is due to inclusion of hyperthreads in the
farm number.
=====================================================================
reconPasses
Number of reconstruction passes. We did 2 full recon passes of the
2017 data.
https://halldweb.jlab.org/data_monitoring/launch_analysis/index.html
=====================================================================
analysisRate
This is estimated by looking at the total CPU of the first analysis
pass of 2017 data compared to the total CPU of the first recon pass
and using that to scale the 5Hz recon rate:
(5Hz)*(1.5743+1.3669)/(0.1954) = 75Hz
Note that this will depend on what channels are included in the pass.
Some passes only added channels and therefore took less time. This
number represents the rate for the first pass which would have been the
slowest rate.
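The scaling as a Python sketch, using the CPU totals (in Mhr) quoted above
and in the goodRunFraction section:
  recon_rate_Hz    = 5.0                 # measured reconstruction rate per core
  recon_cpu_Mhr    = 1.5743 + 1.3669     # first recon pass (both batches)
  analysis_cpu_Mhr = 0.1954              # first analysis pass
  print(round(recon_rate_Hz * recon_cpu_Mhr / analysis_cpu_Mhr))   # -> 75 Hz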
=====================================================================
analysisPasses
For 2017 there were 8 versions, but only 5 had data at
https://halldweb.jlab.org/data_monitoring/launch_analysis/index.html
Presumably the other 3 were minor enough as to not warrant bookkeeping.
As noted above, not all passes were the same. The first was the most
inclusive, but others only added some channels and therefore used
much less CPU. The final analysis launch looks to have taken the same
amount of time as the first, but was only run on about half of the files.
The value here is empirical to represent an equivalent number of passes
to match the total CPU of the 5 recorded passes.
(0.551 Mhr)/(0.1954) = 2.82
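The equivalent-pass count as a Python sketch (0.551 Mhr is taken to be the
summed CPU of the 5 recorded analysis launches):
  total_analysis_Mhr = 0.551     # total CPU of the recorded analysis launches
  first_pass_Mhr     = 0.1954    # CPU of the first (most inclusive) pass
  print(round(total_analysis_Mhr / first_pass_Mhr, 2))   # -> 2.82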
=====================================================================
cores
The average number of cores available to us varied between the different
launches/batches due to competition for the farm at the time. The
number of threads per job was 24. The number of active jobs varied
from 100-300, which would correspond to 2400 to 7200 cores. This would
include some hyperthreading.
The value of 4500 was based partially on the above and partially
on an estimate of the time jobs were active in each batch. The
following are taken from eyeballing the "active" curve on the plot
"Number of jobs in each stage since launch":
410 + 350 + 350 + 320 = 1430 hr = 8.5 weeks
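A rough consistency check in Python, under the assumption that the
~6.56 Mcore-hr total from the farm usage section below was delivered
during the ~1430 hours of active time eyeballed above:
  active_hr    = 410 + 350 + 350 + 320    # eyeballed active time per batch (hr)
  used_core_hr = 6.56e6                   # total recon CPU from the launches (core-hr)
  print(active_hr, round(active_hr / (7 * 24), 1))   # -> 1430 hr, 8.5 weeks
  print(round(used_core_hr / active_hr))             # -> 4587, roughly the 4500 used below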
=====================================================================
incomingData
proportional to number of runs
Number of files per run analyzed for the "incoming data" jobs. This
is always 5.
=====================================================================
calibRate
proportional to time on floor
This value represents the number of Mhr of CPU used per week of running
to calibrate the detector. For 2017 data, the gxproj3 account (Sean)
used 2Mhr. Additional time was used by individual accounts for
calibration that is not as easy to categorize. Tegan B. was the biggest
user with 7.4% of the 26.3Mhr, some fraction of this for calibration.
For this value we assume 3Mhr/5.7 weeks = 0.526
It should be noted that during the discussion on this at the Offline
meeting on 2018-06-15 there was general thinking that we should be
able to calibrate with far less CPU in the future. This number is
higher partly because we were still developing the technique and partly
because the farm resource was not freely available at the time.
=====================================================================
offlineMonitoring
proportional to number of runs
A total of about 2.3 Mhr was used for Offline Monitoring jobs of 2017
data. This consisted of a couple of dozen runs with various conditions
and amounts of data for each. If we took 289 production runs (based
on 0.893PB total data, 24TB/run, and 85% good run fraction) then the
offline monitoring used about 0.00800Mhr per run.
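The per-run figure as a Python sketch, using the totals quoted above:
  total_monitoring_Mhr = 2.3    # total CPU of the Offline Monitoring jobs (Mhr)
  n_production_runs    = 289    # production run count quoted above
  print(round(total_monitoring_Mhr / n_production_runs, 5))   # -> 0.00796, i.e. about 0.008 Mhr/run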
=====================================================================
miscUserStudies
proportional to time to process all files of a single run
This value is used to capture the CPU usage by all of the various users
that is attributed to the gluex project. Some of this should probably
go under calibRate, but it is very hard to categorize which parts of
this should go there.
It is assumed here that these are jobs that run over all files from a
small number of runs in order to do special studies. The amount of CPU
required is therefore proportional to the time it takes to process a
single production run. This number is empirical, based on 2017 CPU usage.
There is about a 9 Mhr discrepancy between the total usage (26.3 Mhr) and the
shared account usage (16.4 Mhr). We attribute 1 Mhr of that to Tegan's
calibrations in the calibRate value above.
9Mhr/( (200M events)/(5Hz)/(3600s/hr) ) = 810
Note that this is not to say that there were 810 studies, but rather,
this is the proportionality constant for the CPU usage that is
proportional to processing a single run.
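The proportionality constant as a Python sketch (9 Mhr of otherwise
unattributed CPU divided by the wall time needed to process one
200M-event run at 5 Hz):
  unattributed_Mhr = 9.0                      # CPU not accounted for by the shared accounts
  hours_per_run    = 200e6 / 5.0 / 3600.0     # time to process one 200M-event run at 5 Hz (hr)
  print(round(unattributed_Mhr * 1e6 / hours_per_run))   # -> 810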
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" | ||
===================================================================== | ||
Actual Farm CPU usage | ||
By way of comparison of the calculation to the actual farm usage for | ||
the recon launches, recon numbers are obtained from: | ||
https://halldweb.jlab.org/data_monitoring/launch_analysis/index.html | ||
Full Recon. | ||
ver01: 3.19Mcore-hr | ||
batch1: mean CPU/job=68.55hr Njobs=23337 | ||
batch2: mean CPU/job=71.16hr Njobs=22411 | ||
ver02: 3.37Mcore-hr | ||
batch1: mean CPU/job=76.95hr Njobs=23262 | ||
batch2: mean CPU/job=80.68hr Njobs=19569 | ||
Total: 6.56 Mcore-hr n.b. this will include hyperthreads and failed jobs | ||
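The per-version totals follow from the per-batch numbers (a Python sketch;
Mcore-hr is mean CPU per job times Njobs, divided by 1e6):
  batches = {
      "ver01": [(68.55, 23337), (71.16, 22411)],   # (mean CPU/job in hr, Njobs)
      "ver02": [(76.95, 23262), (80.68, 19569)],
  }
  total = 0.0
  for ver, jobs in batches.items():
      mcore_hr = sum(cpu * n for cpu, n in jobs) / 1e6
      total += mcore_hr
      print(ver, round(mcore_hr, 2))    # -> ver01 3.19, ver02 3.37
  print("total", round(total, 2))       # -> 6.56 Mcore-hr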
=====================================================================
Actual Tape usage
The total amount of raw data was 911TB (from memory, since the scicomp
page is down). This number includes special runs, among them some
tests by Sasha after the beam was gone. Other non-production running
was also mixed in, which would cause this number to be higher than the
estimate calculated by the model.
=====================================================================
simulationRate
This is based on a very rough value Thomas B. gave of 40ms/event for
bggen events with real data background mixed in. Note that adding the
background this way significantly reduced the compute time required
compared to previous models.
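The 25 Hz parameter value below follows directly from this estimate:
  time_per_event_s = 0.040               # ~40 ms per bggen event with real background mixed in
  print(round(1.0 / time_per_event_s))   # -> 25 Hz per core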
=====================================================================
simulationpasses
Number of times we will need to repeat simulation. This value of
2 is an old estimate.
=====================================================================
simulatedPerRawEvent
Number of simulated events needed for each raw data event (production
runs only). This is assumed to be 2 simulated events for each signal
event in the raw data stream. We estimate about 20% of the raw data
is reconstructable (see "GlueX at High Intensity" talk slide 10
here: https://halldweb.jlab.org/wiki/index.php/GlueX-II_and_DIRC_ERR )
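The 0.4 parameter value below combines the two numbers above:
  sims_per_signal_event    = 2      # simulated events per reconstructable signal event
  reconstructable_fraction = 0.20   # estimated fraction of raw events that are reconstructable
  print(sims_per_signal_event * reconstructable_fraction)   # -> 0.4 simulated events per raw event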
-->
<compMod>
<parameter name="triggerRate" value="75e3" units="Hz"/>
<parameter name="runningTimeOnFloor" value="67.0" units="days"/>
<parameter name="runningEfficiency" value="0.42"/>
<parameter name="eventsize" value="12.2" units="kB"/>
<parameter name="eventsPerRun" value="400" units="Mevent"/>
<parameter name="compressionFactor" value="1.0"/>
<parameter name="RESTfraction" value="0.15"/>

<parameter name="reconstructionRate" value="10.0" units="Hz"/>
<parameter name="reconPasses" value="2.0"/>
<parameter name="goodRunFraction" value="0.85"/>
<parameter name="analysisRate" value="75.0" units="Hz"/>
<parameter name="analysisPasses" value="2.82"/>
<parameter name="cores" value="4500"/>
<parameter name="incomingData" value="5" units="files"/>
<parameter name="calibRate" value="0.530" units="Mhr/week"/>
<parameter name="offlineMonitoring" value="0.00800" units="Mhr/run"/>
<parameter name="miscUserStudies" value="810"/>

<parameter name="simulationRate" value="25" units="Hz"/>
<parameter name="simulationpasses" value="2"/>
<parameter name="simulatedPerRawEvent" value="0.4"/>
</compMod>