feat : add validation workflow & ROC/efficiency script
Ming-Yan committed Feb 9, 2024
1 parent fd79a80 commit bcb4893
Showing 25 changed files with 776 additions and 467 deletions.
169 changes: 0 additions & 169 deletions metadata/data_Summer22EE_Run3_2022_em_BTV_Comm_v2_NanoV12_noPF.json

Large diffs are not rendered by default.

413 changes: 413 additions & 0 deletions notebooks/getROC_eff.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions runner.py
@@ -91,9 +91,9 @@ def get_main_parser():
)
parser.add_argument(
"--isSyst",
-        default=None,
+        default=False,
type=str,
-        choices=[None, "all", "weight_only", "JERC_split"],
+        choices=[False, "all", "weight_only", "JERC_split"],
help="Run with systematics, all, weights_only(no JERC uncertainties included),JERC_split, None",
)
parser.add_argument("--isArray", action="store_true", help="Output root files")
@@ -229,7 +229,7 @@ def get_main_parser():
ogoutput = args.output
histoutdir = ogoutput.split(".")[0]
coffeaoutput = f"{histoutdir}/{ogoutput}"
-    outdir = histoutdir
+    outdir = "arrays_" + histoutdir
basename = ogoutput.replace(".coffea", "").replace("hists_", "")
if args.output == parser.get_default("output"):
index = args.samplejson.rfind("/") + 1
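
For illustration, here is how the path logic above resolves for a made-up --output value (the name "hists_validation.coffea" is hypothetical, not taken from the commit):

# Hypothetical walk-through of the output-path derivation in runner.py.
ogoutput = "hists_validation.coffea"
histoutdir = ogoutput.split(".")[0]                                # "hists_validation"
coffeaoutput = f"{histoutdir}/{ogoutput}"                          # "hists_validation/hists_validation.coffea"
outdir = "arrays_" + histoutdir                                    # array output now goes to "arrays_hists_validation"
basename = ogoutput.replace(".coffea", "").replace("hists_", "")   # "validation"
print(coffeaoutput, outdir, basename)
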
47 changes: 37 additions & 10 deletions scripts/fetch.py
@@ -37,6 +37,12 @@
help="For samples that are not published on DAS. If this option is set then the format of the --input file must be adjusted. It should be: \n dataset_name path_to_files.",
default=False,
)
+parser.add_argument(
+    "--testfile",
+    action="store_true",
+    help="Construct file list in the test directory. Specify the test directory path, create the json file for individual dataset",
+    default=False,
+)
parser.add_argument(
"--whitelist_sites",
help="White list fot sites",
@@ -48,9 +54,7 @@
default=None,
)
parser.add_argument(
"--save_",
help="Black list for sites",
default=None,
"--limit", help="Limit numbers of file to create json", default=None, type=int
)


@@ -86,9 +90,9 @@ def get_xrootd_sites_map():
if "prefix" not in proc:
if "rules" in proc:
for rule in proc["rules"]:
sites_xrootd_access[site["rse"]][
rule["lfn"]
] = rule["pfn"]
sites_xrootd_access[site["rse"]][rule["lfn"]] = (
rule["pfn"]
)
else:
sites_xrootd_access[site["rse"]] = proc["prefix"]
json.dump(sites_xrootd_access, open(".sites_map.json", "w"))
@@ -199,14 +203,16 @@ def getFilesFromDas(args):

if xrd is None:
raise Exception(f"No SITE available in the whitelist for file {dsname}")
+        if args.limit is not None:
+            flist = flist[: args.limit]
if dsname not in fdict:
fdict[dsname] = [xrd + f for f in flist if len(f) > 1]
else: # needed to collect all data samples into one common key "Data" (using append() would introduce a new element for the key)
fdict[dsname].extend([xrd + f for f in flist if len(f) > 1])
return fdict
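
A small self-contained sketch of the new --limit truncation and the fdict merging in getFilesFromDas (the redirector, dataset name, and file list below are placeholders, not real DAS output):

# Placeholder inputs standing in for a DAS file listing and a chosen xrootd site.
xrd = "root://xrootd.example.site/"
flist = ["/store/a.root", "/store/b.root", "/store/c.root"]
limit = 2                                   # plays the role of args.limit

if limit is not None:
    flist = flist[:limit]                   # keep only the first N files per dataset

fdict = {}
dsname = "DYJetsToLL"
if dsname not in fdict:
    fdict[dsname] = [xrd + f for f in flist if len(f) > 1]
else:
    # extend() keeps all data samples under one common key instead of
    # appending a nested list for each run period
    fdict[dsname].extend([xrd + f for f in flist if len(f) > 1])
print(fdict)
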


-def getFilesFromPath(args, lim=None):
+def getFilesFromPath(args):
fdict = {}
fset = []
with open(args.input) as fp:
@@ -224,8 +230,29 @@ def getFilesFromPath(args, lim=None):
ds = line.strip().split()
print("ds=", ds)
dataset = ds[0]
-            fdict[ds[0]] = getRootFilesFromPath(ds[1])
+            fdict[ds[0]] = getRootFilesFromPath(ds[1], args.limit)

return fdict


+def getTestlist(args):
+    fdict = {}
+    with open(args.input) as fp:
+        lines = fp.readlines()
+        for line in lines:
+            if line.startswith("#") or line.strip() == "":
+                continue
+            if not line.endswith("/"):
+                line = line + "/"
+            if "test" not in line:
+                print("You are not getting files in test directory")
+
+            dirs_in_test = os.popen(f"gfal-ls {line}").read().split("\n")
+            for s in dirs_in_test:
+                if s == "":
+                    continue
+                print("dataset: ", s)
+                fdict[s] = getRootFilesFromPath(line + s, 1)
+    return fdict
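
As a rough local stand-in for the gfal-ls based walk in getTestlist above (assumptions: os.listdir replaces gfal-ls, paths are local, and only the behaviour is approximated):

import os

def get_testlist_local(test_dir):
    # Approximation of getTestlist: one dataset per subdirectory of the test
    # path, keeping a single ROOT file per dataset as getRootFilesFromPath(..., 1) does.
    fdict = {}
    if "test" not in test_dir:
        print("You are not getting files in test directory")
    for s in sorted(os.listdir(test_dir)):
        sub = os.path.join(test_dir, s)
        if not os.path.isdir(sub):
            continue
        roots = sorted(f for f in os.listdir(sub) if f.endswith(".root"))
        fdict[s] = [os.path.join(sub, f) for f in roots[:1]]
    return fdict
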


@@ -331,12 +358,12 @@ def remove_bad_files(sample_dict, outname, remove_bad=True):
def main(args):
if args.from_path:
print("do it from path: ")

fdict = getFilesFromPath(args)
+    elif args.testfile:

+        fdict = getTestlist(args)
else:
fdict = getFilesFromDas(args)

# Check the any file lists empty
empty = True
for dsname, flist in fdict.items():
79 changes: 53 additions & 26 deletions scripts/missingFiles.py
@@ -2,27 +2,47 @@
import argparse


-parser = argparse.ArgumentParser(description='Check for missing hists_N.coffea files.')
-parser.add_argument('--jobName', '-j', type=str, required=True, help='Path to the folder containing jobnum_list.txt', default= 'jobs_DY_MC')
-parser.add_argument('--outputXrootdDir', '-o', type=str, required=True, help='Path to the folder containing hists_N.coffea files', default= 'DY_MC')
-parser.add_argument('--missingfilename', '-f', type=str, help='Name outputfile', default= None)
-parser.add_argument('--updateJDL', '-u', action="store_true", help='Update submit.jdl file')#, action=store_true)
-parser.add_argument('--test', '-t', action="store_true", help='test behaviour')
parser = argparse.ArgumentParser(description="Check for missing hists_N.coffea files.")
parser.add_argument(
"--jobName",
"-j",
type=str,
required=True,
help="Path to the folder containing jobnum_list.txt",
default="jobs_DY_MC",
)
parser.add_argument(
"--outputXrootdDir",
"-o",
type=str,
required=True,
help="Path to the folder containing hists_N.coffea files",
default="DY_MC",
)
parser.add_argument(
"--missingfilename", "-f", type=str, help="Name outputfile", default=None
)
parser.add_argument(
"--updateJDL", "-u", action="store_true", help="Update submit.jdl file"
) # , action=store_true)
parser.add_argument("--test", "-t", action="store_true", help="test behaviour")

args = parser.parse_args()

# Read the jobnum_list.txt and get all the job numbers
-jobFolder = 'jobs_'+args.jobName +'/'
-jobnum_list_file = jobFolder +'jobnum_list.txt'
+jobFolder = "jobs_" + args.jobName + "/"
+jobnum_list_file = jobFolder + "jobnum_list.txt"
if not os.path.isfile(jobnum_list_file):
print(f"The jobnum_list.txt file does not exist at the provided path: {jobnum_list_file}")
print(
f"The jobnum_list.txt file does not exist at the provided path: {jobnum_list_file}"
)
exit(1)

if not os.path.isdir(args.outputXrootdDir):
print(f"The folder path provided does not exist: {args.outputXrootdDir}")
exit(1)

-with open(jobnum_list_file, 'r') as file:
+with open(jobnum_list_file, "r") as file:
job_numbers = file.read().splitlines()

# List all the files in the folder
@@ -31,42 +51,49 @@
# Check for each number if the corresponding file exists
missing_files = []
for job_number in job_numbers:
-    expected_folder = f'hists_{job_number}'
-    fol_i = os.listdir(args.outputXrootdDir+'/'+expected_folder) if os.path.isdir(args.outputXrootdDir+'/'+expected_folder) else [""]
-    expected_file_name = expected_folder+'.coffea'
-    if expected_file_name not in files_in_folder and expected_file_name not in fol_i :
+    expected_folder = f"hists_{job_number}"
+    fol_i = (
+        os.listdir(args.outputXrootdDir + "/" + expected_folder)
+        if os.path.isdir(args.outputXrootdDir + "/" + expected_folder)
+        else [""]
+    )
+    expected_file_name = expected_folder + ".coffea"
+    if expected_file_name not in files_in_folder and expected_file_name not in fol_i:
missing_files.append(job_number)
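
Toy illustration of the missing-job check above (job numbers and directory listings are invented): a job counts as missing only when hists_N.coffea is found neither in the flat output directory nor inside a per-job hists_N/ subfolder.

# Invented inputs for the missing-job logic; no filesystem access needed.
job_numbers = ["0", "1", "2"]
files_in_folder = ["hists_0.coffea", "hists_2"]    # flat listing of outputXrootdDir
nested = {"hists_2": ["hists_2.coffea"]}           # listings of per-job subfolders

missing_files = []
for job_number in job_numbers:
    expected_folder = f"hists_{job_number}"
    fol_i = nested.get(expected_folder, [""])
    expected_file_name = expected_folder + ".coffea"
    if expected_file_name not in files_in_folder and expected_file_name not in fol_i:
        missing_files.append(job_number)
print(missing_files)                               # ['1']
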

-missingfilename = args.missingfilename.replace('.txt','')+'.txt' if args.missingfilename else 'missing_files_'+args.outputXrootdDir.replace('/','_')+'.txt'
-missingfileloc = jobFolder+ missingfilename
+missingfilename = (
+    args.missingfilename.replace(".txt", "") + ".txt"
+    if args.missingfilename
+    else "missing_files_" + args.outputXrootdDir.replace("/", "_") + ".txt"
+)
+missingfileloc = jobFolder + missingfilename

# Save the list of missing files to missing_files.txt
-if len(missing_files)<1:
+if len(missing_files) < 1:
print("All histograms in folder, file not being created")
exit()
else:
print("Job numbers missing:", missing_files, jobFolder)
if args.test:
exit()
-    with open(missingfileloc, 'w') as file:
+    with open(missingfileloc, "w") as file:
for missing_file in missing_files:
-            file.write(str(missing_file)+'\n')
+            file.write(str(missing_file) + "\n")

print(f"Missing files have been saved to ", missingfileloc)

-#Update the jdl file if -u option is on
+# Update the jdl file if -u option is on
if args.updateJDL:
print("am i storing", args.updateJDL)
-    jdl_file = 'submit.jdl'
-    jdl_loc = jobFolder+jdl_file
-    os.system('cp '+ jdl_loc+' '+jdl_loc.replace('.jdl','_all.jdl'))
-    with open(jdl_loc, 'r') as file:
+    jdl_file = "submit.jdl"
+    jdl_loc = jobFolder + jdl_file
+    os.system("cp " + jdl_loc + " " + jdl_loc.replace(".jdl", "_all.jdl"))
+    with open(jdl_loc, "r") as file:
filedata = file.read()

filedata = filedata.replace("jobnum_list.txt", missingfilename)

-    with open(jdl_loc, 'w') as file:
+    with open(jdl_loc, "w") as file:
file.write(filedata)

print(f"The file {jdl_loc} has been updated with the new missing filename.")

2 changes: 1 addition & 1 deletion setup.cfg
@@ -35,7 +35,7 @@ project_urls =
packages = find:
install_requires =
vector
-    coffea>=0.7.20
+    coffea==0.7.21


python_requires = <3.11
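
A quick environment check against the tightened pin (assumes coffea is installed and exposes __version__, as the 0.7.x releases do):

import coffea

# setup.cfg now requires exactly coffea 0.7.21 rather than >=0.7.20.
assert coffea.__version__ == "0.7.21", f"unexpected coffea version: {coffea.__version__}"
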
