feat : add validation workflow & ROC/efficiency script
Ming-Yan committed Feb 9, 2024
1 parent fd79a80 commit bcb4893
Showing 25 changed files with 776 additions and 467 deletions.
169 changes: 0 additions & 169 deletions metadata/data_Summer22EE_Run3_2022_em_BTV_Comm_v2_NanoV12_noPF.json

Large diffs are not rendered by default.

413 changes: 413 additions & 0 deletions notebooks/getROC_eff.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions runner.py
@@ -91,9 +91,9 @@ def get_main_parser():
)
parser.add_argument(
"--isSyst",
-        default=None,
+        default=False,
type=str,
-        choices=[None, "all", "weight_only", "JERC_split"],
+        choices=[False, "all", "weight_only", "JERC_split"],
help="Run with systematics, all, weights_only(no JERC uncertainties included),JERC_split, None",
)
parser.add_argument("--isArray", action="store_true", help="Output root files")
@@ -229,7 +229,7 @@ def get_main_parser():
ogoutput = args.output
histoutdir = ogoutput.split(".")[0]
coffeaoutput = f"{histoutdir}/{ogoutput}"
-    outdir = histoutdir
+    outdir = "arrays_" + histoutdir
basename = ogoutput.replace(".coffea", "").replace("hists_", "")
if args.output == parser.get_default("output"):
index = args.samplejson.rfind("/") + 1
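
For illustration, here is how the path logic above resolves for a made-up --output value (the name "hists_validation.coffea" is hypothetical, not taken from the commit):

# Hypothetical walk-through of the output-path derivation in runner.py.
ogoutput = "hists_validation.coffea"
histoutdir = ogoutput.split(".")[0]                                # "hists_validation"
coffeaoutput = f"{histoutdir}/{ogoutput}"                          # "hists_validation/hists_validation.coffea"
outdir = "arrays_" + histoutdir                                    # array output now goes to "arrays_hists_validation"
basename = ogoutput.replace(".coffea", "").replace("hists_", "")   # "validation"
print(coffeaoutput, outdir, basename)
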
47 changes: 37 additions & 10 deletions scripts/fetch.py
@@ -37,6 +37,12 @@
help="For samples that are not published on DAS. If this option is set then the format of the --input file must be adjusted. It should be: \n dataset_name path_to_files.",
default=False,
)
+parser.add_argument(
+    "--testfile",
+    action="store_true",
+    help="Construct file list in the test directory. Specify the test directory path, create the json file for individual dataset",
+    default=False,
+)
parser.add_argument(
"--whitelist_sites",
help="White list fot sites",
@@ -48,9 +54,7 @@
default=None,
)
parser.add_argument(
"--save_",
help="Black list for sites",
default=None,
"--limit", help="Limit numbers of file to create json", default=None, type=int
)


@@ -86,9 +90,9 @@ def get_xrootd_sites_map():
if "prefix" not in proc:
if "rules" in proc:
for rule in proc["rules"]:
sites_xrootd_access[site["rse"]][
rule["lfn"]
] = rule["pfn"]
sites_xrootd_access[site["rse"]][rule["lfn"]] = (
rule["pfn"]
)
else:
sites_xrootd_access[site["rse"]] = proc["prefix"]
json.dump(sites_xrootd_access, open(".sites_map.json", "w"))
@@ -199,14 +203,16 @@ def getFilesFromDas(args):

if xrd is None:
raise Exception(f"No SITE available in the whitelist for file {dsname}")
+        if args.limit is not None:
+            flist = flist[: args.limit]
if dsname not in fdict:
fdict[dsname] = [xrd + f for f in flist if len(f) > 1]
else: # needed to collect all data samples into one common key "Data" (using append() would introduce a new element for the key)
fdict[dsname].extend([xrd + f for f in flist if len(f) > 1])
return fdict
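
A small self-contained sketch of the new --limit truncation and the fdict merging in getFilesFromDas (the redirector, dataset name, and file list below are placeholders, not real DAS output):

# Placeholder inputs standing in for a DAS file listing and a chosen xrootd site.
xrd = "root://xrootd.example.site/"
flist = ["/store/a.root", "/store/b.root", "/store/c.root"]
limit = 2                                   # plays the role of args.limit

if limit is not None:
    flist = flist[:limit]                   # keep only the first N files per dataset

fdict = {}
dsname = "DYJetsToLL"
if dsname not in fdict:
    fdict[dsname] = [xrd + f for f in flist if len(f) > 1]
else:
    # extend() keeps all data samples under one common key instead of
    # appending a nested list for each run period
    fdict[dsname].extend([xrd + f for f in flist if len(f) > 1])
print(fdict)
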


-def getFilesFromPath(args, lim=None):
+def getFilesFromPath(args):
fdict = {}
fset = []
with open(args.input) as fp:
@@ -224,8 +230,29 @@ def getFilesFromPath(args, lim=None):
ds = line.strip().split()
print("ds=", ds)
dataset = ds[0]
-            fdict[ds[0]] = getRootFilesFromPath(ds[1])
+            fdict[ds[0]] = getRootFilesFromPath(ds[1], args.limit)

return fdict


+def getTestlist(args):
+    fdict = {}
+    with open(args.input) as fp:
+        lines = fp.readlines()
+        for line in lines:
+            if line.startswith("#") or line.strip() == "":
+                continue
+            if not line.endswith("/"):
+                line = line + "/"
+            if "test" not in line:
+                print("You are not getting files in test directory")
+
+            dirs_in_test = os.popen(f"gfal-ls {line}").read().split("\n")
+            for s in dirs_in_test:
+                if s == "":
+                    continue
+                print("dataset: ", s)
+                fdict[s] = getRootFilesFromPath(line + s, 1)
+    return fdict
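
As a rough local stand-in for the gfal-ls based walk in getTestlist above (assumptions: os.listdir replaces gfal-ls, paths are local, and only the behaviour is approximated):

import os

def get_testlist_local(test_dir):
    # Approximation of getTestlist: one dataset per subdirectory of the test
    # path, keeping a single ROOT file per dataset as getRootFilesFromPath(..., 1) does.
    fdict = {}
    if "test" not in test_dir:
        print("You are not getting files in test directory")
    for s in sorted(os.listdir(test_dir)):
        sub = os.path.join(test_dir, s)
        if not os.path.isdir(sub):
            continue
        roots = sorted(f for f in os.listdir(sub) if f.endswith(".root"))
        fdict[s] = [os.path.join(sub, f) for f in roots[:1]]
    return fdict
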


@@ -331,12 +358,12 @@ def remove_bad_files(sample_dict, outname, remove_bad=True):
def main(args):
if args.from_path:
print("do it from path: ")

fdict = getFilesFromPath(args)
+    elif args.testfile:

+        fdict = getTestlist(args)
else:
fdict = getFilesFromDas(args)

# Check the any file lists empty
empty = True
for dsname, flist in fdict.items():
79 changes: 53 additions & 26 deletions scripts/missingFiles.py
@@ -2,27 +2,47 @@
import argparse


-parser = argparse.ArgumentParser(description='Check for missing hists_N.coffea files.')
-parser.add_argument('--jobName', '-j', type=str, required=True, help='Path to the folder containing jobnum_list.txt', default= 'jobs_DY_MC')
-parser.add_argument('--outputXrootdDir', '-o', type=str, required=True, help='Path to the folder containing hists_N.coffea files', default= 'DY_MC')
-parser.add_argument('--missingfilename', '-f', type=str, help='Name outputfile', default= None)
-parser.add_argument('--updateJDL', '-u', action="store_true", help='Update submit.jdl file')#, action=store_true)
-parser.add_argument('--test', '-t', action="store_true", help='test behaviour')
parser = argparse.ArgumentParser(description="Check for missing hists_N.coffea files.")
parser.add_argument(
"--jobName",
"-j",
type=str,
required=True,
help="Path to the folder containing jobnum_list.txt",
default="jobs_DY_MC",
)
parser.add_argument(
"--outputXrootdDir",
"-o",
type=str,
required=True,
help="Path to the folder containing hists_N.coffea files",
default="DY_MC",
)
parser.add_argument(
"--missingfilename", "-f", type=str, help="Name outputfile", default=None
)
parser.add_argument(
"--updateJDL", "-u", action="store_true", help="Update submit.jdl file"
) # , action=store_true)
parser.add_argument("--test", "-t", action="store_true", help="test behaviour")

args = parser.parse_args()

# Read the jobnum_list.txt and get all the job numbers
-jobFolder = 'jobs_'+args.jobName +'/'
-jobnum_list_file = jobFolder +'jobnum_list.txt'
+jobFolder = "jobs_" + args.jobName + "/"
+jobnum_list_file = jobFolder + "jobnum_list.txt"
if not os.path.isfile(jobnum_list_file):
print(f"The jobnum_list.txt file does not exist at the provided path: {jobnum_list_file}")
print(
f"The jobnum_list.txt file does not exist at the provided path: {jobnum_list_file}"
)
exit(1)

if not os.path.isdir(args.outputXrootdDir):
print(f"The folder path provided does not exist: {args.outputXrootdDir}")
exit(1)

-with open(jobnum_list_file, 'r') as file:
+with open(jobnum_list_file, "r") as file:
job_numbers = file.read().splitlines()

# List all the files in the folder
@@ -31,42 +51,49 @@
# Check for each number if the corresponding file exists
missing_files = []
for job_number in job_numbers:
-    expected_folder = f'hists_{job_number}'
-    fol_i = os.listdir(args.outputXrootdDir+'/'+expected_folder) if os.path.isdir(args.outputXrootdDir+'/'+expected_folder) else [""]
-    expected_file_name = expected_folder+'.coffea'
-    if expected_file_name not in files_in_folder and expected_file_name not in fol_i :
+    expected_folder = f"hists_{job_number}"
+    fol_i = (
+        os.listdir(args.outputXrootdDir + "/" + expected_folder)
+        if os.path.isdir(args.outputXrootdDir + "/" + expected_folder)
+        else [""]
+    )
+    expected_file_name = expected_folder + ".coffea"
+    if expected_file_name not in files_in_folder and expected_file_name not in fol_i:
missing_files.append(job_number)
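
Toy illustration of the missing-job check above (job numbers and directory listings are invented): a job counts as missing only when hists_N.coffea is found neither in the flat output directory nor inside a per-job hists_N/ subfolder.

# Invented inputs for the missing-job logic; no filesystem access needed.
job_numbers = ["0", "1", "2"]
files_in_folder = ["hists_0.coffea", "hists_2"]    # flat listing of outputXrootdDir
nested = {"hists_2": ["hists_2.coffea"]}           # listings of per-job subfolders

missing_files = []
for job_number in job_numbers:
    expected_folder = f"hists_{job_number}"
    fol_i = nested.get(expected_folder, [""])
    expected_file_name = expected_folder + ".coffea"
    if expected_file_name not in files_in_folder and expected_file_name not in fol_i:
        missing_files.append(job_number)
print(missing_files)                               # ['1']
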

-missingfilename = args.missingfilename.replace('.txt','')+'.txt' if args.missingfilename else 'missing_files_'+args.outputXrootdDir.replace('/','_')+'.txt'
-missingfileloc = jobFolder+ missingfilename
+missingfilename = (
+    args.missingfilename.replace(".txt", "") + ".txt"
+    if args.missingfilename
+    else "missing_files_" + args.outputXrootdDir.replace("/", "_") + ".txt"
+)
+missingfileloc = jobFolder + missingfilename

# Save the list of missing files to missing_files.txt
-if len(missing_files)<1:
+if len(missing_files) < 1:
print("All histograms in folder, file not being created")
exit()
else:
print("Job numbers missing:", missing_files, jobFolder)
if args.test:
exit()
-    with open(missingfileloc, 'w') as file:
+    with open(missingfileloc, "w") as file:
for missing_file in missing_files:
-            file.write(str(missing_file)+'\n')
+            file.write(str(missing_file) + "\n")

print(f"Missing files have been saved to ", missingfileloc)

-#Update the jdl file if -u option is on
+# Update the jdl file if -u option is on
if args.updateJDL:
print("am i storing", args.updateJDL)
-    jdl_file = 'submit.jdl'
-    jdl_loc = jobFolder+jdl_file
-    os.system('cp '+ jdl_loc+' '+jdl_loc.replace('.jdl','_all.jdl'))
-    with open(jdl_loc, 'r') as file:
+    jdl_file = "submit.jdl"
+    jdl_loc = jobFolder + jdl_file
+    os.system("cp " + jdl_loc + " " + jdl_loc.replace(".jdl", "_all.jdl"))
+    with open(jdl_loc, "r") as file:
filedata = file.read()

filedata = filedata.replace("jobnum_list.txt", missingfilename)

-    with open(jdl_loc, 'w') as file:
+    with open(jdl_loc, "w") as file:
file.write(filedata)

print(f"The file {jdl_loc} has been updated with the new missing filename.")

2 changes: 1 addition & 1 deletion setup.cfg
@@ -35,7 +35,7 @@ project_urls =
packages = find:
install_requires =
vector
-    coffea>=0.7.20
+    coffea==0.7.21


python_requires = <3.11
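
A quick environment check against the tightened pin (assumes coffea is installed and exposes __version__, as the 0.7.x releases do):

import coffea

# setup.cfg now requires exactly coffea 0.7.21 rather than >=0.7.20.
assert coffea.__version__ == "0.7.21", f"unexpected coffea version: {coffea.__version__}"
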
