Skip to content

Commit

Permalink
Merge pull request #1259 from cta-observatory/rf_event_weighting
Browse files Browse the repository at this point in the history
Computation of pointing-dependent weights for MC events
  • Loading branch information
moralejo authored Jun 21, 2024
2 parents 6ea82fe + 5974bb8 commit 609a977
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 7 deletions.
4 changes: 4 additions & 0 deletions lstchain/data/lstchain_standard_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@
"fraction_cleaning_intensity": 0.03
},

"random_forest_weight_settings": {
"pointing_wise_weights": true
},

"random_forest_energy_regressor_args": {
"max_depth": 30,
"min_samples_leaf": 10,
Expand Down
40 changes: 33 additions & 7 deletions lstchain/reco/dl1_to_dl2.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,8 @@ def train_energy(train, custom_config=None):

reg = model(**energy_regression_args)
reg.fit(train[features],
train['log_mc_energy'])
train['log_mc_energy'],
sample_weight=train['weight'])

logger.info("Model {} trained!".format(model))
return reg
Expand Down Expand Up @@ -118,7 +119,7 @@ def train_disp_vector(train, custom_config=None, predict_features=None):
reg = model(**disp_regression_args)
x = train[features]
y = np.transpose([train[f] for f in predict_features])
reg.fit(x, y)
reg.fit(x, y, sample_weight=train['weight'])

logger.info("Model {} trained!".format(model))

Expand Down Expand Up @@ -152,7 +153,7 @@ def train_disp_norm(train, custom_config=None, predict_feature='disp_norm'):
reg = model(**disp_regression_args)
x = train[features]
y = np.transpose(train[predict_feature])
reg.fit(x, y)
reg.fit(x, y, sample_weight=train['weight'])

logger.info("Model {} trained!".format(model))

Expand Down Expand Up @@ -186,7 +187,7 @@ def train_disp_sign(train, custom_config=None, predict_feature='disp_sign'):
clf = model(**classification_args)
x = train[features]
y = np.transpose(train[predict_feature])
clf.fit(x, y)
clf.fit(x, y, sample_weight=train['weight'])

logger.info("Model {} trained!".format(model))

Expand Down Expand Up @@ -223,15 +224,17 @@ def train_reco(train, custom_config=None):

reg_energy = model(**energy_regression_args)
reg_energy.fit(train[energy_features],
train['log_mc_energy'])
train['log_mc_energy'], sample_weight = train['weight']
)

logger.info("Random Forest trained!")
logger.info("Given disp_features: ", disp_features)
logger.info("Training Random Forest Regressor for disp_norm Reconstruction...")

reg_disp = RandomForestRegressor(**disp_regression_args)
reg_disp.fit(train[disp_features],
train['disp_norm'])
train['disp_norm'],
sample_weight=train['weight'])

logger.info("Random Forest trained!")
logger.info("Done!")
Expand Down Expand Up @@ -267,7 +270,8 @@ def train_sep(train, custom_config=None):
clf = model(**classification_args)

clf.fit(train[features],
train['mc_type'])
train['mc_type'],
sample_weight=train['weight'])
logger.info("Random Forest trained!")
return clf

Expand Down Expand Up @@ -345,6 +349,12 @@ def build_models(filegammas, fileprotons,
# Adding a filter on mc_type just for training
events_filters['mc_type'] = [-9000, np.inf]

pointing_wise_weights = False
if 'random_forest_weight_settings' in config:
if config['random_forest_weight_settings']['pointing_wise_weights']:
logger.info("Pointing-wise event weighting activated")
pointing_wise_weights = True

df_gamma = pd.read_hdf(filegammas, key=dl1_params_lstcam_key)
df_proton = pd.read_hdf(fileprotons, key=dl1_params_lstcam_key)

Expand Down Expand Up @@ -431,6 +441,14 @@ def build_models(filegammas, fileprotons,
src_r_max = config['train_gamma_src_r_deg'][1]
df_gamma = utils.apply_src_r_cut(df_gamma, src_r_min, src_r_max)

if pointing_wise_weights:
# Give same total weight to all events in every pointing node, by
# applying event-wise weights which depend on the statistics per node
# The weight is written to a new column of df_gamma, called 'weight'
_, _ = utils.compute_rf_event_weights(df_gamma)
else:
df_gamma['weight'] = np.ones(len(df_gamma))

# Train regressors for energy and disp_norm reconstruction, only with gammas
n_gamma_regressors = config["n_training_events"]["gamma_regressors"]
if n_gamma_regressors not in [1.0, None]:
Expand Down Expand Up @@ -491,6 +509,14 @@ def build_models(filegammas, fileprotons,
"The requested number of protons for the classifier training is not valid."
) from e

if pointing_wise_weights:
# Give same total weight to all events in every pointing node, by
# applying event-wise weights which depend on the statistics per node
# The weight is written to a new column of df_gamma, called 'weight'
_, _ = utils.compute_rf_event_weights(df_proton)
else:
df_proton['weight'] = np.ones(len(df_proton))

test = pd.concat([testg, df_proton], ignore_index=True)

temp_reg_energy = train_energy(train, custom_config=config)
Expand Down
11 changes: 11 additions & 0 deletions lstchain/reco/tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,3 +237,14 @@ def test_apply_src_r_cut(simulated_dl1_file):
src_r_max = srcdep_config['train_gamma_src_r_deg'][1]
params = apply_src_r_cut(params, src_r_min, src_r_max)
assert (params.event_id.values == np.arange(100, 110, 1)).all()

def test_compute_rf_event_weights():
from lstchain.reco.utils import compute_rf_event_weights

alt_tel = np.append(3*[0.41109], 6*[0.53508])
az_tel = np.append(3*[1.32615], 6*[1.38195])

df = pd.DataFrame({"alt_tel": alt_tel, "az_tel": az_tel})
_, _ = compute_rf_event_weights(df)
np.testing.assert_array_equal(df['weight'],
np.append(3*[4.5/3], 6*[4.5/6]))
72 changes: 72 additions & 0 deletions lstchain/reco/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
"cartesian_to_polar",
"clip_alt",
"compute_alpha",
"compute_rf_event_weights",
"compute_theta2",
"expand_tel_list",
"extract_source_position",
Expand Down Expand Up @@ -862,3 +863,74 @@ def get_events_in_GTI(events, CatB_cal_table):
gti_mask = gti[events['calibration_id']]

return events[gti_mask]

def compute_rf_event_weights(events):
"""
Compute event-wise weights. Can be used for correcting for the different
statistics present in each pointing node of the MC training sample,
to avoid "jumps" in the performance of the random forests
Parameters
----------
events : `~pd.DataFrame`
DL1 parameters dataframe. The table is modified in place by the
addition of a column called 'weight' (unless it exists already). The
column contains an event-wise weight to be used in the Random Forest
training, to give each of the telescope pointings in the training sample
the same overall weight in the training.
Returns
-------
pointings: ndarray of shape (number_of_pointings, 2) Alt Az (in radians)
for each of the identified telescope pointings in the input MC sample
weight_per_pointing: ndarray [number_of_pointings] weight for each of the
identified pointings. The weight is equal to the mean number of training
events per node divided by the number of training events in the specific
node. If used as sample_weight in scikit-learn, each node will have the
same total weight in the training of the Random Forests
"""

if 'weight' in events.columns:
log.warning("compute_rf_event_weights: DL2 table already contains")
log.warning("a column called weight. It will NOT be overwritten!")
return None, None

# Add a 'weight' column to the input table
weights = np.array(np.ones(len(events)))

# First identify existing telescope pointings in the sample:
# Convert to degrees and round to avoid potential
# rounding issues:
alt = np.round(events['alt_tel'], decimals=5)
az = np.round(events['az_tel'], decimals=5)
pointings = np.unique([alt, az], axis=1).T

# Find the total statistics in each of the pointings:
stats = []
for tel_alt_az in pointings:
mask = (np.isclose(tel_alt_az[0], events['alt_tel'],
atol=1e-5, rtol=0) &
np.isclose(tel_alt_az[1], events['az_tel'],
atol=1e-5, rtol=0))
stats.append(mask.sum())

stats = np.array(stats)
weight_per_pointing = stats.mean() / stats

# Now set the weights.
# Weight in a given node will be mean_events_per_node / n_events_in_node

for ipointing, tel_alt_az in enumerate(pointings):
mask = (np.isclose(tel_alt_az[0], events['alt_tel'],
atol=1e-5, rtol=0) &
np.isclose(tel_alt_az[1], events['az_tel'],
atol=1e-5, rtol=0))
weights[mask] = weight_per_pointing[ipointing]

events['weight'] = weights

# return the identified pointings and weights set (for checks)
return pointings, weight_per_pointing

0 comments on commit 609a977

Please sign in to comment.