Merge pull request #350 from johannbrehmer/weighted-histo
Histogram with weighted events
johannbrehmer authored Jun 12, 2019
2 parents 5e6a9a3 + fe3eafd commit e19984f
Showing 6 changed files with 378 additions and 221 deletions.
350 changes: 211 additions & 139 deletions examples/tutorial_particle_physics/4a_limits.ipynb

Large diffs are not rendered by default.

32 changes: 24 additions & 8 deletions madminer/analysis.py
@@ -151,7 +151,14 @@ def event_loader(
             yield data

     def weighted_events(
-        self, theta=None, nu=None, start_event=None, end_event=None, derivative=False, generated_close_to=None
+        self,
+        theta=None,
+        nu=None,
+        start_event=None,
+        end_event=None,
+        derivative=False,
+        generated_close_to=None,
+        n_draws=None,
     ):
         """
         Returns all events together with the benchmark weights (if theta is None) or weights for a given theta.
@@ -177,6 +184,12 @@ def weighted_events(
             If True and if theta is not None, the derivatives of the weights with respect to theta are returned. Default
             value: False.
+        generated_close_to : None or int, optional
+            Only returns events generated from this benchmark (and background events). Default value: None.
+        n_draws : None or int, optional
+            If not None, returns only this number of events, drawn randomly.

         Returns
         -------
         x : ndarray
@@ -194,29 +207,32 @@ def weighted_events(
             self.event_loader(batch_size=None, start=start_event, end=end_event, generated_close_to=generated_close_to)
         )

+        # Pick events randomly
+        n_events = len(x)
+        if n_draws is not None and n_draws < n_events:
+            idx = np.random.choice(n_events, n_draws, replace=False)
+            x = x[idx]
+            weights_benchmarks = weights_benchmarks[idx]
+        elif n_draws is not None:
+            logger.warning("Requested %s events, but only %s available", n_draws, n_events)
+
         # Process and return appropriate weights
         if theta is None:
             return x, weights_benchmarks

         elif isinstance(theta, six.string_types):
             i_benchmark = list(self.benchmarks.keys()).index(theta)
             return x, weights_benchmarks[:, i_benchmark]

         elif derivative:
             dtheta_matrix = self._get_dtheta_benchmark_matrix(theta)

             gradients_theta = mdot(dtheta_matrix, weights_benchmarks)  # (n_gradients, n_samples)
             gradients_theta = gradients_theta.T

             return x, gradients_theta

         else:
             # TODO: nuisance params
             if nu is not None:
                 raise NotImplementedError

             theta_matrix = self._get_theta_benchmark_matrix(theta)
             weights_theta = mdot(theta_matrix, weights_benchmarks)

             return x, weights_theta

     def xsecs(
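The core of the analysis.py change is the optional random subsampling in weighted_events: n_draws trims the sample before the weights are processed, which keeps downstream histogram filling cheap for large event files. A minimal usage sketch, assuming a MadMiner HDF5 file (the path and the benchmark name "sm" are placeholders, not taken from this commit):

    from madminer.analysis import DataAnalyzer

    analyzer = DataAnalyzer("data/madminer_data.h5")  # placeholder path

    # All events with their per-benchmark weights
    x, weights_benchmarks = analyzer.weighted_events()

    # New in this commit: a random subset of 1000 events (logs a warning if fewer are available)
    x_sub, w_sub = analyzer.weighted_events(n_draws=1000)

    # Weights evaluated for a single named benchmark ("sm" is a placeholder name)
    x_sm, w_sm = analyzer.weighted_events(theta="sm")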
91 changes: 75 additions & 16 deletions madminer/limits.py
@@ -43,11 +43,12 @@ def observed_limits(
         include_xsec=True,
         resolutions=25,
         luminosity=300000.0,
-        n_toys_per_theta=10000,
+        n_histo_toys=10000,
         returns="pval",
         dof=None,
         n_observed=None,
         histo_theta_batchsize=100,
+        weighted_histo=True,
     ):
         if n_observed is None:
             n_observed = len(x_observed)
@@ -63,10 +64,11 @@
             include_xsec,
             None,
             luminosity,
-            n_toys_per_theta,
+            n_histo_toys,
             returns=returns,
             dof=dof,
             histo_theta_batchsize=histo_theta_batchsize,
+            weighted_histo=weighted_histo,
         )
         return theta_grid, return_values, i_ml

@@ -81,10 +83,11 @@ def expected_limits(
         include_xsec=True,
         resolutions=25,
         luminosity=300000.0,
-        n_toys_per_theta=10000,
+        n_histo_toys=10000,
         returns="pval",
         dof=None,
         histo_theta_batchsize=100,
+        weighted_histo=True,
     ):
         logger.info("Generating Asimov data")
         x_asimov, x_weights = self._asimov_data(theta_true)
@@ -102,11 +105,12 @@ def expected_limits(
             include_xsec,
             x_weights,
             luminosity,
-            n_toys_per_theta,
+            n_histo_toys,
             returns=returns,
             dof=dof,
             histo_theta_batchsize=histo_theta_batchsize,
             theta_true=theta_true,
+            weighted_histo=weighted_histo,
         )
         return theta_grid, return_values, i_ml

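Taken together, the public-facing change in limits.py is one rename and one new switch: n_toys_per_theta becomes n_histo_toys, and weighted_histo chooses between the new weighted filling (the default) and the old per-theta toy sampling. A hedged sketch of the calling convention (AsymptoticLimits is the class in madminer.limits; the file path and the elided arguments are placeholders):

    from madminer.limits import AsymptoticLimits

    limits = AsymptoticLimits("data/madminer_data.h5")  # placeholder path

    # Before this commit: limits.expected_limits(..., n_toys_per_theta=10000)
    # Now: the keyword is n_histo_toys, and weighted_histo=False restores toy sampling
    # limits.expected_limits(..., n_histo_toys=10000, weighted_histo=True)  # other args elided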
@@ -130,11 +134,12 @@ def _analyse(
         include_xsec=True,
         obs_weights=None,
         luminosity=300000.0,
-        n_toys_per_theta=10000,
+        n_histo_toys=10000,
         returns="pval",
         dof=None,
         histo_theta_batchsize=100,
         theta_true=None,
+        weighted_histo=True,
     ):
         logger.debug("Calculating p-values for %s expected events", n_events)

@@ -152,6 +157,7 @@ def _analyse(
         # Kinematic part
         if mode == "rate":
             log_r_kin = 0.0
+
         elif mode == "ml":
             assert model_file is not None
             logger.info("Loading kinematic likelihood ratio estimator")
@@ -184,8 +190,9 @@
             hist_bins,
             theta_grid,
             theta_resolutions,
-            n_toys_per_theta,
+            n_histo_toys,
             histo_theta_batchsize=histo_theta_batchsize,
+            weighted_histo=weighted_histo,
         )

         logger.info("Calculating kinematic log likelihood with histograms")
@@ -285,22 +292,74 @@ def _make_theta_grid(theta_ranges, resolutions):
         return theta_grid

     def _make_histo(
-        self, summary_function, x_bins, theta_grid, theta_bins, n_toys_per_theta=1000, histo_theta_batchsize=100
+        self,
+        summary_function,
+        x_bins,
+        theta_grid,
+        theta_bins,
+        n_histo_toys=1000,
+        histo_theta_batchsize=100,
+        weighted_histo=True,
     ):
         logger.info("Building histogram with %s bins per parameter and %s bins per observable", theta_bins, x_bins)
         histo = Histo(theta_bins, x_bins)
-        logger.debug("Generating histo data")
-        theta, summary_stats = self._make_histo_data(
-            summary_function, theta_grid, n_toys_per_theta, histo_theta_batchsize=histo_theta_batchsize
-        )
-        logger.debug(
-            "Histo data has theta dimensions %s and summary stats dimensions %s", theta.shape, summary_stats.shape
-        )
+        if weighted_histo:
+            logger.debug("Generating weighted histo data")
+            theta, summary_stats, weights = self._make_weighted_histo_data(summary_function, theta_grid, n_histo_toys)
+            logger.debug(
+                "Histo data has theta dimensions %s, summary stats dimensions %s, and weight dimension %s",
+                theta.shape,
+                summary_stats.shape,
+                weights.shape,
+            )
+        else:
+            logger.debug("Generating sampled histo data")
+            theta, summary_stats = self._make_sampled_histo_data(
+                summary_function, theta_grid, n_histo_toys, histo_theta_batchsize=histo_theta_batchsize
+            )
+            weights = None
+            logger.debug(
+                "Histo data has theta dimensions %s and summary stats dimensions %s", theta.shape, summary_stats.shape
+            )

         logger.debug("Filling histogram with summary statistics")
-        histo.fit(theta, summary_stats, fill_empty_bins=True)
+        histo.fit(theta, summary_stats, weights=weights, fill_empty_bins=True, epsilon=1.0e-3)

         return histo

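This branch is the heart of the change: with weighted_histo=True the histogram is filled once from the weighted event sample, so each event contributes at every theta grid point through a theta-dependent weight; with weighted_histo=False the old behaviour of sampling unweighted toys per grid point is kept. A self-contained numpy illustration of the two filling strategies (a sketch with illustrative names, independent of madminer's Histo class):

    import numpy as np

    rng = np.random.default_rng(42)
    summary_stats = rng.normal(size=1000)        # one summary statistic per event
    weights = rng.uniform(0.5, 1.5, size=1000)   # theta-dependent event weights
    bins = np.linspace(-3.0, 3.0, 21)

    # Weighted filling: one pass over the events, the weights carry the theta dependence
    hist_weighted, _ = np.histogram(summary_stats, bins=bins, weights=weights, density=True)

    # Sampled filling (old behaviour): draw unweighted toys with probability proportional to weight
    idx = rng.choice(len(summary_stats), size=10000, p=weights / weights.sum())
    hist_sampled, _ = np.histogram(summary_stats[idx], bins=bins, density=True)

    # Both estimate the same density; the weighted fill avoids the extra sampling noise
    print(np.abs(hist_weighted - hist_sampled).max())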
-    def _make_histo_data(self, summary_function, thetas, n_toys_per_theta, test_split=0.2, histo_theta_batchsize=100):
+    def _make_weighted_histo_data(self, summary_function, thetas, n_toys, test_split=0.2):
+        n_thetas = len(thetas)
+
+        # Get weighted events
+        start_event, end_event, _ = self._train_test_split(True, test_split)
+        x, weights_benchmarks = self.weighted_events(start_event=start_event, end_event=end_event, n_draws=n_toys)
+
+        # Calculate summary stats
+        summary_stats = summary_function(x)
+
+        # Calculate weights for thetas
+        weights = self._weights(thetas, None, weights_benchmarks)
+
+        # Reformat for histos
+        all_thetas, all_weights, all_summary_stats = [], [], []
+        for theta, this_weights in zip(thetas, weights):
+            all_thetas.append(np.asarray([theta] * len(this_weights)))
+            all_weights.append(this_weights)
+            all_summary_stats.append(summary_stats)
+        all_thetas = np.concatenate(all_thetas, 0)
+        all_weights = np.concatenate(all_weights, 0)
+        all_summary_stats = np.concatenate(all_summary_stats, 0)
+
+        # Rescale weights to be 1 on average
+        all_weights = all_weights / np.mean(all_weights)
+
+        return all_thetas, all_summary_stats, all_weights
+
+    def _make_sampled_histo_data(
+        self, summary_function, thetas, n_toys_per_theta, test_split=0.2, histo_theta_batchsize=100
+    ):
         sampler = SampleAugmenter(self.madminer_filename, include_nuisance_parameters=self.include_nuisance_parameters)
         all_summary_stats, all_theta = None, None

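The reformatting loop in _make_weighted_histo_data pairs every theta on the grid with the full event sample: each theta is tiled across the events, the summary statistics are repeated once per theta, and the per-theta event weights are concatenated and rescaled to average 1, as in the commit. A standalone numpy sketch of that broadcast pattern (all names and shapes are illustrative):

    import numpy as np

    n_events, n_thetas, theta_dim = 5, 3, 2
    summary_stats = np.random.normal(size=(n_events, 1))               # summary stats per event
    thetas = np.random.uniform(-1.0, 1.0, size=(n_thetas, theta_dim))  # theta grid
    weights = np.random.uniform(0.1, 2.0, size=(n_thetas, n_events))   # weight of each event at each theta

    all_thetas = np.concatenate([np.tile(theta, (n_events, 1)) for theta in thetas], axis=0)
    all_summary_stats = np.concatenate([summary_stats] * n_thetas, axis=0)
    all_weights = np.concatenate(list(weights), axis=0)

    # Rescale weights to be 1 on average
    all_weights = all_weights / np.mean(all_weights)

    print(all_thetas.shape, all_summary_stats.shape, all_weights.shape)  # (15, 2) (15, 1) (15,)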
(Diffs for the remaining three changed files are not shown.)
