-
Notifications
You must be signed in to change notification settings - Fork 16
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fitting overhaul #305
fitting overhaul #305
Changes from all commits
d16b6ad
41dccc6
815f6c3
5973bef
d51fd7f
7b6f536
5838636
2ce7830
a7d19ee
6ed7271
050aaec
5e1484f
ca2daba
5393a03
7062548
1beaf6b
6fa674a
6e587a6
5478678
8fc2938
fb47dde
7261abe
15702b4
9548134
03c9b36
f3d9222
f2266d5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
from .fitting import *
from .sinusoid import *

# Re-export the public API of the submodules at the package level.
# NOTE(review): `from .fitting import *` also binds the submodule name
# `fitting` on the package as an import side effect, which is what makes the
# `fitting.__all__` lookup below resolve — confirm this is intended rather
# than an explicit `from . import fitting`.
__all__ = []
__all__.extend(fitting.__all__)
__all__.extend(sinusoid.__all__)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,312 @@ | ||
import numpy as np
import functools
import collections
from typing import Dict, List, Optional, Tuple, Union
from scipy import optimize, stats

# Only the abstract fit base class is part of this module's public API.
__all__ = ['FitBase']
|
||
|
||
# TODO: type annotations! (What's the best way of annotating np arrays?) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is something I find annoying too -- I'd like to be able to say what dimensions I expect the array to be. I will send you a link to how I've handled this in an OxIonics project, but afaik there's nothing built into numpy for this. You'd just have to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've had a look at this as well in the context of NAC3, and came to the same conclusion; this isn't nicely supported in NumPy at all. There is some support for type annotations now (e.g. you can do |
||
# TODO: docstring formatting for sphinx | ||
class FitBase:
    """Abstract base class for non-linear least-squares fits.

    Subclasses define a fit model by implementing :meth func:,
    :meth get_params:, :meth get_default_bounds: and
    :meth _estimate_parameters: (and optionally the other ``get_default_*``
    hooks).

    Fit results are exposed as class properties. For example, a fit parameter
    'foo' is accessed via 'fit.foo', which returns a tuple with the fitted
    value of 'foo' and its estimated uncertainty.
    """
    def __init__(
        self,
        x=None,
        y=None,
        y_err=None,
        param_bounds: Optional[Dict[str, Tuple[float, float]]] = None,
        fixed_params: Optional[Dict[str, float]] = None,
        initial_values: Optional[Dict[str, float]] = None,
    ):
        """
        If `x` and `y` are provided we fit the provided dataset. `x`, `y` and
        `y_err` may be updated post construction to fit other datasets
        (c.f. :meth fit:).

        :param x: array of x-axis values.
        :param y: array of y-axis values.
        :param y_err: array of y-axis uncertainties.
        :param param_bounds: dictionary of tuples containing the lower and upper
            bounds for each parameter. If not specified, the defaults from
            :meth get_default_bounds: are used.
        :param fixed_params: dictionary specifying constant values for any
            non-floated parameters. If not specified, the defaults from
            :meth get_default_fixed_params: are used.
        :param initial_values: dictionary specifying initial parameter values to
            use in the fit. These override the values found by
            :meth _estimate_parameters:.
        """
        self._x = None if x is None else np.asarray(x)
        # Fixed: this previously tested `x is None`, so the y data was
        # mishandled whenever only one of x/y was supplied.
        self._y = None if y is None else np.asarray(y)
        self._y_err = None if y_err is None else np.asarray(y_err)

        self._validate_param_names(fixed_params)
        if fixed_params is not None:
            self._fixed_params = fixed_params
        else:
            self._fixed_params = self.get_default_fixed_params()
        self._free_params = [
            param for param in self.get_params()
            if param not in self._fixed_params.keys()
        ]

        # Bounds are only meaningful for floated parameters; user-supplied
        # bounds override the model defaults.
        self._validate_param_names(param_bounds)
        self._param_bounds = self.get_default_bounds()
        self._param_bounds.update(param_bounds or {})
        self._param_bounds = {
            param: bound
            for param, bound in self._param_bounds.items()
            if param in self._free_params
        }

        self._validate_param_names(initial_values, self._free_params)
        self._initial_values = initial_values or {}

        self._x_scale = None
        self._y_scale = None
        self._scale_factors = None
        self._p = {}
        # Fixed: this attribute was previously initialised (and reset in the
        # dataset setters) as `_perr`, while fit() and the parameter
        # properties use `_p_err`; pre-fit access raised AttributeError and
        # the setters never actually invalidated the uncertainties.
        self._p_err = {}

        def getter(name, instance):
            # Each parameter property returns (fitted value, uncertainty).
            return instance._p[name], instance._p_err[name]

        # NOTE(review): this installs the properties on the *class*, so all
        # instances of the same fit class share them — confirm this is
        # intended rather than per-instance attributes.
        cls = type(self)
        for param in self.get_params() + self.get_derived_params():
            setattr(cls, param, property(functools.partial(getter, param)))

        if self._x is not None and self._y is not None:
            self.fit()

    def fit(self) -> Tuple[Dict[str, float], Dict[str, float]]:
        """
        Fit the dataset and return the fitted parameter values and
        uncertainties as a tuple of dictionaries.
        """
        # Ignore non-finite (NaN/inf) points on either axis.
        valid_pts = np.logical_and(np.isfinite(self._x), np.isfinite(self._y))
        x = self._x[valid_pts]
        y = self._y[valid_pts]
        y_err = None if self._y_err is None else self._y_err[valid_pts]

        # Fixed: the scales were previously computed from the unfiltered
        # dataset, so a single NaN poisoned them despite the filtering above.
        self._x_scale = np.max(np.abs(x))
        self._y_scale = np.max(np.abs(y))
        self._scale_factors = self.get_default_scale_factors()
        assert set(self._scale_factors) == set(self.get_params())

        # Heuristic estimates, overridden by any user-supplied initial values,
        # then clipped into the parameter bounds (curve_fit requires
        # lower <= p0 <= upper).
        initial_values = self._estimate_parameters()
        initial_values.update(self._initial_values)
        for param in self._free_params:
            lower, upper = self._param_bounds[param]
            initial_values[param] = np.clip(initial_values[param], lower, upper)

        lower_bounds = [self._param_bounds[param][0] for param in self._free_params]
        upper_bounds = [self._param_bounds[param][1] for param in self._free_params]
        p0 = [initial_values[param] for param in self._free_params]

        # Rescale parameters to improve the conditioning of the optimisation.
        scale_factors = np.asarray(
            [self._scale_factors.get(param, 1) for param in self._free_params])
        p0 = np.asarray(p0) / scale_factors
        lower_bounds = np.asarray(lower_bounds) / scale_factors
        upper_bounds = np.asarray(upper_bounds) / scale_factors

        p, p_cov = optimize.curve_fit(
            f=functools.partial(self._func_free, scale_factors),
            xdata=x,
            ydata=y,
            p0=p0,
            sigma=y_err,
            absolute_sigma=y_err is not None,
            bounds=(lower_bounds, upper_bounds),
        )

        # Undo the rescaling. Fixed: cov[i, j] scales as s_i * s_j; the
        # previous `p_cov *= scale_factors**2` (row broadcast) was only
        # correct on the diagonal.
        p *= scale_factors
        p_cov *= np.outer(scale_factors, scale_factors)

        self._p_cov = p_cov
        p_err = np.sqrt(np.diag(p_cov))

        self._p = {param: value for param, value in zip(self._free_params, p)}
        self._p_err = {
            param: value for param, value in zip(self._free_params, p_err)
        }
        self._p.update(self._fixed_params)
        self._p_err.update({param: 0 for param in self._fixed_params.keys()})

        self._calculate_derived_params()
        assert set(self.get_params() + self.get_derived_params()) == set(self._p.keys())
        return self._p, self._p_err

    def _func_free(self, scale_factors, x, *args):
        """Evaluate the fit function given *scaled* values of the free
        parameters; fixed parameters take their stored constant values."""
        params = dict(self._fixed_params)
        p = np.asarray(args) * scale_factors
        params.update({k: v for k, v in zip(self._free_params, p)})
        return self.func(x, params)

    def evaluate(self, x_fit: Union[np.ndarray, int] = 100):
        """Evaluates the function along x-fit and returns the tuple
        (x_fit, y_fit) with the results.

        `x_fit` may either be a scalar or an array. If it is a scalar it gives
        the number of equally spaced points between `min(self.x)` and
        `max(self.x)`. Otherwise it gives the x-axis to use.
        """
        if np.isscalar(x_fit):
            x_fit = np.linspace(np.min(self._x), np.max(self._x), x_fit)
        y_fit = self.func(x_fit, self._p)
        return x_fit, y_fit

    @property
    def residuals(self):
        """Returns the fit residuals."""
        # Fixed: this previously called `_func_free(self._x, self._p)`, which
        # does not match that method's (scale_factors, x, *args) signature and
        # raised at runtime.
        return self._y - self.func(self._x, self._p)

    @property
    def chi2(self) -> Tuple[float, float]:
        """Chi-squared test.

        Assumes the values of `y` are independent and normally distributed.
        Returns the tuple (chi-squared, p-value).

        The p-value is the probability that the observed deviations from the
        fit could be observed by chance assuming the data is normally
        distributed (values close to unity indicate high-quality fits).

        :raises ValueError: if `y_err` is unknown, or if there are not more
            data points than free parameters.
        """
        if self._y_err is None:
            raise ValueError("Cannot calculate chi square without knowing y_err")

        # Degrees of freedom: data points minus floated parameters.
        n = len(self._x) - len(self._free_params)
        if n < 1:
            raise ValueError("Cannot calculate chi squared with "
                             f"{len(self._free_params)} fit parameters and only "
                             f"{len(self._x)} data points.")

        y_fit = self.evaluate(self._x)[1]
        chi_2 = np.sum(np.power((self._y - y_fit) / self._y_err, 2))
        p = stats.chi2.sf(chi_2, n)
        return chi_2, p

    def _calculate_derived_params(self):
        """
        Updates fit results with values and uncertainties for derived
        parameters. Subclasses with derived parameters override this.
        """
        pass

    @staticmethod
    def func(x, params):
        """The fit function."""
        raise NotImplementedError

    @staticmethod
    def get_params() -> List[str]:
        """Returns list of fit params"""
        raise NotImplementedError

    def get_free_params(self) -> List[str]:
        """Returns a list of the free (not held constant) fit parameters."""
        return self._free_params

    @staticmethod
    def get_derived_params() -> List[str]:
        """Returns list of derived parameters"""
        return []

    @staticmethod
    def get_default_fixed_params() -> Dict[str, float]:
        """Returns a dictionary mapping names of parameters which are not
        floated by default to their values.
        """
        return {}

    @staticmethod
    def get_default_bounds() -> Dict[str, Tuple[float, float]]:
        """
        Returns a dictionary mapping parameter names to a tuple of
        (lower, upper) bounds.
        """
        raise NotImplementedError

    @staticmethod
    def get_default_annotations():
        """Returns a dictionary of default annotations for the fit."""
        return {}

    def get_default_scale_factors(self) -> Dict[str, float]:
        """Returns a dictionary of default parameter scale factors."""
        return {param: 1 for param in self.get_params()}

    def _estimate_parameters(self) -> Dict[str, float]:
        """
        Returns a dictionary of estimates for the parameter values for the
        current dataset.
        """
        raise NotImplementedError

    def _validate_param_names(self, param_names, valid_params=None):
        """Raise a ValueError if `param_names` contains duplicate entries or
        names not in `valid_params` (default: all model parameters)."""
        if param_names is None:
            return

        valid_params = set(
            valid_params if valid_params is not None else self.get_params())

        # Fixed: duplicates were previously counted on a set, which can never
        # contain them. Count the original collection instead (list() so that
        # dict inputs are counted by key, not interpreted as a mapping of
        # counts by Counter).
        duplicates = [
            param for param, count in
            collections.Counter(list(param_names)).items() if count > 1
        ]
        if duplicates:
            raise ValueError(f"Duplicate parameters: {','.join(duplicates)}")

        params = set(param_names)
        invalid = params - params.intersection(valid_params)
        if invalid:
            raise ValueError(f"Invalid parameters: {','.join(invalid)}")

    @property
    def x(self):
        """ Dataset x axis """
        return self._x

    @x.setter
    def x(self, x):
        # Replacing the dataset invalidates any previous fit results.
        self._x = np.asarray(x)
        self._p = {}
        self._p_err = {}
        self._x_scale = None
        self._y_scale = None
        self._scale_factors = None

    @property
    def y(self):
        """ Dataset y axis """
        return self._y

    @y.setter
    def y(self, y):
        # Replacing the dataset invalidates any previous fit results.
        self._y = np.asarray(y)
        self._p = {}
        self._p_err = {}
        self._x_scale = None
        self._y_scale = None
        self._scale_factors = None

    @property
    def y_err(self):
        """ Dataset y-axis uncertainty """
        return self._y_err

    @y_err.setter
    def y_err(self, y_err=None):
        # Replacing the uncertainties invalidates any previous fit results.
        self._y_err = np.asarray(y_err) if y_err is not None else None
        self._p = {}
        self._p_err = {}
        self._x_scale = None
        self._y_scale = None
        self._scale_factors = None
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't dislike this (although the `dump_json` docstring would need adapting), but I wonder whether we can come up with some sort of best practices for using dataclasses/… in PYON. (I used JSON here instead of PYON on purpose to make it easier to load the result files from e.g. Mathematica or Julia, but that could be adapted.)

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You mean best practices within ndscan, or best practices for general PYON? If we do go down the route of replacing the dictionary schema with something like dataclasses, I was wondering about having a `Schema` (or `SerialisableDataClass`, or whatever) base class which helps to formalise some of this. That could then provide `to_dict` and `from_dict` (or `describe` and `decode`, or whatever) methods to help keep things uniform.