From 9fe3ece278bef89c99d9d35eb7bdf502d1a1bc2d Mon Sep 17 00:00:00 2001
From: regislebrun
Date: Thu, 16 May 2024 09:20:08 +0200
Subject: [PATCH] Improved KernelSmoothing

Now, a log transformation can be applied to ease the reconstruction of
skewed distributions. Added documentation & tests. Fixed minor errors.
Thanks to adutfoy@github.com.
---
 lib/etc/openturns.conf.in                     |  18 +-
 lib/src/Base/Common/ResourceMap.cxx           |   1 +
 .../Distribution/KernelSmoothing.cxx          |  82 +++++-
 .../openturns/KernelSmoothing.hxx             |  29 +-
 python/doc/bibliography.rst                   |   8 +
 ...ot_estimate_non_parametric_distribution.py |  57 +++-
 .../theory/data_analysis/data_analysis.rst    |  11 +-
 .../theory/data_analysis/kernel_smoothing.rst |  45 +++-
 python/src/KernelSmoothing_doc.i.in           | 248 ++++++++++++++----
 python/test/t_KernelSmoothing_std.py          |  18 ++
 10 files changed, 441 insertions(+), 76 deletions(-)

diff --git a/lib/etc/openturns.conf.in b/lib/etc/openturns.conf.in
index e6edf5c136..befee650c1 100644
--- a/lib/etc/openturns.conf.in
+++ b/lib/etc/openturns.conf.in
@@ -576,13 +576,14 @@
 [XML resource entries stripped during extraction: 7 lines removed, 8 lines added]
@@ -601,6 +602,9 @@
 [XML resource entries stripped during extraction: 3 lines added]
diff --git a/lib/src/Base/Common/ResourceMap.cxx b/lib/src/Base/Common/ResourceMap.cxx
index d7815c878c..82dfea8a17 100644
--- a/lib/src/Base/Common/ResourceMap.cxx
+++ b/lib/src/Base/Common/ResourceMap.cxx
@@ -1209,6 +1209,7 @@ void ResourceMap::loadDefaultConfiguration()
   addAsScalar("KernelSmoothing-CutOffPlugin", 5.0);
   addAsScalar("KernelSmoothing-RelativePrecision", 1.0e-5);
   addAsScalar("KernelSmoothing-ResidualPrecision", 1.0e-10);
+  addAsScalar("KernelSmoothing-DefaultShiftScale", 1.0e-5);
   addAsUnsignedInteger("KernelSmoothing-BinNumber", 1024);
   addAsUnsignedInteger("KernelSmoothing-MaximumIteration", 50);
   addAsUnsignedInteger("KernelSmoothing-SmallSize", 250);
diff --git a/lib/src/Uncertainty/Distribution/KernelSmoothing.cxx b/lib/src/Uncertainty/Distribution/KernelSmoothing.cxx
index fc5e944ae3..543bac887a 100644
--- a/lib/src/Uncertainty/Distribution/KernelSmoothing.cxx
+++ b/lib/src/Uncertainty/Distribution/KernelSmoothing.cxx
@@ -25,6 +25,8 @@
 #include "openturns/PersistentObjectFactory.hxx"
 #include "openturns/Brent.hxx"
 #include "openturns/MethodBoundEvaluation.hxx"
+#include "openturns/SymbolicFunction.hxx"
+#include "openturns/ParametricFunction.hxx"
 #include "openturns/Function.hxx"
 #include "openturns/HermiteFactory.hxx"
 #include "openturns/UniVariatePolynomial.hxx"
@@ -33,6 +35,7 @@
 #include "openturns/SobolSequence.hxx"
 #include "openturns/ResourceMap.hxx"
 #include "openturns/JointDistribution.hxx"
+#include "openturns/CompositeDistribution.hxx"
 #include "openturns/BlockIndependentDistribution.hxx"

 BEGIN_NAMESPACE_OPENTURNS
@@ -256,8 +259,39 @@ Distribution KernelSmoothing::build(const Sample & sample) const
 {
   // For 1D sample, use the rule that gives the best tradeoff between speed and precision
   if (sample.getDimension() == 1)
-    return build(sample, computeMixedBandwidth(sample));
-
+  {
+    if (useLogTransform_)
+    {
+      const Scalar skewness = sample.computeSkewness()[0];
+      const Scalar xMin = sample.getMin()[0];
+      const Scalar xMax = sample.getMax()[0];
+      const Scalar delta = (xMax - xMin) * std::max(SpecFunc::Precision, ResourceMap::GetAsScalar("KernelSmoothing-DefaultShiftScale"));
+      ParametricFunction transform;
+      ParametricFunction inverseTransform;
+      // Need to construct explicitly a Description to disambiguate the call
+      // to the SymbolicFunction constructor
+      const Description inVars = {"x", "shift"};
+      if (skewness >= 0.0)
+      {
+        const Scalar shift = delta - xMin;
+        transform = ParametricFunction(SymbolicFunction(inVars, {"log(x+shift)"}), {1}, {shift});
+        inverseTransform = ParametricFunction(SymbolicFunction(inVars, {"exp(x)-shift"}), {1}, {shift});
+      }
+      else
+      {
+        const Scalar shift = xMax + delta;
+        transform = ParametricFunction(SymbolicFunction(inVars, {"log(shift-x)"}), {1}, {shift});
+        inverseTransform = ParametricFunction(SymbolicFunction(inVars, {"shift - exp(x)"}), {1}, {shift});
+      }
+      const Sample transformedSample(transform(sample));
+      const Distribution transformedDistribution(build(transformedSample, computeMixedBandwidth(transformedSample)));
+      CompositeDistribution fitted(inverseTransform, transformedDistribution);
+      fitted.setDescription(sample.getDescription());
+      return fitted;
+    } // useLogTransform
+    else
+      return build(sample, computeMixedBandwidth(sample));
+  } // dimension 1
   // For nD sample, use the only available rule
   return build(sample, computeSilvermanBandwidth(sample));
 }
@@ -279,8 +313,8 @@ Distribution KernelSmoothing::build(const Sample & sample,
   if (xmin == xmax)
   {
     bandwidth_ = bandwidth;
-    KernelSmoothing::Implementation result(new Dirac(xmin));
-    result->setDescription(sample.getDescription());
+    Dirac result(xmin);
+    result.setDescription(sample.getDescription());
     return result;
   }
   Indices degenerateIndices;
@@ -549,6 +583,7 @@ TruncatedDistribution KernelSmoothing::buildAsTruncatedDistribution(const Sample
   // Now, work on the extended sample
   SampleImplementation newSample(newSampleData.getSize(), 1);
   newSample.setData(newSampleData);
+  newSample.setDescription(sample.getDescription());
   size = newSample.getSize();
   const Bool mustBin = binned_ && (dimension * std::log(1.0 * binNumber_) < std::log(1.0 * size));
   if (binned_ != mustBin) LOGINFO("Will not bin the data because the bin number is greater than the sample size");
@@ -585,6 +620,11 @@ void KernelSmoothing::setBoundaryCorrection(const Bool boundaryCorrection)
   boundingOption_ = (boundaryCorrection ? BOTH : NONE);
 }

+Bool KernelSmoothing::getBoundaryCorrection() const
+{
+  return (boundingOption_ != NONE);
+}
+
 /* Boundary correction accessor */
 void KernelSmoothing::setBoundingOption(const BoundingOption boundingOption)
 {
@@ -615,6 +655,38 @@ void KernelSmoothing::setAutomaticUpperBound(const Bool automaticUpperBound)
   automaticUpperBound_ = automaticUpperBound;
 }

+/* Binning accessors */
+void KernelSmoothing::setBinning(const Bool binned)
+{
+  binned_ = binned;
+}
+
+Bool KernelSmoothing::getBinning() const
+{
+  return binned_;
+}
+
+/* Bin number accessor */
+void KernelSmoothing::setBinNumber(const UnsignedInteger binNumber)
+{
+  if (binNumber < 2) throw InvalidArgumentException(HERE) << "Error: The number of bins=" << binNumber << " is less than 2.";
+  binNumber_ = binNumber;
+}
+
+UnsignedInteger KernelSmoothing::getBinNumber() const
+{
+  return binNumber_;
+}
+
+/* Use log transform accessor */
+void KernelSmoothing::setUseLogTransform(const Bool useLog)
+{
+  useLogTransform_ = useLog;
+}
+
+Bool KernelSmoothing::getUseLogTransform() const
+{
+  return useLogTransform_;
+}

 /* Method save() stores the object through the StorageManager */
 void KernelSmoothing::save(Advocate & adv) const
@@ -646,6 +718,8 @@ void KernelSmoothing::load(Advocate & adv)
   adv.loadAttribute("automaticLowerBound_", automaticLowerBound_);
   adv.loadAttribute("upperBound_", upperBound_);
   adv.loadAttribute("automaticUpperBound_", automaticUpperBound_);
+  if (adv.hasAttribute("useLogTransform_"))
+    adv.loadAttribute("useLogTransform_", useLogTransform_);
 }

 END_NAMESPACE_OPENTURNS
diff --git a/lib/src/Uncertainty/Distribution/openturns/KernelSmoothing.hxx b/lib/src/Uncertainty/Distribution/openturns/KernelSmoothing.hxx
index bb37d35817..a38f147894 100644
--- a/lib/src/Uncertainty/Distribution/openturns/KernelSmoothing.hxx
+++ b/lib/src/Uncertainty/Distribution/openturns/KernelSmoothing.hxx
@@ -75,20 +75,33 @@ public:
   /** Kernel accessor */
   Distribution getKernel() const;

-  /* Boundary correction accessor, shortcut for setBoundingOption(NONE) or setBoundingOption(BOTH) */
+  /** Boundary correction accessor, shortcut for setBoundingOption(NONE) or setBoundingOption(BOTH) */
   void setBoundaryCorrection(const Bool boundaryCorrection);
+  Bool getBoundaryCorrection() const;

-  /* Boundary correction accessor */
+  /** Boundary correction accessor */
   void setBoundingOption(const BoundingOption boundingOption);

-  /* Boundary accessor */
+  /** Boundary accessor */
   void setLowerBound(const Scalar lowerBound);
   void setUpperBound(const Scalar upperBound);

-  /* Automatic boundary accessor */
+  /** Automatic boundary accessor */
   void setAutomaticLowerBound(const Bool automaticLowerBound);
   void setAutomaticUpperBound(const Bool automaticUpperBound);

+  /** Binning accessors */
+  void setBinning(const Bool binned);
+  Bool getBinning() const;
+
+  /** Bin number accessor */
+  void setBinNumber(const UnsignedInteger binNumber);
+  UnsignedInteger getBinNumber() const;
+
+  /** Use log transform accessor */
+  void setUseLogTransform(const Bool useLog);
+  Bool getUseLogTransform() const;
+
   /** Compute the bandwidth according to Silverman's rule */
   Point computeSilvermanBandwidth(const Sample & sample) const;

@@ -124,13 +137,13 @@ private:
   Distribution kernel_;

   // Flag to tell if we compute a binned version of the estimator
-  Bool binned_;
+  Bool binned_ = false;

   // Number of bins in each dimension
-  UnsignedInteger binNumber_;
+  UnsignedInteger binNumber_ = ResourceMap::GetAsUnsignedInteger("KernelSmoothing-BinNumber");

   // Direction of the boundary treatment
-  BoundingOption boundingOption_;
+  BoundingOption boundingOption_ = NONE;

   // Known bounds
   Scalar lowerBound_;
@@ -138,6 +151,8 @@
   Scalar upperBound_;
   Bool automaticUpperBound_;

+  // Use log transform
+  Bool useLogTransform_ = false;
 }; /* class KernelSmoothing */

diff --git a/python/doc/bibliography.rst b/python/doc/bibliography.rst
index f0449f254d..c0076d5f96 100644
--- a/python/doc/bibliography.rst
+++ b/python/doc/bibliography.rst
@@ -72,6 +72,10 @@ Bibliography
     http://ceres-solver.org
 .. [chacon2018] Chacón, J. E., & Duong, T. (2018).
     *Multivariate kernel smoothing and its applications.* CRC Press.
+.. [charpentier2015] Charpentier, A., & Flachaire, E. (2015).
+    *Log-Transform Kernel Density Estimation of Income Distribution*, WP 2015-Nr 6,
+    AMSE Aix Marseille School of Economics.
+    `pdf `__
 .. [chihara1978] Chihara, T. S. (1978).
     *An introduction to orthogonal polynomials.* Dover publications.
 .. [chapelle2002] Chapelle, O., Vapnik, V., & Bengio, Y. (2002).
@@ -204,6 +208,10 @@ Bibliography
     *Global optimization of expensive black-box functions*,
     Journal of Global Optimization, 13(4), 455-492, 1998.
     `pdf `__
+.. [jones1993] M.C. Jones,
+    *Simple boundary correction for kernel density estimation*,
+    Statistics and Computing. Vol. 3, Issue 3, 1993, pp. 135-146,
+    https://doi.org/10.1007/BF00147776
 .. [Keutelian1991] Hovhannes Keutelian.
     *The Kolmogorov-Smirnov test when parameters are estimated from data*,
     30 April 1991, Fermilab.
diff --git a/python/doc/examples/data_analysis/distribution_fitting/plot_estimate_non_parametric_distribution.py b/python/doc/examples/data_analysis/distribution_fitting/plot_estimate_non_parametric_distribution.py
index fb8b8b2bb8..745c1435b5 100644
--- a/python/doc/examples/data_analysis/distribution_fitting/plot_estimate_non_parametric_distribution.py
+++ b/python/doc/examples/data_analysis/distribution_fitting/plot_estimate_non_parametric_distribution.py
@@ -204,7 +204,7 @@
 # Boundary corrections
 # --------------------
 #
-# We finish this example on an advanced feature of the kernel smoothing, the boundary corrections.
+# We detail here an advanced feature of the kernel smoothing, the boundary corrections.
 #

 # %%
@@ -257,5 +257,60 @@
 # %%
 # The boundary correction made has a remarkable impact on the quality of the estimate for the small values.

+# %%
+# Log-transform treatment
+# -----------------------
+#
+# We finish this example on another advanced feature of the kernel smoothing: the log-transform treatment.
+# This treatment is well suited to skewed distributions, which are challenging for kernel smoothing.
+#

+# %%
+# We consider several distributions which have significant skewness:
+distCollection = [ot.LogNormal(0.0, 2.5), ot.Beta(20000.5, 2.5, 0.0, 1.0), ot.Exponential(),
+                  ot.WeibullMax(1.0, 0.9, 0.0), ot.Mixture([ot.Normal(-1.0, 0.5), ot.Normal(1.0, 1.0)], [0.4, 0.6]),
+                  ot.Mixture([ot.LogNormal(-1.0, 1.0, -1.0), ot.LogNormal(1.0, 1.0, 1.0)], [0.2, 0.8])]

+# %%
+# For each distribution, we do the following steps:
+#
+# - we generate a sample of size 5000,
+# - we fit a kernel smoothing distribution without the log-transform treatment,
+# - we fit a kernel smoothing distribution with the log-transform treatment,
+# - we plot the real distribution and both nonparametric estimates.
+#
+# Other transformations could be used, but the log-transform is quite effective. If the skewness is moderate,
+# there is almost no change with respect to simple kernel smoothing. But if the skewness is large, the transformation performs
+# very well.
+# Note that, in addition, this transformation performs an automatic boundary correction.
+grid = ot.GridLayout(2, 3)
+ot.RandomGenerator.SetSeed(0)
+for i, distribution in enumerate(distCollection):
+    sample = distribution.getSample(5000)
+
+    # We draw the real distribution
+    graph = distribution.drawPDF()
+    graph.setLegends([distribution.getClassName()])
+    # We choose the default kernel
+    kernel = ot.KernelSmoothing()
+
+    # We activate no particular treatment
+    fitted = kernel.build(sample)
+    curve = fitted.drawPDF()
+    curve.setLegends(["Fitted"])
+    graph.add(curve)
+
+    # We activate the log-transform treatment
+    kernel.setUseLogTransform(True)
+    fitted = kernel.build(sample)
+    curve = fitted.drawPDF()
+    curve.setLegends(["Fitted LogTransform"])
+    curve = curve.getDrawable(0)
+    curve.setLineStyle("dashed")
+
+    graph.add(curve)
+    graph.setColors(ot.Drawable.BuildDefaultPalette(3))
+    grid.setGraph(i // 3, i % 3, graph)
+
+view = viewer.View(grid)
 plt.show()
diff --git a/python/doc/theory/data_analysis/data_analysis.rst b/python/doc/theory/data_analysis/data_analysis.rst
index 42d5e9c705..08a3e10833 100644
--- a/python/doc/theory/data_analysis/data_analysis.rst
+++ b/python/doc/theory/data_analysis/data_analysis.rst
@@ -14,14 +14,21 @@ Comparison of two samples
     qqplot_graph
     smirnov_test

-Estimation of a parametric model
---------------------------------
+Estimation of a nonparametric model
+-----------------------------------

 .. toctree::
     :maxdepth: 1

     empirical_cdf
     kernel_smoothing
+
+Estimation of a parametric model
+--------------------------------
+
+.. toctree::
+    :maxdepth: 1
+
     maximum_likelihood
     parametric_estimation
diff --git a/python/doc/theory/data_analysis/kernel_smoothing.rst b/python/doc/theory/data_analysis/kernel_smoothing.rst
index b580f5ab23..a8376218a6 100644
--- a/python/doc/theory/data_analysis/kernel_smoothing.rst
+++ b/python/doc/theory/data_analysis/kernel_smoothing.rst
@@ -610,7 +610,7 @@
 Another method is to use boundary kernels (see [chacon2018]_ page 76,
 [scott2015]_ page 157).

 In dimension 1, the boundary effects may be taken into account using
-a *reflection* or *mirroring* method (see [silverman1982]_ page 31).
+a *reflection* or *mirroring* method (see [silverman1982]_ page 31, [jones1993]_).
 The boundaries are automatically detected from the sample
 (with the *min* and *max* functions) and the kernel smoothed PDF
 is corrected in the boundary areas to remain within the boundaries,
@@ -629,6 +629,49 @@ according to the mirroring technique:

 - this last kernel smoothed PDF is truncated within the initial range :math:`[min, max]` (conditional PDF).

+Log-transform treatment
+~~~~~~~~~~~~~~~~~~~~~~~
+
+In this section, we consider a scalar random variable, i.e. :math:`d = 1`. This treatment is highly suited to skewed distributions,
+which are challenging for kernel smoothing. See [charpentier2015]_ for more details.
+
+We denote by :math:`(X_i)_{1 \leq i \leq n}` some independent random variates, identically distributed according to :math:`X`.
+
+The log-transform treatment maps each :math:`X_j` into :math:`Y_j` as follows:
+
+.. math::
+
+    Y_j = T(X_j) = \left |
+    \begin{array}{ll}
+      \log (X_j - \min_{i} X_i + \delta) & \mbox{if } \gamma_1(X) \geq 0\\
+      \log (\max_{i} X_i - X_j + \delta) & \mbox{if } \gamma_1(X) < 0
+    \end{array}
+    \right.
+
+where :math:`\gamma_1(X) = \dfrac{\Expect{\left( X - \mu\right)^3}}{\sigma^3}`
+is the skewness of :math:`X` with :math:`\mu = \Expect{X}`, :math:`\sigma^2 = \Var{X}`
+and :math:`\delta \in \Rset^+_*` the shift scale.
+
+Once a kernel smoothed distribution has been fitted on the transformed data, the fitted distribution of :math:`X`
+is built as :math:`T^{-1}(Y)` where :math:`Y` is distributed according to the kernel smoothed distribution.
+
+Given a sample :math:`(x_i)_{1 \leq i \leq n}` from :math:`X`, we denote by :math:`\hat{a} = \min_{i} x_i`,
+:math:`\hat{b} = \max_{i} x_i` and :math:`y_i = T(x_i)` for :math:`1 \leq i \leq n`.
+We build the kernel smoothing distribution of :math:`Y` using :math:`(y_i)_{1 \leq i \leq n}`,
+whose PDF is :math:`\hat{p}_Y` and CDF :math:`\hat{F}_Y`.
+
+We recover the PDF and CDF of :math:`X` as follows (written here for an increasing :math:`T`;
+for a decreasing :math:`T`, :math:`\hat{F}_X(x) = 1 - \hat{F}_Y(T(x))`):
+
+.. math::
+
+    \hat{F}_X(x) & = \hat{F}_Y(T(x)) \\
+    \hat{p}_X(x) & = \left| T'(x) \right| \hat{p}_Y(T(x))
+
+We note that this transformation also embeds a treatment of the boundaries, as the finite lower bound (in case of positive skewness)
+or the finite upper bound (in case of negative skewness) is mapped to infinity. Thus, there is no longer a boundary effect on the
+:math:`Y`-sample.
+
+
 Conclusion
 ~~~~~~~~~~
 The next table presents a summary of histogram, kernel smoothing and
diff --git a/python/src/KernelSmoothing_doc.i.in b/python/src/KernelSmoothing_doc.i.in
index 600a5d87df..cbd7a5beeb 100644
--- a/python/src/KernelSmoothing_doc.i.in
+++ b/python/src/KernelSmoothing_doc.i.in
@@ -1,4 +1,4 @@
-%feature("docstring") OT::KernelSmoothing
+%feature("docstring") OT::KernelSmoothing
 "Non parametric continuous distribution estimation by kernel smoothing.

 Refer to :ref:`kernel_smoothing`.
@@ -10,21 +10,23 @@
 kernel : :class:`~openturns.Distribution`, optional
 binned : bool, optional
     Activates the binning mechanism only in the univariate or bivariate cases. It allows one to speed up the manipulation of the density function of the resulting distribution. By default, the mechanism is activated.
 binNumber : int, :math:`binNumber \geq 2`, optional
-    Indicates the number of bins used by the bining mechanism. By default, OpenTURNS uses the values stored in the *ResourceMap*.
+    Indicates the number of bins used by the binning mechanism. By default, OpenTURNS uses the values stored in :class:`~openturns.ResourceMap`.
 boundaryCorrection : bool, optional
     Activates the boundary correction using the mirroring technique. By default, the correction is not provided.

 Notes
 -----
-The binning mechanism creates a regular grid of *binNumber* intervals in each
-dimension, then the unit weight of each point is linearly affected to the vertices
-of the bin containing the point (see [wand1994]_ appendix D, page 182).
-The `KernelSmoothing-BinNumber` key defines the default value of the
-number of bins used in the _binning_ algorithm to improve the evaluation speed.
+The binning mechanism is available in dimension 1 and 2 only. See the notes of the
+:meth:`setBinning` method for details.
+
+The boundary correction is available in dimension 1 only, and it is done using
+the mirroring technique (also known as the reflection correction).
+See the notes of the :meth:`setBoundingOption` method for
+details.

-The boundary correction is available only in one dimension, and it is done using
-the mirroring technique. See the notes of the :meth:`setBoundingOption` method for
+It is possible to apply a log-transformation on the data in dimension 1 only, and build the kernel smoothing
+distribution on the transformed data. See the notes of the :meth:`setUseLogTransform` method for
 details.

 When applied to multivariate samples, the kernel is the kernel product of the
@@ -78,7 +80,8 @@
 Nevertheless, the parameters can be manually set.
 Variants of the :meth:`build` method can be used when the distribution to build
 is expected to be of a certain type. In those cases however, the bandwidth must
 be user-specified.
-To use :meth:`buildAsTruncatedDistribution`, boundary correction must be enabled.
+To use :meth:`buildAsTruncatedDistribution`, boundary correction must be activated.
+To use the LogTransform treatment, activate it with :meth:`setUseLogTransform`.

 >>> distribution = ks.buildAsKernelMixture(sample, bandwidth)
 >>> print(distribution.getClassName())
 KernelMixture
 Mixture
 >>> distribution = ks.buildAsTruncatedDistribution(sample, bandwidth)
 >>> print(distribution.getClassName())
-TruncatedDistribution"
+TruncatedDistribution
+>>> ks.setUseLogTransform(True)
+>>> distribution = ks.build(sample)
+>>> print(distribution.getClassName())
+Distribution"

// ---------------------------------------------------------------------

%feature("docstring") OT::KernelSmoothing::buildAsKernelMixture
"Fit a kernel smoothing distribution on data.

Parameters
----------
sample : :class:`~openturns.Sample`
    Data on which the distribution is fitted. Any dimension.
bandwidth : :class:`~openturns.Point`
    Contains the bandwidth in each direction.

 Returns
 -------
-fittdDist : :class:`~openturns.KernelMixture`
+fittedDist : :class:`~openturns.KernelMixture`
     The fitted distribution.

 Notes
 -----
-It builds a :math:`~openturns.KernelMixture` using the given data and bandwidth regardless of the binning or boundary treatment flags.
+It builds a :class:`~openturns.KernelMixture` using the given data and bandwidth regardless of the binning or boundary treatment flags.

 Examples
 --------
@@ -132,12 +139,12 @@
 bandwidth : :class:`~openturns.Point`

 Returns
 -------
-fittdDist : :class:`~openturns.KernelMixture`
+fittedDist : :class:`~openturns.Mixture`
     The fitted distribution.

 Notes
 -----
-It builds a :math:`~openturns.Mixture` using the given bandwidth and a binning of the given data regardless of the bin number, the data size, the binning flag or boundary treatment flags. This method is available only for 1D or 2D samples.
+It builds a :class:`~openturns.Mixture` using the given bandwidth and a binning of the given data regardless of the bin number, the data size, the binning flag or boundary treatment flags. This method is available only for 1D or 2D samples.

 Examples
 --------
@@ -161,7 +168,7 @@
 bandwidth : :class:`~openturns.Point`

 Returns
 -------
-fittdDist : :class:`~openturns.TruncatedDistribution`
+fittedDist : :class:`~openturns.TruncatedDistribution`
     The estimated distribution as a :class:`~openturns.TruncatedDistribution`.

 Examples
 --------
@@ -186,7 +193,7 @@
 bandwidth : :class:`~openturns.Point`, optional

 Returns
 -------
-fittdDist : :class:`~openturns.Distribution`
+fittedDist : :class:`~openturns.Distribution`
     The fitted distribution.

 Notes
 -----
 According to the dimension of the data and the specified treatments, the resulting distribution differs.

 - If the sample is constant, a :class:`~openturns.Dirac` distribution is built.
-- If dimension > 2 or if no treatment has been asked for, a :class:`~openturns.KernelMixture` is built by calling *buildAsKernelMixture*.
-- If dimension = 1 and a boundary treatment has been asked for, a :class:`~openturns.TruncatedDistribution` is built by calling *buildAsTruncatedDistribution*
-- If dimension = 1 or 2 and no boundary treatment has been asked for, but a binning treatment has been asked for,
-  - If the sample size is greater than the bin number, then a :class:`~openturns.Mixture` is built by calling `buildAsMixture`
-  - Otherwise a :class:`~openturns.KernelMixture` is built by calling `buildAsKernelMixture`
+- In dimension 1:
+
+  - if no treatment is activated, a :class:`~openturns.KernelMixture` is built by using :meth:`buildAsKernelMixture`,
+  - if a boundary treatment is activated, a :class:`~openturns.TruncatedDistribution` is built by using :meth:`buildAsTruncatedDistribution`,
+  - if a log-transformation is activated, a :class:`~openturns.CompositeDistribution` is built by using :meth:`build`.
+
+- In dimension > 2:
+
+  - no treatment (boundary correction or log-transformation) is available. A :class:`~openturns.KernelMixture` is built by using :meth:`buildAsKernelMixture`.
+
+- In dimension 1 or 2, if a binning treatment is activated:
+
+  - if the sample size is greater than the bin number, then a :class:`~openturns.Mixture` is built by using :meth:`buildAsMixture`,
+  - otherwise a :class:`~openturns.KernelMixture` is built by using :meth:`buildAsKernelMixture`.

-The bandwidth selection depends on the dimension.
+The bandwidth selection depends on the dimension:

-- If dimension = 1, then `computeMixedBandwidth` is used.
-- Otherwise, then the only multivariate rule `computeSilvermanBandwidth` is used.
+- If the dimension is 1, then :meth:`computeMixedBandwidth` is used,
+- otherwise, the only multivariate rule :meth:`computeSilvermanBandwidth` is used.

 Examples
 --------
@@ -233,7 +249,7 @@ Compare the PDFs:
 Returns
 -------
 bandwidth : :class:`~openturns.Point`
-    Bandwidth used in each direction.
+    Bandwidth.
 "

// ---------------------------------------------------------------------

@@ -248,13 +264,45 @@
 kernel : :class:`~openturns.Distribution`

 // ---------------------------------------------------------------------

+%feature("docstring") OT::KernelSmoothing::getBoundaryCorrection
+"Accessor to the boundary correction flag.
+
+Returns
+-------
+boundaryCorrection : bool
+    Flag to tell if the boundary correction is activated.
+
+Notes
+-----
+This treatment is available in dimension 1 only."
+
+// ---------------------------------------------------------------------
+
 %feature("docstring") OT::KernelSmoothing::setBoundaryCorrection
 "Accessor to the boundary correction flag.

 Parameters
 ----------
 boundaryCorrection : bool
-    Activates the boundary correction using the mirroring technique."
+    Activates the boundary correction using the mirroring technique.
+
+Notes
+-----
+This treatment is available in dimension 1 only. See [jones1993]_ for more details.
+The *reflection* or *mirroring* method
+is used: the boundaries are automatically detected from the sample
+(with the *min* and *max* functions) and the kernel smoothed distribution
+is corrected in the boundary areas to remain within the boundaries,
+according to the mirroring technique:
+
+- the Scott bandwidth *h* is evaluated from the sample,
+- two sub-samples are extracted from the initial sample,
+  containing all the points within the range :math:`[min, min + h[` and :math:`]max-h, max]`,
+- both sub-samples are transformed into their symmetric samples with respect to their respective boundary:
+  this results in two samples within the ranges :math:`]min-h, min]` and :math:`[max, max+h[`,
+- a kernel smoothed PDF is built from the new sample composed of
+  the initial one and the two new ones, with the previous bandwidth *h*,
+- this last kernel smoothed PDF is truncated within the initial range :math:`[min, max]` (conditional PDF)."

// ---------------------------------------------------------------------

@@ -275,9 +323,9 @@
 The possible values for the bounding option are:

 - KernelSmoothing.UPPER or 2: apply the boundary correction to the upper bound
 - KernelSmoothing.BOTH or 3: apply the boundary correction to both bounds

-It applies only to 1D samples. Each bound can be defined by the user or computed
-automatically from the sample, see *setLowerBound*, *setUpperBound*,
-*setAutomaticLowerBound*, *setAutomaticUpperBound*."
+This treatment is available in dimension 1 only. Each bound can be defined by the user or computed
+automatically from the sample, see :meth:`setLowerBound`, :meth:`setUpperBound`,
+:meth:`setAutomaticLowerBound`, :meth:`setAutomaticUpperBound`."

// ---------------------------------------------------------------------

@@ -291,6 +339,7 @@
 lowerBound : float
     A user-defined lower bound to take into account for boundary correction.

 Notes
 -----
+This treatment is available in dimension 1 only.
 This method automatically sets the *automaticLowerBound* flag to *False*.
 The given value will be taken into account only if *boundingOption* is set to
 either 1 or 3. If the algorithm is applied to a sample with a minimum value
 less than the user-defined lower bound and the *automaticLowerBound* is set to

// ---------------------------------------------------------------------

@@ -306,10 +355,11 @@
 Parameters
 ----------
 upperBound : float
-    A user-defined lower bound to take into account for boundary correction.
+    A user-defined upper bound to take into account for boundary correction.

 Notes
 -----
+This treatment is available in dimension 1 only.
 This method automatically sets the *automaticUpperBound* flag to *False*.
 The given value will be taken into account only if *boundingOption* is set to
 either 2 or 3. If the algorithm is applied to a sample with a maximum value
 greater than the user-defined upper bound and the *automaticUpperBound* is set to

// ---------------------------------------------------------------------

@@ -325,7 +375,13 @@
 Parameters
 ----------
 automaticLowerBound : bool
-    Flag to tell if the user-defined lower bound has to be taken into account (value *False*) or if the minimum of the given sample has to be used (value *True*)."
+    Flag to tell if the lower bound is automatically calculated from the sample.
+
+Notes
+-----
+This treatment is available in dimension 1 only.
+The automatic lower bound is the minimum of the given sample. Otherwise,
+the user has to specify the lower bound."
// ---------------------------------------------------------------------

@@ -335,7 +391,96 @@
 Parameters
 ----------
 automaticUpperBound : bool
-    Flag to tell if the user-defined upper bound has to be taken into account (value *False*) or if the maximum of the given sample has to be used (value *True*)."
+    Flag to tell if the upper bound is automatically calculated from the sample.
+
+Notes
+-----
+This treatment is available in dimension 1 only.
+The automatic upper bound is the maximum of the given sample. Otherwise,
+the user has to specify the upper bound."
+
+// ---------------------------------------------------------------------
+
+%feature("docstring") OT::KernelSmoothing::getBinning
+"Accessor to the binning flag.
+
+Returns
+-------
+binning : bool
+    Flag to tell if the binning treatment is activated.
+
+Notes
+-----
+This treatment is available in dimension 1 and 2 only."
+
+// ---------------------------------------------------------------------
+
+%feature("docstring") OT::KernelSmoothing::setBinning
+"Accessor to the binning flag.
+
+Parameters
+----------
+binning : bool
+    Flag to tell if the binning treatment is activated.
+
+Notes
+-----
+This treatment is available in dimension 1 and 2 only.
+It creates a regular grid of *binNumber* intervals in each
+dimension, then the unit weight of each point is linearly assigned to the vertices
+of the bin containing the point (see [wand1994]_ appendix D, page 182).
+The `KernelSmoothing-BinNumber` key of the class :class:`~openturns.ResourceMap` defines the default value of the
+number of bins used in the *binning* algorithm to improve the evaluation speed."
+
+// ---------------------------------------------------------------------
+
+%feature("docstring") OT::KernelSmoothing::setUseLogTransform
+"Accessor to the log-transform flag.
+
+Parameters
+----------
+useLogTransform : bool
+    Flag to tell if the kernel smoothing distribution is built on the log-transformed data.
+
+Notes
+-----
+This treatment is available in dimension 1 only. See [charpentier2015]_ for more details.
+
+We denote by :math:`(X_i)_{1 \leq i \leq n}`
+some independent random variates, identically distributed according to :math:`X`.
+
+The log-transform treatment maps each :math:`X_j` into :math:`Y_j` as follows:
+
+.. math::
+
+    Y_j = T(X_j) = \left |
+    \begin{array}{ll}
+      \log (X_j - \min_{i} X_i + \delta) & \mbox{if } \gamma_1(X) \geq 0\\
+      \log (\max_{i} X_i - X_j + \delta) & \mbox{if } \gamma_1(X) < 0
+    \end{array}
+    \right.
+
+where :math:`\gamma_1(X) = \dfrac{\Expect{\left( X - \mu\right)^3}}{\sigma^3}`
+is the skewness of :math:`X` with :math:`\mu = \Expect{X}`, :math:`\sigma^2 = \Var{X}` and :math:`\delta \in \Rset^+_*` the shift
+scale. Its value is set by the `KernelSmoothing-DefaultShiftScale` key of the class :class:`~openturns.ResourceMap`.
+
+Once a kernel smoothed distribution has been fitted on the transformed data, the fitted distribution of :math:`X`
+is built as a :class:`~openturns.CompositeDistribution` from :math:`T^{-1}` and the kernel smoothed distribution."
+
+// ---------------------------------------------------------------------
+
+%feature("docstring") OT::KernelSmoothing::getUseLogTransform
+"Accessor to the log-transform flag.
+
+Returns
+-------
+useLogTransform : bool
+    Flag to tell if the kernel smoothing distribution is built on the log-transformed data.
+
+Notes
+-----
+This treatment is available in dimension 1 only."
// ---------------------------------------------------------------------

%feature("docstring") OT::KernelSmoothing::computeSilvermanBandwidth
"Compute the bandwidth according to the Silverman rule.

 Returns
 -------
 bandwidth : :class:`~openturns.Point`
-    Bandwidth which components are evaluated according to the Silverman rule
-    assuming a normal distribution.
-    The bandwidth uses a robust estimate of the
-    sample standard deviation, based on the interquartile range introduced
-    in :ref:`kernel_smoothing` (rather than the sample standard deviation).
-    This method can manage a multivariate sample and produces a
-    multivariate bandwidth.
+    Bandwidth computed according to the Silverman rule.
+
+Notes
+-----
+Each component of the bandwidth is evaluated according to the Silverman rule,
+assuming a normal distribution. The bandwidth uses a robust estimate of the
+sample standard deviation, based on the interquartile range introduced
+in :ref:`kernel_smoothing` (rather than the sample standard deviation).
+This method can manage a multivariate sample and produces a
+multivariate bandwidth.
 "

// ---------------------------------------------------------------------

@@ -360,11 +508,12 @@
 Returns
 -------
 bandwidth : :class:`~openturns.Point`
-    Bandwidth which components are evaluated according to the plugin rule.
+    Bandwidth computed according to the plug-in rule.

 Notes
 -----
-This plug-in method is based on the *solve-the-equation* rule from [sheather1991]_.
+Each component of the bandwidth is evaluated according to
+the plug-in rule. This plug-in rule is based on the *solve-the-equation* rule from [sheather1991]_.
 This method can take a lot of time for large samples, as the cost is quadratic with the sample size.

 Several keys of the :class:`~openturns.ResourceMap` are used by the [sheather1991]_ method.

   to estimate the bandwidth.
   It defines the absolute tolerance used by the solver
   to solve the nonlinear equation.
-
 - The `KernelSmoothing-MaximumIteration` key defines the maximum number of iterations used by the solver.
-
 - The `KernelSmoothing-RelativePrecision` key defines the relative tolerance.
-
 - The `KernelSmoothing-AbsolutePrecision` key defines the absolute tolerance.
-
 - The `KernelSmoothing-ResidualPrecision` key defines the absolute tolerance on the residual.
-
 - The `KernelSmoothing-CutOffPlugin` key is the cut-off value introduced in :ref:`kernel_smoothing`.

// ---------------------------------------------------------------------

@@ -415,11 +559,7 @@
 when the sample size is large.

 Let :math:`n` be the sample size.
 The estimator depends on the threshold sample size :math:`n_t` defined in the
-`KernelSmoothing-SmallSize` key of the :class:`~openturns.ResourceMap`.
-
+`KernelSmoothing-SmallSize` key of the :class:`~openturns.ResourceMap`:

-- If :math:`n \leq n_t`, i.e. for a small sample, we use the plugin solve-the-equation
-  method.
-
-- Otherwise, the *mixed* rule is used.
-"
+- if :math:`n \leq n_t`, i.e. for a small sample, we use the plugin solve-the-equation method,
+- otherwise, the *mixed* rule is used."
diff --git a/python/test/t_KernelSmoothing_std.py b/python/test/t_KernelSmoothing_std.py
index e0e0ef3385..2e861af324 100755
--- a/python/test/t_KernelSmoothing_std.py
+++ b/python/test/t_KernelSmoothing_std.py
@@ -1,6 +1,7 @@
 #! /usr/bin/env python

 import openturns as ot
+from openturns.testing import assert_almost_equal

 ot.TESTPREAMBLE()
@@ -229,3 +230,20 @@
 sample = distribution.getSample(30)
 h = factory.computePluginBandwidth(sample)[0]
 print("with reduced cutoff. h=%.6g" % (h))
+
+# test of logTransform
+for distribution in [ot.LogNormal(0.0, 2.5),
+                     ot.Beta(20000.5, 2.5, 0.0, 1.0),
+                     ot.Exponential(),
+                     ot.WeibullMax(1.0, 0.9, 0.0),
+                     ot.Mixture([ot.LogNormal(-1.0, 1.0, -1.0), ot.LogNormal(1.0, 1.0, 1.0)], [0.2, 0.8])]:
+    sample = distribution.getSample(10000)
+    kernel = ot.KernelSmoothing()
+    kernel.setUseLogTransform(True)
+    fitted = kernel.build(sample)
+    # check the fitted PDF against the exact one at the three quartiles
+    for level in [0.25, 0.5, 0.75]:
+        quantile = distribution.computeQuantile(level)
+        assert_almost_equal(distribution.computePDF(quantile), fitted.computePDF(quantile), 0.05)
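
For review purposes, the useLogTransform path of build() can be reproduced from the public Python API alone. The following is a minimal sketch, not part of the patch: it assumes a positively skewed sample (so the log(x + shift) branch is taken) and omits the SpecFunc::Precision guard used in the C++ code.

import openturns as ot

ot.RandomGenerator.SetSeed(0)
sample = ot.LogNormal(0.0, 2.5).getSample(1000)

# Same shift as the C++ code, driven by the new ResourceMap key
xMin = sample.getMin()[0]
xMax = sample.getMax()[0]
delta = (xMax - xMin) * ot.ResourceMap.GetAsScalar("KernelSmoothing-DefaultShiftScale")
shift = delta - xMin  # positive-skewness branch

# T(x) = log(x + shift) and its inverse, with "shift" frozen as a parameter
transform = ot.ParametricFunction(
    ot.SymbolicFunction(["x", "shift"], ["log(x + shift)"]), [1], [shift]
)
inverseTransform = ot.ParametricFunction(
    ot.SymbolicFunction(["x", "shift"], ["exp(x) - shift"]), [1], [shift]
)

# Fit on the transformed data, then push the estimate back through T^{-1}
manual = ot.CompositeDistribution(
    inverseTransform, ot.KernelSmoothing().build(transform(sample))
)

# The new one-liner; both constructions should give very close PDF values
ks = ot.KernelSmoothing()
ks.setUseLogTransform(True)
fitted = ks.build(sample)
print(manual.computePDF([1.0]), fitted.computePDF([1.0]))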
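The mirroring correction described in the setBoundaryCorrection notes can be checked the same way. This sketch hand-rolls the reflection steps of the docstring; it uses the Silverman bandwidth where the docstring mentions the Scott bandwidth, so the built-in result will differ slightly.

import openturns as ot

ot.RandomGenerator.SetSeed(0)
sample = ot.Exponential(1.0).getSample(500)
xMin, xMax = sample.getMin()[0], sample.getMax()[0]

# Bandwidth h used for the reflection windows
h = ot.KernelSmoothing().computeSilvermanBandwidth(sample)[0]

# Reflect the points lying within h of each boundary
data = [x[0] for x in sample]
left = [2.0 * xMin - x for x in data if x < xMin + h]
right = [2.0 * xMax - x for x in data if x > xMax - h]
extended = ot.Sample([[v] for v in data + left + right])

# KDE on the extended sample with the same bandwidth, truncated back to [min, max]
kde = ot.KernelSmoothing().build(extended, [h])
corrected = ot.TruncatedDistribution(kde, xMin, xMax)

# Built-in correction for comparison near the lower boundary
ks = ot.KernelSmoothing()
ks.setBoundaryCorrection(True)
builtin = ks.build(sample)
print(corrected.computePDF([0.05]), builtin.computePDF([0.05]))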
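The new binning accessors make the constructor flags adjustable after construction. A short usage sketch (the 1024 default comes from the KernelSmoothing-BinNumber key added to the ResourceMap above):

import openturns as ot

ks = ot.KernelSmoothing()
print(ks.getBinning(), ks.getBinNumber())  # current binning flag and bin number
ks.setBinning(True)
ks.setBinNumber(256)  # values < 2 now raise an InvalidArgumentException
sample = ot.Normal().getSample(100000)
fitted = ks.build(sample)
# with binning active and more points than bins, the implementation is a Mixture
print(fitted.getImplementation().getClassName())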
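Finally, the three bandwidth rules discussed in the docstrings can be compared side by side; a sketch on a Normal sample (exact values depend on the seed):

import openturns as ot

ot.RandomGenerator.SetSeed(0)
sample = ot.Normal().getSample(1000)
ks = ot.KernelSmoothing()
# Closed-form rule with a robust, interquartile-based scale; any dimension
print(ks.computeSilvermanBandwidth(sample))
# Solve-the-equation plug-in rule [sheather1991]; cost quadratic in the sample size
print(ks.computePluginBandwidth(sample))
# Mixed rule: plug-in on a small subsample (below KernelSmoothing-SmallSize),
# extrapolated to the full sample size; the default for 1D samples
print(ks.computeMixedBandwidth(sample))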