From 71797379f64c39fa2b63599a1b6dc1ab7f9a5679 Mon Sep 17 00:00:00 2001 From: Michael BAUDIN Date: Thu, 27 Apr 2023 12:46:30 +0200 Subject: [PATCH] Doc: Added 3 examples on regression analysis Create a new Linthurst use case for regression --- ChangeLog | 2 + python/doc/bibliography.rst | 3 + .../plot_stepwise.py | 80 +--- .../plot_chaos_cv.py | 60 ++- .../general_methods/plot_pce_design.py | 445 ++++++++++++++++++ .../plot_regression_interval.py | 271 +++++++++++ .../general_methods/plot_regression_sinus.py | 436 +++++++++++++++++ python/doc/usecases/use_case_linthurst.rst | 31 ++ python/doc/usecases/usecases.rst | 2 +- python/src/usecases/linthurst.py | 79 ++++ 10 files changed, 1340 insertions(+), 69 deletions(-) create mode 100755 python/doc/examples/numerical_methods/general_methods/plot_pce_design.py create mode 100755 python/doc/examples/numerical_methods/general_methods/plot_regression_interval.py create mode 100755 python/doc/examples/numerical_methods/general_methods/plot_regression_sinus.py create mode 100644 python/doc/usecases/use_case_linthurst.rst create mode 100644 python/src/usecases/linthurst.py diff --git a/ChangeLog b/ChangeLog index f085ed9625..085a6ce300 100644 --- a/ChangeLog +++ b/ChangeLog @@ -38,6 +38,7 @@ * UniformOverMesh (openturns.experimental) * GeneralizedExtremeValueValidation (openturns.experimental) * Coles (openturns.usecases.coles) + * Linthurst (openturns.usescases.linthurst) ==== API changes ==== * Deprecated LinearLeastSquaresCalibration::getCandidate, use getStartingPoint @@ -79,6 +80,7 @@ === Documentation === * Add API documentation to common use cases pages + * Added new use case: Linthurst === Python module === diff --git a/python/doc/bibliography.rst b/python/doc/bibliography.rst index 0afb9ec5b2..988c1482fe 100644 --- a/python/doc/bibliography.rst +++ b/python/doc/bibliography.rst @@ -33,6 +33,9 @@ Bibliography uncertainty propagation and sensitivity analysis.*, PhD thesis. Blaise Pascal University-Clermont II, France, 2009. `pdf `__ +.. [blatman2011] Blatman, G., and Sudret, B.. + *Adaptive sparse polynomial chaos expansion based on least angle regression.* + Journal of Computational Physics 230 (2011) 2345–2367. .. [burnham2002] Burnham, K.P., and Anderson, D.R. *Model Selection and Multimodel Inference: A Practical Information Theoretic Approach*, Springer, 2002. diff --git a/python/doc/examples/meta_modeling/general_purpose_metamodels/plot_stepwise.py b/python/doc/examples/meta_modeling/general_purpose_metamodels/plot_stepwise.py index 328aa7f566..2a00cc8a26 100644 --- a/python/doc/examples/meta_modeling/general_purpose_metamodels/plot_stepwise.py +++ b/python/doc/examples/meta_modeling/general_purpose_metamodels/plot_stepwise.py @@ -14,61 +14,31 @@ from openturns.viewer import View import numpy as np import matplotlib.pyplot as plt +from openturns.usecases import linthurst +# %% +# We define the data. 
+# + +# %% +ds = linthurst.Linthurst() +dimension = ds.data.getDimension() - 1 + +input_sample = ds.data[:, 1: dimension + 1] +print("Input :") +print(input_sample[:5]) +output_sample = ds.data[:, 0] +print("Output :") +print(output_sample[:5]) + +input_description = input_sample.getDescription() +output_description = output_sample.getDescription() + +n = input_sample.getSize() +print("n = ", n) +dimension = ds.data.getDimension() - 1 +print("dimension = ", dimension) -description = ["BIO", "SAL", "pH", "K", "Na", "Zn"] -data = [ - [676, 33, 5, 1441.67, 35185.5, 16.4524], - [516, 35, 4.75, 1299.19, 28170.4, 13.9852], - [1052, 32, 4.2, 1154.27, 26455, 15.3276], - [868, 30, 4.4, 1045.15, 25072.9, 17.3128], - [1008, 33, 5.55, 521.62, 31664.2, 22.3312], - [436, 33, 5.05, 1273.02, 25491.7, 12.2778], - [544, 36, 4.25, 1346.35, 20877.3, 17.8225], - [680, 30, 4.45, 1253.88, 25621.3, 14.3516], - [640, 38, 4.75, 1242.65, 27587.3, 13.6826], - [492, 30, 4.6, 1281.95, 26511.7, 11.7566], - [984, 30, 4.1, 553.69, 7886.5, 9.882], - [1400, 37, 3.45, 494.74, 14596, 16.6752], - [1276, 33, 3.45, 525.97, 9826.8, 12.373], - [1736, 36, 4.1, 571.14, 11978.4, 9.4058], - [1004, 30, 3.5, 408.64, 10368.6, 14.9302], - [396, 30, 3.25, 646.65, 17307.4, 31.2865], - [352, 27, 3.35, 514.03, 12822, 30.1652], - [328, 29, 3.2, 350.73, 8582.6, 28.5901], - [392, 34, 3.35, 496.29, 12369.5, 19.8795], - [236, 36, 3.3, 580.92, 14731.9, 18.5056], - [392, 30, 3.25, 535.82, 15060.6, 22.1344], - [268, 28, 3.25, 490.34, 11056.3, 28.6101], - [252, 31, 3.2, 552.39, 8118.9, 23.1908], - [236, 31, 3.2, 661.32, 13009.5, 24.6917], - [340, 35, 3.35, 672.15, 15003.7, 22.6758], - [2436, 29, 7.1, 528.65, 10225, 0.3729], - [2216, 35, 7.35, 563.13, 8024.2, 0.2703], - [2096, 35, 7.45, 497.96, 10393, 0.3205], - [1660, 30, 7.45, 458.38, 8711.6, 0.2648], - [2272, 30, 7.4, 498.25, 10239.6, 0.2105], - [824, 26, 4.85, 936.26, 20436, 18.9875], - [1196, 29, 4.6, 894.79, 12519.9, 20.9687], - [1960, 25, 5.2, 941.36, 18979, 23.9841], - [2080, 26, 4.75, 1038.79, 22986.1, 19.9727], - [1764, 26, 5.2, 898.05, 11704.5, 21.3864], - [412, 25, 4.55, 989.87, 17721, 23.7063], - [416, 26, 3.95, 951.28, 16485.2, 30.5589], - [504, 26, 3.7, 939.83, 17101.3, 26.8415], - [492, 27, 3.75, 925.42, 17849, 27.7292], - [636, 27, 4.15, 954.11, 16949.6, 21.5699], - [1756, 24, 5.6, 720.72, 11344.6, 19.6531], - [1232, 27, 5.35, 782.09, 14752.4, 20.3295], - [1400, 26, 5.5, 773.3, 13649.8, 19.588], - [1620, 28, 5.5, 829.26, 14533, 20.1328], - [1560, 28, 5.4, 856.96, 16892.2, 19.242], -] -input_description = ["SAL", "pH", "K", "Na", "Zn"] -output_description = ["BIO"] -sample = ot.Sample(np.array(data)) -dimension = sample.getDimension() - 1 -n = sample.getSize() # %% # Complete linear model @@ -80,11 +50,9 @@ # We start by creating a linear model which takes into account all of the physicochemical variables present within the Linthrust data set. # # Let us consider the following linear model :math:`\tilde{Y} = a_0 + \sum_{i = 1}^{d} a_i X_i + \epsilon`. If all of the predictive variables -# are considered, the regression can be performed with the help of the `LinearModelAlgorithm` class. +# are considered, the regression can be performed with the help of the :class:`~openturns.LinearModelAlgorithm` class. 
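+# The algorithm estimates the coefficients :math:`a_0, a_1, \ldots, a_d` using ordinary least squares.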
# %% -input_sample = sample[:, 1: dimension + 1] -output_sample = sample[:, 0] algo_full = ot.LinearModelAlgorithm(input_sample, output_sample) algo_full.run() result_full = ot.LinearModelResult(algo_full.getResult()) diff --git a/python/doc/examples/meta_modeling/polynomial_chaos_metamodel/plot_chaos_cv.py b/python/doc/examples/meta_modeling/polynomial_chaos_metamodel/plot_chaos_cv.py index a3dc48e9c8..2d00a36a55 100644 --- a/python/doc/examples/meta_modeling/polynomial_chaos_metamodel/plot_chaos_cv.py +++ b/python/doc/examples/meta_modeling/polynomial_chaos_metamodel/plot_chaos_cv.py @@ -1,16 +1,21 @@ """ -Chaos cross-validation ----------------------- +Polynomial chaos expansion cross-validation +=========================================== """ # %% +# Introduction +# ------------ +# # In this example, we show how to perform the cross-validation of the # :ref:`Ishigami model` using polynomial chaos expansion. # More precisely, we use the methods suggested in [muller2016]_, chapter 5, page 257. # We make the assumption that a dataset is given and we create a metamodel using this data. # Once done, we want to assess the quality of the metamodel using the data we have. +# Another example of this method is presented in +# :doc:`/auto_numerical_methods/general_methods/plot_pce_design`. # -# We compare several methods: +# In this example, we compare two methods: # # - split the data into two subsets, one for training and one for testing, # - use k-fold validation. @@ -24,7 +29,6 @@ # It uses the K-Fold method with :math:`k = 5`. # The code uses the :class:`~openturns.KFoldSplitter` class, which computes the appropriate indices. # Similar results can be obtained with :class:`~openturns.LeaveOneOutSplitter` at a higher cost. -# # This cross-validation method is used here to see which polynomial degree # leads to an accurate metamodel of the Ishigami test function. @@ -33,8 +37,17 @@ import openturns.viewer as otv from openturns.usecases import ishigami_function +# %% +# Define helper functions +# ----------------------- + +# %% +# The next function creates a polynomial chaos expansion from +# a training data set and a total degree. # %% + + def compute_sparse_least_squares_chaos( inputTrain, outputTrain, basis, totalDegree, distribution ): @@ -70,20 +83,22 @@ def compute_sparse_least_squares_chaos( projectionStrategy = ot.LeastSquaresStrategy( inputTrain, outputTrain, selectionAlgorithm ) - enumfunc = basis.getEnumerateFunction() - P = enumfunc.getStrataCumulatedCardinal(totalDegree) - adaptiveStrategy = ot.FixedStrategy(basis, P) - chaosalgo = ot.FunctionalChaosAlgorithm( + enumerateFunction = basis.getEnumerateFunction() + basisSize = enumerateFunction.getBasisSizeFromTotalDegree(totalDegree) + adaptiveStrategy = ot.FixedStrategy(basis, basisSize) + chaosAlgo = ot.FunctionalChaosAlgorithm( inputTrain, outputTrain, distribution, adaptiveStrategy, projectionStrategy ) - chaosalgo.run() - result = chaosalgo.getResult() + chaosAlgo.run() + result = chaosAlgo.getResult() return result # %% +# The next function computes the Q2 score by splitting the data set +# into a training set and a test set. - +# %% def compute_Q2_score_by_splitting( X, Y, basis, totalDegree, distribution, split_fraction=0.75 ): @@ -119,8 +134,9 @@ def compute_Q2_score_by_splitting( # %% +# The next function computes the Q2 score by K-Fold. - +# %% def compute_Q2_score_by_kfold(X, Y, basis, totalDegree, distribution, n_folds=5): """ Compute score by KFold. 
@@ -155,6 +171,10 @@ def compute_Q2_score_by_kfold(X, Y, basis, totalDegree, distribution, n_folds=5) return Q2_score +# %% +# Define the training data set +# ---------------------------- + # %% # We start by generating the input and output sample. We use a sample size equal to 1000. @@ -195,6 +215,10 @@ def compute_Q2_score_by_kfold(X, Y, basis, totalDegree, distribution, n_folds=5) # %% metamodel = result.getMetaModel() +# %% +# Validate the metamodel from a test set +# -------------------------------------- + # %% # In order to validate our polynomial chaos, we generate an extra pair of # input / output samples and use the :class:`~openturns.MetaModelValidation` class. @@ -228,6 +252,10 @@ def compute_Q2_score_by_kfold(X, Y, basis, totalDegree, distribution, n_folds=5) # The K-Fold validation aims at solving some of these issues, so that all the # available data is used in order to estimate the :math:`Q^2` score. +# %% +# Compute the Q2 score from a test set +# ------------------------------------ + # %% # In the following script, we compute the :math:`Q^2` score associated with each polynomial degree from 1 to 10. degree_max = 10 @@ -251,6 +279,10 @@ def compute_Q2_score_by_kfold(X, Y, basis, totalDegree, distribution, n_folds=5) # %% # We see that the polynomial degree may be increased up to degree 7, # after which the :math:`Q^2` score does not increase much. + +# %% +# Compute the Q2 score from K-Fold cross-validation +# ------------------------------------------------- # # One limitation of the previous method is that the estimate of the :math:`Q^2` may be sensitive to the particular split of the dataset. # The following script uses 5-Fold cross validation to estimate the :math:`Q^2` score. @@ -272,6 +304,10 @@ def compute_Q2_score_by_kfold(X, Y, basis, totalDegree, distribution, n_folds=5) # %% # The conclusion is similar to the previous method. + +# %% +# Conclusion +# ---------- # # When we select the best polynomial degree which maximizes the :math:`Q^2` score, # the danger is that the validation set is used both for computing the :math:`Q^2` and to maximize it: diff --git a/python/doc/examples/numerical_methods/general_methods/plot_pce_design.py b/python/doc/examples/numerical_methods/general_methods/plot_pce_design.py new file mode 100755 index 0000000000..97e7a9bd17 --- /dev/null +++ b/python/doc/examples/numerical_methods/general_methods/plot_pce_design.py @@ -0,0 +1,445 @@ +""" +Compute leave-one-out error of a polynomial chaos expansion +=========================================================== +""" +# %% +# +# Introduction +# ------------ +# +# In this example, we compute the design matrix of a polynomial chaos +# expansion using the :class:`~openturns.DesignProxy` class. +# Then we compute the analytical leave-one-out error using the +# diagonal of the projection matrix. +# To do this, we use equations from [blatman2009]_ page 85 +# (see also [blatman2011]_). +# In this advanced example, we use the :class:`~openturns.DesignProxy` +# and :class:`~openturns.QRMethod` low level classes. +# A naive implementation of this method is presented in +# :doc:`/auto_meta_modeling/polynomial_chaos_metamodel/plot_chaos_cv` +# using K-Fold cross-validation. + +# %% +# The design matrix +# ----------------- +# In this section, we analyze why the :class:`~openturns.DesignProxy` +# is linked to the classical linear least squares regression problem. +# Let :math:`n` be the number of observations and :math:`m` be the number of coefficients +# of the linear model. 
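+# In the polynomial chaos expansion built below, :math:`m` is the number of functions in the (possibly reduced) basis.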
+# Let :math:`D \in \mathbb{R}^{n \times m}` be the design matrix, i.e.
+# the matrix that produces the predictions of the linear regression model from
+# the coefficients:
+#
+# .. math::
+#
+#     \hat{\vect{y}} = D \vect{a}
+#
+# where :math:`\vect{a} \in \mathbb{R}^m` is the vector of coefficients,
+# :math:`\hat{\vect{y}} \in \mathbb{R}^n` is the
+# vector of predictions.
+# The linear least squares problem is:
+#
+# .. math::
+#
+#     \operatorname{argmin}_{\vect{a} \in \mathbb{R}^m}
+#     \left\| D \vect{a} - \vect{y} \right\|_2^2.
+#
+# The solution is given by the normal equations, i.e. the vector of coefficients
+# is the solution of the following linear system of equations:
+#
+# .. math::
+#
+#     G \vect{a} = D^T \vect{y}
+#
+# where :math:`G \in \Rset^{m \times m}` is the *Gram* matrix:
+#
+# .. math::
+#
+#     G := D^T D.
+#
+# The hat matrix is the projection matrix defined by:
+#
+# .. math::
+#
+#     H := D \left(D^T D\right)^{-1} D^T.
+#
+# The hat matrix puts a hat on the vector of observations to
+# produce the vector of predictions of the linear model:
+#
+# .. math::
+#
+#     \hat{\vect{y}} = H \vect{y}
+#
+# To solve a linear least squares problem, we need to evaluate the
+# design matrix :math:`D`, which is the primary goal of
+# the :class:`~openturns.DesignProxy` class.
+# Let us present some examples of situations where the design matrix
+# is required.
+#
+# - When we use the QR decomposition, we actually do not need to evaluate it in
+#   our script: the :class:`~openturns.QRMethod` class knows how to compute the
+#   solution without evaluating the Gram matrix :math:`D^T D`.
+# - We may need the inverse Gram matrix
+#   :math:`\left(D^T D\right)^{-1}` sometimes, for example when we want to create
+#   a D-optimal design.
+# - Finally, when we want to compute the analytical leave-one-out error,
+#   we need to compute the diagonal of the projection matrix :math:`H`.
+#
+# For all these purposes, the :class:`~openturns.DesignProxy` is *the* tool.
+
+# %%
+# The leave-one-out error
+# -----------------------
+# In this section, we show that the leave-one-out error of a regression problem
+# can be computed using an analytical formula which depends on the hat matrix
+# :math:`H`.
+# We consider the physical model:
+#
+# .. math::
+#
+#     y = g(\vect{x})
+#
+# where :math:`\vect{x} \in \Rset^{n_X}` is the input and :math:`y \in \Rset` is
+# the output.
+# Consider the problem of approximating the physical model :math:`g` by the
+# linear model:
+#
+# .. math::
+#
+#     \hat{y} := \tilde{g}(\vect{x}) = \sum_{k = 1}^m a_k \psi_k(\vect{x})
+#
+# for any :math:`\vect{x} \in \Rset^{n_X}` where :math:`\{\psi_k : \Rset^{n_X} \rightarrow \Rset\}_{k = 1, \ldots, m}` are the basis functions and
+# :math:`\vect{a} \in \Rset^m` is a vector of parameters.
+# The mean squared error is ([blatman2009]_ eq. 4.23 page 83):
+#
+# .. math::
+#
+#     \operatorname{MSE}\left(\tilde{g}\right)
+#     = \mathbb{E}_{\vect{X}}\left[\left(g\left(\vect{X}\right) - \tilde{g}\left(\vect{X}\right) \right)^2 \right]
+#
+# The leave-one-out error is an estimator of the mean squared error.
+# Let:
+#
+# .. math::
+#
+#     \cD = \{\vect{x}^{(1)}, \ldots, \vect{x}^{(n)} \in \Rset^{n_X}\}
+#
+# be independent observations of the input random vector :math:`\vect{X}` and
+# let :math:`\{y^{(1)}, \ldots, y^{(n)} \in \Rset\}` be the corresponding
+# observations of the output of the physical model:
+#
+# .. math::
+#
+#     y^{(j)} = g\left(\vect{x}^{(j)}\right)
+#
+# for :math:`j = 1, ..., n`.
+# Let :math:`\vect{y} \in \Rset^n` be the vector of observations:
+#
+# .. math::
+#
+#     \vect{y} = (y^{(1)}, \ldots, y^{(n)})^T.
+#
+#
+# Consider the following set of inputs, leaving aside the :math:`j`-th input:
+#
+# .. math::
+#
+#     \cD^{(-j)} := \left\{\vect{x}^{(1)}, \ldots, \vect{x}^{(j - 1)}, \vect{x}^{(j + 1)}, \ldots, \vect{x}^{(n)}\right\}
+#
+# for :math:`j \in \{1, ..., n\}`.
+# Let :math:`\vect{y}^{(-j)} \in \Rset^{n - 1}` be the vector of
+# observations, leaving aside the :math:`j`-th observation:
+#
+# .. math::
+#
+#     \vect{y}^{(-j)} = (y^{(1)}, \ldots, y^{(j - 1)}, y^{(j + 1)}, \ldots, y^{(n)})^T
+#
+# for :math:`j \in \{1, ..., n\}`.
+# Let :math:`\tilde{g}^{(-j)}` be the metamodel built on the data set :math:`\left(\cD^{(-j)}, \vect{y}^{(-j)}\right)`.
+# The leave-one-out error is:
+#
+# .. math::
+#
+#     \widehat{\operatorname{MSE}}_{LOO}\left(\tilde{g}\right)
+#     = \frac{1}{n} \sum_{j = 1}^n \left(g\left(\vect{x}^{(j)}\right) - \tilde{g}^{(-j)}\left(\vect{x}^{(j)}\right)\right)^2
+#
+# The leave-one-out error is sometimes referred to as *predicted residual sum of
+# squares* (PRESS) or *jackknife error*.
+# In the next section, we show how this estimator can be computed analytically,
+# using the hat matrix.
+
+# %%
+# The analytical leave-one-out error
+# ----------------------------------
+# One limitation of the previous equation is that we must train
+# :math:`n` different surrogate models, which can be time-consuming in some situations.
+# To overcome this limitation, we can use the following equations.
+# Let :math:`\boldsymbol{\Psi} \in \Rset^{n \times m}` be the design matrix ([blatman2009]_ eq. 4.32 page 85):
+#
+# .. math::
+#
+#     \boldsymbol{\Psi}_{jk} = \psi_k\left(\vect{x}^{(j)}\right)
+#
+# for :math:`j = 1, ..., n` and :math:`k = 1, ..., m`.
+# The matrix :math:`\boldsymbol{\Psi}` is mathematically equal to the
+# :math:`D` matrix presented earlier in the present document.
+# Let :math:`H \in \Rset^{n \times n}` be the projection matrix:
+#
+# .. math::
+#
+#     H = \boldsymbol{\Psi} \left(\boldsymbol{\Psi}^T \boldsymbol{\Psi}\right)^{-1} \boldsymbol{\Psi}^T.
+#
+# It can be proved that ([blatman2009]_ eq. 4.33 page 85):
+#
+# .. math::
+#
+#     \widehat{\operatorname{MSE}}_{LOO}\left(\tilde{g}\right)
+#     = \frac{1}{n} \sum_{j = 1}^n \left(\frac{g\left(\vect{x}^{(j)}\right) - \tilde{g}\left(\vect{x}^{(j)}\right)}{1 - h_{jj}}\right)^2
+#
+# where :math:`h_{jj} \in \Rset` is the :math:`j`-th diagonal term of the hat matrix,
+# for :math:`j \in \{1, ..., n\}`.
+# The goal of this example is to show how to implement the previous equation
+# using the :class:`~openturns.DesignProxy` class.
+
+import openturns as ot
+import openturns.viewer as otv
+import numpy as np
+from openturns.usecases import ishigami_function
+
+
+# %%
+# Create the polynomial chaos model
+# ---------------------------------
+
+# %%
+# We load the Ishigami model.
+im = ishigami_function.IshigamiModel()
+
+# %%
+# Create a training sample.
+
+# %%
+nTrain = 100
+xTrain = im.distributionX.getSample(nTrain)
+yTrain = im.model(xTrain)
+
+# %%
+# Create the polynomial chaos expansion.
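+# The following helper function builds the expansion from a training sample.
+# Its `sparse` option switches between a sparse expansion (obtained with a
+# model selection algorithm, based on LARS by default) and a full expansion
+# estimated by penalized least squares.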
+ + +def ComputeSparseLeastSquaresFunctionalChaos( + inputTrain, + outputTrain, + multivariateBasis, + basisSize, + distribution, + sparse=True, +): + if sparse: + selectionAlgorithm = ot.LeastSquaresMetaModelSelectionFactory() + else: + selectionAlgorithm = ot.PenalizedLeastSquaresAlgorithmFactory() + projectionStrategy = ot.LeastSquaresStrategy( + inputTrain, outputTrain, selectionAlgorithm + ) + adaptiveStrategy = ot.FixedStrategy(multivariateBasis, basisSize) + chaosAlgorithm = ot.FunctionalChaosAlgorithm( + inputTrain, outputTrain, distribution, adaptiveStrategy, projectionStrategy + ) + chaosAlgorithm.run() + chaosResult = chaosAlgorithm.getResult() + return chaosResult + + +# %% +multivariateBasis = ot.OrthogonalProductPolynomialFactory([im.X1, im.X2, im.X3]) +totalDegree = 5 +enumerateFunction = multivariateBasis.getEnumerateFunction() +basisSize = enumerateFunction.getBasisSizeFromTotalDegree(totalDegree) +print("Basis size = ", basisSize) + +sparse = False # For full PCE and comparison with analytical LOO error +chaosResult = ComputeSparseLeastSquaresFunctionalChaos( + xTrain, + yTrain, + multivariateBasis, + basisSize, + im.distributionX, + sparse, +) + +# %% +# The DesignProxy +# --------------- + +# %% +# The :class:`~openturns.DesignProxy` class provides methods used to create the objects necessary to solve +# the least squares problem. +# More precisely, it provides the :meth:`~openturns.DesignProxy.computeDesign` +# method that we need to evaluate the design matrix. +# In many cases we do not need that matrix, but the Gram matrix (or its inverse). +# The :class:`~openturns.DesignProxy` class is needed by a least squares solver, +# e.g. :class:`~openturns.QRMethod` that knows how to actually compute the coefficients. +# +# Another class is the :class:`~openturns.Basis` class which manages a set of +# functions as the functional basis for the decomposition. +# This basis is required by the constructor of the :class:`~openturns.DesignProxy` because it defines +# the columns of the matrix. +# +# In order to create that basis, we use the :meth:`~openturns.FunctionalChaosResult.getReducedBasis` method, +# because the model selection (such as :class:`~openturns.LARS` for example) +# may have selected functions which best predict the output. +# This may reduce the number of coefficients to estimate and +# improve their accuracy. +# This is important here, because it defines the number of +# columns in the design matrix. + +# %% +reducedBasis = chaosResult.getReducedBasis() # As a result of the model selection +transformation = ( + chaosResult.getTransformation() +) # As a result of the input distribution +zTrain = transformation( + xTrain +) # Map from the physical input into the transformed input + +# %% +# We can now create the design. + +# %% +designProxy = ot.DesignProxy(zTrain, reducedBasis) + +# %% +# To actually evaluate the design matrix, we +# can specify the columns that we need to evaluate. +# This can be useful when we perform model selection, because +# not all columns are always needed. +# This can lead to CPU and memory savings. +# In our case, we evaluate all the columns, which corresponds +# to evaluate all the functions in the basis. + +# %% +reducedBasisSize = reducedBasis.getSize() +print("Reduced basis size = ", reducedBasisSize) +allIndices = range(reducedBasisSize) +designMatrix = designProxy.computeDesign(allIndices) +print("Design matrix : ", designMatrix.getNbRows(), " x ", designMatrix.getNbColumns()) + +# %% +# Solve the least squares problem. 
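+# We use the QR decomposition of the design matrix: it avoids forming the
+# Gram matrix explicitly and is numerically more stable than solving the
+# normal equations directly.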
+lsqMethod = ot.QRMethod(designProxy, allIndices) +betaHat = lsqMethod.solve(yTrain.asPoint()) + +# %% +# Compute the inverse of the Gram matrix. + +# %% +inverseGram = lsqMethod.getGramInverse() +print("Inverse Gram : ", inverseGram.getNbRows(), "x", inverseGram.getNbColumns()) + +# %% +# Compute the raw leave-one-out error +# ----------------------------------- +# In this section, we show how to compute the raw leave-one-out +# error using the naive formula. +# To do this, we could use implement the :class:~openturns.KFoldSplitter` class +# with `K = N`. +# Since this would complicate the script and obscure its purpose, +# we implement the leave-one-out method naively. + +# %% +# Compute leave-one-out error +predictionsLOO = ot.Sample(nTrain, 1) +residuals = ot.Point(nTrain) +for j in range(nTrain): + indicesLOO = list(range(nTrain)) + indicesLOO.pop(j) + xTrainLOO = xTrain[indicesLOO] + yTrainLOO = yTrain[indicesLOO] + xj = xTrain[j] + yj = yTrain[j] + + chaosResultLOO = ComputeSparseLeastSquaresFunctionalChaos( + xTrainLOO, + yTrainLOO, + multivariateBasis, + basisSize, + im.distributionX, + sparse, + ) + metamodelLOO = chaosResultLOO.getMetaModel() + predictionsLOO[j] = metamodelLOO(xj) + residuals[j] = (yj - predictionsLOO[j])[0] +mseLOO = residuals.normSquare() / nTrain +print("mseLOO = ", mseLOO) + +# %% +# For each point in the training sample, we plot the predicted leave-one-out +# output prediction depending on the observed output. + +graph = ot.Graph("Leave-one-out validation", "Observation", "LOO prediction", True) +cloud = ot.Cloud(yTrain, predictionsLOO) +graph.add(cloud) +curve = ot.Curve(yTrain, yTrain) +graph.add(curve) +graph.setColors(ot.Drawable().BuildDefaultPalette(2)) +view = otv.View(graph) + +# %% +# In the previous method, we must pay attention to the fact that +# the comparison that we are going to make is not necessarily +# valid if we use the :class:`~openturns.LARS` selection method, +# because this may lead to a different active basis for each leave-one-out +# sample. +# +# One limitation of the previous script is that it can be relatively +# long when the sample size increases or when the size of the +# functional basis increases. +# In the next section, we use the analytical formula: this can leads +# to significant time savings in some cases. + + +# %% +# Compute the analytical leave-one-out error +# ------------------------------------------ + +# %% +# Get the diagonal of the projection matrix. +# This is a :class:`~openturns.Point`. + +# %% +diagonalH = lsqMethod.getHDiag() +print("diagonalH : ", diagonalH.getDimension()) + +# %% +# Compute the metamodel predictions. + +# %% +metamodel = chaosResult.getMetaModel() +yHat = metamodel(xTrain) + +# %% +# Compute the residuals. + +# %% +residuals = yTrain.asPoint() - yHat.asPoint() + +# %% +# Compute the analytical leave-one-out error: +# perform elementwise division and exponentiation + +# %% +delta = np.array(residuals) / (1.0 - np.array(diagonalH)) +squaredDelta = delta**2 +leaveOneOutMSE = ot.Sample.BuildFromPoint(squaredDelta).computeMean()[0] +print("MSE LOO = ", leaveOneOutMSE) +relativeLOOError = leaveOneOutMSE / yTrain.computeVariance()[0] +q2LeaveOneOut = 1.0 - relativeLOOError +print("Q2 LOO = ", q2LeaveOneOut) + +# %% +# We see that the MSE leave-one-out error is equal to the naive LOO error. +# The numerical differences between the two values are the consequences +# of the rounding errors in the numerical evaluation of the hat matrix. 
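+
+# %%
+# As a cross-check, and assuming the training sample is small enough to
+# build the hat matrix explicitly, we can recompute the diagonal of the
+# hat matrix with NumPy and compare it to the `diagonalH` point obtained
+# above with `getHDiag`.
+
+# %%
+D = np.array(ot.Matrix(designMatrix))  # Design matrix as a NumPy array
+H = D @ np.linalg.inv(D.T @ D) @ D.T  # Hat matrix H = D (D^T D)^(-1) D^T
+print("Max. deviation on the diagonal = ", np.max(np.abs(np.diag(H) - np.array(diagonalH))))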
+ +otv.View.ShowAll() diff --git a/python/doc/examples/numerical_methods/general_methods/plot_regression_interval.py b/python/doc/examples/numerical_methods/general_methods/plot_regression_interval.py new file mode 100755 index 0000000000..e4be929e8e --- /dev/null +++ b/python/doc/examples/numerical_methods/general_methods/plot_regression_interval.py @@ -0,0 +1,271 @@ +""" +Compute confidence intervals of a regression model from data +============================================================ +""" + +# %% +# +# Introduction +# ------------ +# +# In this example, we compute the pointwise confidence interval of the +# estimator of the conditional expectation given an input. +# More precisely, we compute confidence intervals of the output of +# a regression model. +# The linear regression model is an order 1 multivariate polynomial. +# This model is built from a data set. +# In this advanced example, we use the :class:`~openturns.DesignProxy` +# and :class:`~openturns.QRMethod` low level classes. +# Another example of this method is presented in +# :doc:`/auto_numerical_methods/general_methods/plot_regression_sinus`. + +# %% + +import openturns as ot +import openturns.viewer as otv +import numpy as np +from openturns.usecases import linthurst + + +# %% + +palette = ot.Drawable.BuildTableauPalette(5) + +ot.RandomGenerator.SetSeed(0) + +# %% +# +# Get the data +# ------------ +# +# We consider the so-called :ref:`Linthurst` data set, which contains measures of +# aerial biomass (BIO) as well as 5 five physico-chemical properties of +# the soil: salinity (SAL), pH, K, Na, and Zn. +# The data set is taken from [rawlings2001]_ table 5.1 page 63. + + +# %% +# We define the data. +# + +# %% +ds = linthurst.Linthurst() + + +# %% +# Get the input and output samples. + +# %% +dimension = ds.data.getDimension() - 1 +print("dimension = ", dimension) +sampleSize = ds.data.getSize() +print("sampleSize = ", sampleSize) +inputSample = ds.data[:, 1: dimension + 1] +print("Input :") +print(inputSample[:5]) +outputSample = ds.data[:, 0] +print("Output :") +print(outputSample[:5]) + +inputDescription = inputSample.getDescription() +outputDescription = outputSample.getDescription() + + +# %% +# We want to create a linear regression model. +# To do this, we need to create a functional basis. +# We make the choice of using only degree 1 polynomials +# for each marginal input variable. + +functionCollection = [] +basisFunction = ot.SymbolicFunction(inputDescription, ["1.0"]) +functionCollection.append(basisFunction) +for i in range(dimension): + basisFunction = ot.SymbolicFunction(inputDescription, [inputDescription[i]]) + functionCollection.append(basisFunction) +basis = ot.Basis(functionCollection) + + +# %% +# We can then print the basis. + +basisSize = basis.getSize() +print("Basis size = ", basisSize) +for i in range(basisSize): + basisFunction = basis.build(i) + print("Function #", i, basisFunction) + + +# %% +# Create the least squares model +# ------------------------------ +# +# To create the least squares model, we use the :class:`~openturns.DesignProxy` class, +# which defines the design matrix of the linear regression model. +# Then we solve the problem using QR decomposition. 
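+# Each column of the design matrix contains the values of one basis
+# function evaluated at the input sample points.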
+ +designProxy = ot.DesignProxy(inputSample, basis) +indices = range(basisSize) # We consider all the functions in the basis +lsqMethod = ot.QRMethod(designProxy, indices) +betaHat = lsqMethod.solve(outputSample.asPoint()) +print(betaHat) + +# %% +# Based on the solution of the linear least squares problem, we +# can create the metamodel and evaluate the residuals. + +metamodel = ot.LinearCombinationFunction(basis, betaHat) + +# %% +# Compute residuals, variance and design matrix +# --------------------------------------------- +# We need to estimate the variance of the residuals. +# To do this we evaluate the predictions of the regression model +# on the training sample and compute the residuals. +yHat = metamodel(inputSample).asPoint() +residuals = yHat - outputSample.asPoint() + +# %% +# In order to compute confidence intervals of the mean, we need +# to estimate the sample standard deviation. + +sigma2Hat = residuals.normSquare() / (sampleSize - basisSize) +print("sigma2Hat", sigma2Hat) +sigmaHat = np.sqrt(sigma2Hat) +print("sigmaHat = ", sigmaHat) + +# %% +# We evaluate the design matrix and store it as a :class:`~openturns.Sample`. + +designMatrix = lsqMethod.computeWeightedDesign() +designSample = ot.Sample(np.array(ot.Matrix(designMatrix))) + + +# %% +# Compute confidence intervals +# ---------------------------- +# +# The next function will help to compute confidence intervals. +# It is based on regression analysis. + + +def computeRegressionConfidenceInterval( + lsqMethod, + betaHat, + sigma2Hat, + designSample, + alpha=0.95, + mean=True, + epsilonSigma=1.0e-5, +): + """ + Compute a confidence interval for the estimate of the mean. + + Evaluates this confidence interval at points in the design matrix. + + This code is based on (Rawlings, Pantula & David, 1998) + eq. 3.51 and 3.52 page 90. + + Parameters + ---------- + lsqMethod: ot.LeastSquaresMethod + The linear least squares method (e.g. QR or Cholesky). + betaHat : ot.Point(parameterDimension) + The solution of the least squares problem. + sigma2Hat : float > 0.0 + The estimate of the variance. + designSample : ot.Sample(size, parameterDimension) + The design matrix of the linear model. + This is the value of the functional basis depending on the + input sample. + Each row represents the input where the confidence interval + is to be computed. + alpha : float, in [0, 1] + The width of the confidence interval. + mean : bool + If True, then computes the confidence interval of the mean. + This interval contains yTrue = E[y|x] with probability alpha. + Otherwise, computes a confidence interval of the prediction at point x. + This interval contains y|x with probability alpha. + epsilonSigma : float > 0.0 + A relatively small number. The minimal value of variance, which + avoids a singular Normal distribution. + + Reference + --------- + - O. Rawlings John, G. Pantula Sastry, and A. Dickey David. + Applied regression analysis—a research tool. Springer New York, 1998. + + Returns + ------- + confidenceBounds : ot.Sample(size, 2) + The first column contains the lower bound. + The second column contains the upper bound. + """ + + inverseGram = lsqMethod.getGramInverse() + sampleSize = designSample.getSize() + confidenceBounds = ot.Sample(sampleSize, 2) + for i in range(sampleSize): + x0 = designSample[i, :] + meanYHat = x0.dot(betaHat) + sigma2YHat = x0.dot(inverseGram * x0) * sigma2Hat + if not mean: + sigma2YHat += sigma2Hat + sigmaYHat = np.sqrt(sigma2YHat) + sigmaYHat = max(epsilonSigma, sigmaYHat) # Prevents a zero s.e. 
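+        # The Normal distribution approximates the exact Student-t interval;
+        # the approximation is accurate when the residual degrees of freedom are large.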
+ distribution = ot.Normal(meanYHat, sigmaYHat) + interval = distribution.computeBilateralConfidenceInterval(alpha) + lower = interval.getLowerBound() + upper = interval.getUpperBound() + confidenceBounds[i, 0] = lower[0] + confidenceBounds[i, 1] = upper[0] + return confidenceBounds + + +# %% +# We evaluate the value of the basis functions on the test sample. +# This sample is used to compute the confidence interval. + +alpha = 0.95 +confidenceIntervalMean = computeRegressionConfidenceInterval( + lsqMethod, betaHat, sigma2Hat, designSample, alpha=alpha +) + + +# %% +# For a given observation, we can print the input, the observed +# output, the predicted output and the confidence interval of +# the conditional expectation. + +i = 5 # The index of the observation +print("x = ", inputSample[i]) +print("Y observation = ", outputSample[i]) +print("Y prediction = ", yHat[i]) +print("Confidence interval of the mean = ", confidenceIntervalMean[i]) + + +# %% +# In order to see how the model fits to the data, we can +# create the validation plot. +# Each vertical bar represents the 95% confidence interval +# of the estimate of the conditional expectation of the linear regression model. + +validation = ot.MetaModelValidation(inputSample, outputSample, metamodel) +graph = validation.drawValidation().getGraph(0, 0) +q2Score = validation.computePredictivityFactor()[0] +graph.setTitle("Q2 = %.2f%%" % (100.0 * q2Score)) +graph.setXTitle("Observations") +graph.setYTitle("Metamodel") +for i in range(sampleSize): + curve = ot.Curve([outputSample[i, 0]] * 2, confidenceIntervalMean[i]) + graph.add(curve) +view = otv.View(graph) + +# %% +# We see that the linear regression model is not a very accurate +# metamodel, as can be seen from the relatively low Q2 score. +# The metamodel predictions are not very close to observations, +# which is why the points are not close to the diagonal of the plot. +# Hence, the confidence intervals do not cross the diagonal very often. +# In this case, we may want to create a more accurate metamodel. diff --git a/python/doc/examples/numerical_methods/general_methods/plot_regression_sinus.py b/python/doc/examples/numerical_methods/general_methods/plot_regression_sinus.py new file mode 100755 index 0000000000..57a8a17c34 --- /dev/null +++ b/python/doc/examples/numerical_methods/general_methods/plot_regression_sinus.py @@ -0,0 +1,436 @@ +""" +Compute confidence intervals of a univariate noisy function +=========================================================== +""" + +# %% +# +# Introduction +# ------------ +# +# In this example, we compute the pointwise confidence interval of the +# estimator of the conditional expectation given an input. +# We consider noisy observations of the sine function. +# Then we perform linear least squares regression to fit an order 4 +# polynomial. +# For a given point x, the code computes the confidence interval +# of the prediction y. +# This is the confidence interval of the conditional expectation +# given the input. +# Secondly, we compute the confidence interval of the noisy output observations. +# In this advanced example, we use the :class:`~openturns.QRMethod` low level class. +# Another example of this method is presented in +# :doc:`/auto_numerical_methods/general_methods/plot_regression_interval`. 
+ +# %% + +import openturns as ot +import openturns.viewer as otv +import numpy as np + + +palette = ot.Drawable.BuildTableauPalette(5) + +ot.RandomGenerator.SetSeed(0) + + +# %% +# +# Compute the data +# ---------------- +# +# We generate noisy observations from the sine function. +# We define the function that we are going to approximate. + + +# %% +g = ot.SymbolicFunction(["x"], ["sin(2 * pi_ * x)"]) + +# %% +# We plot the function depending on the input. + + +def plotFunction(g, color, lineStyle="dotted"): + curve = g.draw(0.0, 1.0).getDrawable(0) + curve.setColor(color) + curve.setLineStyle("dotted") + curve.setLegend("True") + return curve + + +graph = ot.Graph("Polynomial curve fitting", "x", "y", True, "topright") +# The "unknown" function +graph.add(plotFunction(g, palette[0])) +view = otv.View(graph) + +# %% +# This is a nice, smooth function to approximate with polynomials. +# + + +def linearSample(xmin, xmax, npoints): + """Returns a sample created from a regular grid + from xmin to xmax with npoints points.""" + step = (xmax - xmin) / (npoints - 1) + rg = ot.RegularGrid(xmin, step, npoints) + vertices = rg.getVertices() + return vertices + + +# %% +# We consider observation points in the interval [0,1]. + +nTrain = 100 +xTrain = linearSample(0, 1, nTrain) + + +# %% +# Assume that the observations are noisy and that the noise follows +# a Normal distribution with zero mean and small standard deviation. + +noise = ot.Normal(0, 0.5) +noiseSample = noise.getSample(nTrain) + +# %% +# The following code computes the observation as the sum of the +# function value and of the noise. +# The couple `(xTrain,yTrain)` is the training set: it is used +# to compute the coefficients of the polynomial model. + +yTrain = g(xTrain) + noiseSample +print(yTrain[:5]) + + +# %% +# Then we plot the function and the observations. + + +def plotData(xTrain, yTrain, color, pointStyle="circle"): + cloud = ot.Cloud(xTrain, yTrain) + cloud.setPointStyle(pointStyle) + cloud.setLegend("Observations") + cloud.setColor(palette[1]) + return cloud + + +graph = ot.Graph("Polynomial curve fitting", "x", "y", True, "topright") +# The "unknown" function +graph.add(plotFunction(g, palette[0])) +# Training set +graph.add(plotData(xTrain, yTrain, palette[1])) +view = otv.View(graph) + +# %% +# We see that the noisy observations of the function are relatively +# large compared to the function values. +# It may not be obvious that a regression model can fit that data well. + +# %% +# Compute the coefficients of the polynomial decomposition +# -------------------------------------------------------- +# + +# %% +# In order to approximate the function with polynomials up to degree 4, +# we create a list of strings containing the associated monomials. +# We perform the loop from 0 up to `totalDegree` (but the `range` +# function takes `totalDegree + 1` as its second input argument). + +totalDegree = 4 +polynomialCollection = [f"x^{degree}" for degree in range(0, totalDegree + 1)] +print(polynomialCollection) + + +# %% +# Given the list of strings, we create a symbolic function which computes the +# values of the monomials. + +basisFunction = ot.SymbolicFunction(["x"], polynomialCollection) +print(basisFunction) + + +# %% +# Evaluate the design matrix as the value of the basis functions on the +# training sample. + +designSampleTrain = basisFunction(xTrain) +print(designSampleTrain[:5]) + + +# %% +# Convert the design sample into a design matrix and create +# an instance of the :class:`~openturns.QRMethod` class. 
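+# (The design matrix has one row per observation and one column per monomial.)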
+# This class has the :meth:`~openturns.QRMethod.getGramInverse` method that +# we need to compute the confidence interval. + +designMatrixTrain = ot.Matrix(designSampleTrain) +lsqMethod = ot.QRMethod(designMatrixTrain) + +# %% +# Solve the linear least squares problem and get the vector of coefficients. + +betaHat = lsqMethod.solve(yTrain.asPoint()) +print(betaHat) + +# %% +# Compute residuals and variance +# ------------------------------ +# +# We need to estimate the variance of the residuals. +# To do this we evaluate the predictions of the regression model on +# the training sample and compute the residuals. + +yHatTrain = designMatrixTrain * betaHat +residuals = yHatTrain - yTrain.asPoint() +sampleSize = designMatrixTrain.getNbRows() +print("sampleSize=", sampleSize) +nParameters = designMatrixTrain.getNbColumns() +print("nParameters = ", nParameters) +sigma2Hat = residuals.normSquare() / (sampleSize - nParameters) +print("sigma2Hat = ", sigma2Hat) + + +# %% +# The couple `(xTest,yHatTest)` is the set where we want to evaluate the +# prediction confidence intervals. +# In order to evaluate the predictions from the regression model, multiply +# the design matrix evaluated on the test sample with the vector of coefficients. + +nTest = 50 +xTest = linearSample(0, 1, nTest) +designMatrixTest = ot.Matrix(basisFunction(xTest)) +yHatTest = ot.Sample.BuildFromPoint(designMatrixTest * betaHat) + + +# %% +# Then we plot the true function, its noisy observations and the least +# squares model of degree 4. + + +def plotPredictions(xTest, yHatTest, totalDegree, color): + curve = ot.Curve(xTest, yHatTest) + curve.setLegend(f"L.S. degree {totalDegree}") + curve.setColor(color) + return curve + + +graph = ot.Graph("Polynomial curve fitting", "x", "y", True, "topright") +# The "unknown" function +graph.add(plotFunction(g, palette[0])) +# Training set +graph.add(plotData(xTrain, yTrain, palette[1])) +# Predictions +graph.add(plotPredictions(xTest, yHatTest, totalDegree, palette[2])) +view = otv.View(graph) + +# %% +# We see that the least squares polynomial model +# is relatively close to the true function. + + +# %% +# Compute confidence intervals +# ---------------------------- +# +# The next function will help to compute confidence intervals. +# It is based on regression analysis. + + +def computeRegressionConfidenceInterval( + lsqMethod, + betaHat, + sigma2Hat, + designSample, + alpha=0.95, + mean=True, + epsilonSigma=1.0e-5, +): + """ + Compute a confidence interval for the estimate of the mean. + + Evaluates this confidence interval at points in the design matrix. + + This code is based on (Rawlings, Pantula & David, 1998) + eq. 3.51 and 3.52 page 90. + + Parameters + ---------- + lsqMethod: ot.LeastSquaresMethod + The linear least squares method (e.g. QR or Cholesky). + betaHat : ot.Point(parameterDimension) + The solution of the least squares problem. + sigma2Hat : float > 0.0 + The estimate of the variance. + designSample : ot.Sample(size, parameterDimension) + The design matrix of the linear model. + This is the value of the functional basis depending on the + input sample. + Each row represents the input where the confidence interval + is to be computed. + alpha : float, in [0, 1] + The width of the confidence interval. + mean : bool + If True, then computes the confidence interval of the mean. + This interval contains yTrue = E[y|x] with probability alpha. + Otherwise, computes a confidence interval of the prediction at point x. + This interval contains y|x with probability alpha. 
+ epsilonSigma : float > 0.0 + A relatively small number. The minimal value of variance, which + avoids a singular Normal distribution. + + Reference + --------- + - O. Rawlings John, G. Pantula Sastry, and A. Dickey David. + Applied regression analysis—a research tool. Springer New York, 1998. + + Returns + ------- + confidenceBounds : ot.Sample(size, 2) + The first column contains the lower bound. + The second column contains the upper bound. + """ + + inverseGram = lsqMethod.getGramInverse() + sampleSize = designSample.getSize() + confidenceBounds = ot.Sample(sampleSize, 2) + for i in range(sampleSize): + x0 = designSample[i, :] + meanYHat = x0.dot(betaHat) + sigma2YHat = x0.dot(inverseGram * x0) * sigma2Hat + if not mean: + sigma2YHat += sigma2Hat + sigmaYHat = np.sqrt(sigma2YHat) + sigmaYHat = max(epsilonSigma, sigmaYHat) # Prevents a zero s.e. + distribution = ot.Normal(meanYHat, sigmaYHat) + interval = distribution.computeBilateralConfidenceInterval(alpha) + lower = interval.getLowerBound() + upper = interval.getUpperBound() + confidenceBounds[i, 0] = lower[0] + confidenceBounds[i, 1] = upper[0] + return confidenceBounds + + +# %% +# We evaluate the value of the basis functions on the test sample. +# This sample is used to compute the confidence interval. + +# %% +designSampleTest = basisFunction(xTest) + +# %% +# Compute the confidence interval. + +# %% +alpha = 0.95 +confidenceIntervalMean = computeRegressionConfidenceInterval( + lsqMethod, betaHat, sigma2Hat, designSampleTest, alpha=alpha +) + +# %% +# On output, the `confidenceIntervalMean` variable is a :class:`~openturns.Sample` +# of size 50 and dimension 2. + +# %% +print(confidenceIntervalMean.getSize()) + +# %% +# Plot the confidence interval (C.I.) of the pointwise estimator +# of the conditional expectation. + + +# %% +def plotConfidenceInterval( + xTest, confidenceIntervalSample, color, lineStyle="dashed", label="" +): + graph = ot.Graph() + curve = ot.Curve(xTest, confidenceIntervalSample[:, 0]) + curve.setLegend(label) + curve.setColor(color) + curve.setLineStyle(lineStyle) + graph.add(curve) + curve = ot.Curve(xTest, confidenceIntervalSample[:, 1]) + curve.setLegend("") + curve.setColor(color) + curve.setLineStyle(lineStyle) + graph.add(curve) + return graph + + +graph = ot.Graph("Polynomial curve fitting", "x", "y", True, "topright") +# The "unknown" function +graph.add(plotFunction(g, palette[0])) +# Training set +graph.add(plotData(xTrain, yTrain, palette[1])) +# Predictions +graph.add(plotPredictions(xTest, yHatTest, totalDegree, palette[2])) +# Confidence interval of the mean +graph.add( + plotConfidenceInterval( + xTest, + confidenceIntervalMean, + palette[3], + label="Mean %.0f%%" % (100.0 * alpha), + ) +) +view = otv.View(graph) + +# %% +# We see that the pointwise confidence interval contains the true +# model for most points. +# For a small set of points, there are points which are not within +# the bounds, but are not too far away. +# The observations, however, are not contained within these bounds. +# This is the goal of the next cell. + + +# %% +# Finally, compute a 95% C.I. of the observations. + +alpha = 0.95 +confidenceIntervalObservations = computeRegressionConfidenceInterval( + lsqMethod, + betaHat, + sigma2Hat, + designSampleTest, + alpha=alpha, + mean=False, +) + + +# %% +# Then we plot the function, its least squares approximation, the +# C.I. of the mean and the C.I. of the observations. 
+ +# sphinx_gallery_thumbnail_number = 5 +graph = ot.Graph("Polynomial curve fitting", "x", "y", True, "topright") +# The "unknown" function +graph.add(plotFunction(g, palette[0])) +# Training set +graph.add(plotData(xTrain, yTrain, palette[1])) +# Predictions +graph.add(plotPredictions(xTest, yHatTest, totalDegree, palette[2])) +# Confidence interval of the mean +graph.add( + plotConfidenceInterval( + xTest, + confidenceIntervalMean, + palette[3], + label="Mean %.0f%%" % (100.0 * alpha), + ) +) +# Confidence interval of the observations. +graph.add( + plotConfidenceInterval( + xTest, + confidenceIntervalObservations, + palette[4], + label="Obs. %.0f%%" % (100.0 * alpha), + ) +) +view = otv.View(graph) + +# %% +# We see that the confidence interval of the observations contain +# most of the observations. +# The confidence interval of the observations is much larger than the +# C.I. of the mean, as expected from the statistical model. diff --git a/python/doc/usecases/use_case_linthurst.rst b/python/doc/usecases/use_case_linthurst.rst new file mode 100644 index 0000000000..62de3a39b9 --- /dev/null +++ b/python/doc/usecases/use_case_linthurst.rst @@ -0,0 +1,31 @@ +.. _use-case-linthurst: + +The Linthurst data set +====================== + +Introduction +------------- + +We consider the Linthurst data set, which contains measures of +aerial biomass (BIO) as well as 5 five physicochemical properties of +the soil: salinity (SAL), pH, K, Na, and Zn. +The data set is taken from [rawlings2001]_ table 5.1 page 63. + +References +---------- + +* [rawlings2001]_ + +API documentation +----------------- + +.. currentmodule:: openturns.usecases.linthurst + +.. autoclass:: Linthurst + :noindex: + +Examples based on this use case +------------------------------- + +.. minigallery:: openturns.usecases.linthurst.Linthurst + diff --git a/python/doc/usecases/usecases.rst b/python/doc/usecases/usecases.rst index 786839d3e1..a1a1a5e6bc 100644 --- a/python/doc/usecases/usecases.rst +++ b/python/doc/usecases/usecases.rst @@ -25,4 +25,4 @@ Contents use_case_fireSatellite use_case_wingweight coles - + use_case_linthurst diff --git a/python/src/usecases/linthurst.py b/python/src/usecases/linthurst.py new file mode 100644 index 0000000000..805bb9deaa --- /dev/null +++ b/python/src/usecases/linthurst.py @@ -0,0 +1,79 @@ +""" +Use case : Linthurst data set +============================= +""" +import openturns as ot + + +class Linthurst: + """ + Data class for the Linthurst data set. + + We consider the Linthurst data set, which contains measures of + aerial biomass (BIO) as well as 5 five physicochemical properties of + the soil: salinity (SAL), pH, K, Na, and Zn. + The data set is taken from [rawlings2001]_ table 5.1 page 63. + + Attributes + ---------- + + data : :class:`~openturns.Sample` + The data set. 
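+        The sample has size 45 and dimension 6: the first column is the
+        output (BIO) and the next five columns are the inputs (SAL, pH, K,
+        Na and Zn).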
+ + Examples + -------- + >>> from openturns.usecases import linthurst + >>> ds = linthurst.Linthurst() + >>> print(ds.data[:5]) + """ + + def __init__(self): + data = [ + [676, 33, 5, 1441.67, 35185.5, 16.4524], + [516, 35, 4.75, 1299.19, 28170.4, 13.9852], + [1052, 32, 4.2, 1154.27, 26455, 15.3276], + [868, 30, 4.4, 1045.15, 25072.9, 17.3128], + [1008, 33, 5.55, 521.62, 31664.2, 22.3312], + [436, 33, 5.05, 1273.02, 25491.7, 12.2778], + [544, 36, 4.25, 1346.35, 20877.3, 17.8225], + [680, 30, 4.45, 1253.88, 25621.3, 14.3516], + [640, 38, 4.75, 1242.65, 27587.3, 13.6826], + [492, 30, 4.6, 1281.95, 26511.7, 11.7566], + [984, 30, 4.1, 553.69, 7886.5, 9.882], + [1400, 37, 3.45, 494.74, 14596, 16.6752], + [1276, 33, 3.45, 525.97, 9826.8, 12.373], + [1736, 36, 4.1, 571.14, 11978.4, 9.4058], + [1004, 30, 3.5, 408.64, 10368.6, 14.9302], + [396, 30, 3.25, 646.65, 17307.4, 31.2865], + [352, 27, 3.35, 514.03, 12822, 30.1652], + [328, 29, 3.2, 350.73, 8582.6, 28.5901], + [392, 34, 3.35, 496.29, 12369.5, 19.8795], + [236, 36, 3.3, 580.92, 14731.9, 18.5056], + [392, 30, 3.25, 535.82, 15060.6, 22.1344], + [268, 28, 3.25, 490.34, 11056.3, 28.6101], + [252, 31, 3.2, 552.39, 8118.9, 23.1908], + [236, 31, 3.2, 661.32, 13009.5, 24.6917], + [340, 35, 3.35, 672.15, 15003.7, 22.6758], + [2436, 29, 7.1, 528.65, 10225, 0.3729], + [2216, 35, 7.35, 563.13, 8024.2, 0.2703], + [2096, 35, 7.45, 497.96, 10393, 0.3205], + [1660, 30, 7.45, 458.38, 8711.6, 0.2648], + [2272, 30, 7.4, 498.25, 10239.6, 0.2105], + [824, 26, 4.85, 936.26, 20436, 18.9875], + [1196, 29, 4.6, 894.79, 12519.9, 20.9687], + [1960, 25, 5.2, 941.36, 18979, 23.9841], + [2080, 26, 4.75, 1038.79, 22986.1, 19.9727], + [1764, 26, 5.2, 898.05, 11704.5, 21.3864], + [412, 25, 4.55, 989.87, 17721, 23.7063], + [416, 26, 3.95, 951.28, 16485.2, 30.5589], + [504, 26, 3.7, 939.83, 17101.3, 26.8415], + [492, 27, 3.75, 925.42, 17849, 27.7292], + [636, 27, 4.15, 954.11, 16949.6, 21.5699], + [1756, 24, 5.6, 720.72, 11344.6, 19.6531], + [1232, 27, 5.35, 782.09, 14752.4, 20.3295], + [1400, 26, 5.5, 773.3, 13649.8, 19.588], + [1620, 28, 5.5, 829.26, 14533, 20.1328], + [1560, 28, 5.4, 856.96, 16892.2, 19.242], + ] + self.data = ot.Sample(data) + self.data.setDescription(["BIO", "SAL", "pH", "K", "Na", "Zn"])