\documentclass[10pt]{article}
\usepackage[utf8]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{hyperref}
\hypersetup{colorlinks=true, linkcolor=blue, filecolor=magenta, urlcolor=cyan,}
\urlstyle{same}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{amssymb}
\usepackage[version=4]{mhchem}
\usepackage{stmaryrd}
\usepackage{bbold}
\usepackage{graphicx}
\usepackage[export]{adjustbox}
\graphicspath{ {./images/} }
\graphicspath{{/home/chaztikov/git/IBPINN/unc-dissertation/notes}{/home/chaztikov/git/IBPINN/unc-dissertation/notes/ref/ERROR ESTIMATES FOR PHYSICS INFORMED NEURAL2203.09346/2022_12_07_6eb6e893739c47f2dcc9g/images/}}
%1c9bbb3d-8efa-4b13-b788-5cb0e1a584ed/fosls_examples/cylinder_2d/multirun/multirun/results/}}
\title{ERROR ESTIMATES FOR PHYSICS INFORMED NEURAL NETWORKS APPROXIMATING THE NAVIER-STOKES EQUATIONS }
\author{T. DE RYCK, AMEYA D. JAGTAP, AND S. MISHRA}
\date{}
\begin{document}
\maketitle
\begin{abstract}
We prove rigorous bounds on the errors resulting from the approximation of the incompressible Navier-Stokes equations with (extended) physics informed neural networks. We show that the underlying PDE residual can be made arbitrarily small for tanh neural networks with two hidden layers. Moreover, the total error can be estimated in terms of the training error, network size and number of quadrature points. The theory is illustrated with numerical experiments.
\end{abstract}
\section{INTRODUCTION}
Deep learning has been very successfully deployed in a variety of fields including computer vision, natural language processing, game intelligence, robotics, augmented reality and autonomous systems; see [22] and the references therein. In recent years, deep learning has also been increasingly used in various contexts in scientific computing, such as protein folding and controlled nuclear fusion.
As deep neural networks are universal function approximators, it is also natural to use them as ansatz spaces for the solutions of (partial) differential equations (PDEs). In fact, the literature on the use of deep learning for the numerical approximation of PDEs has witnessed exponential growth in the last 2-3 years. Prominent examples for the use of deep learning in PDEs include the deep neural network approximation of high-dimensional semi-linear parabolic PDEs [9], linear elliptic PDEs [37, 18] and nonlinear hyperbolic PDEs [25, 26], and the references therein. More recently, DNN-inspired architectures such as DeepONets [5, 24, 21] and Fourier neural operators [23, 17] have been shown to learn even infinite-dimensional operators, associated with the underlying PDEs, efficiently.
Another extremely popular avenue for the use of machine learning in the numerical approximation of PDEs is the area of physics informed neural networks (PINNs). First proposed in slightly different forms in the 90s [8, 20, 19], PINNs were resurrected recently in [34, 35] as a practical and computationally efficient paradigm for solving both forward and inverse problems for PDEs. Since then, there has been an explosive growth in designing and applying PINNs for a variety of applications involving PDEs. A very incomplete list of references includes [36, 28, 33, 45, 12, 13, 14, 16, 29, 30, 31, 2, 40, 15, 11, 41] and references therein.
On the other hand, and in stark contrast to the widespread applications of PINNs, there has been a pronounced scarcity of papers that rigorously justify why PINNs work. Notable exceptions include [38], where the authors show consistency of PINNs with the underlying linear elliptic and parabolic PDEs under stringent assumptions, and [39], where similar estimates are derived for linear advection equations. In [29, 30], the authors proposed a strategy for deriving error estimates for PINNs. To describe this strategy and highlight the underlying theoretical issues, it is imperative to introduce PINNs and we do so in an informal manner here (see section 2 for the formal definitions). To this end, we consider the following very general form of an abstract PDE,
$$
\begin{aligned}
\mathcal{D}[u](x, t) & =0, \quad \mathcal{B} u(y, t)=\psi(y, t), \\
u(x, 0) & =\varphi(x), \quad \text { for } x \in D, y \in \partial D, t \in[0, T]
\end{aligned}
$$
Here, $D \subset \mathbb{R}^{d}$ is compact and $\mathcal{D}, \mathcal{B}$ are the differential and boundary operators, $u: D \times[0, T] \rightarrow \mathbb{R}^{m}$ is the solution of the $\mathrm{PDE}, \psi: \partial D \times[0, T] \rightarrow \mathbb{R}^{m}$ specifies the (spatial) boundary condition and $\varphi: D \rightarrow \mathbb{R}^{m}$ is the initial condition.
We seek deep neural networks $u_{\theta}: D \times[0, T] \rightarrow \mathbb{R}^{m}$ (see (2.5) for a definition), parameterized by $\theta \in \Theta$, constituting the weights and biases, that approximate the solution $u$ of (1.1). The key idea behind PINNs is to consider pointwise residuals, defined for any sufficiently smooth function $f: D \times[0, T] \rightarrow \mathbb{R}^{m}$ as,
$$
\begin{aligned}
\mathcal{R}_{i}[f](x, t) & =\mathcal{D}[f](x, t), \quad \mathcal{R}_{s}[f](y, t)=\mathcal{B} f(y, t)-\psi(y, t), \\
\mathcal{R}_{t}[f](x) & =f(x, 0)-\varphi(x), \quad x \in D, y \in \partial D, t \in[0, T]
\end{aligned}
$$
for $x \in D, y \in \partial D, t \in[0, T]$. Using these residuals, one measures how well a function $f$ satisfies resp. the PDE, the boundary condition and the initial condition of (1.1). Note that for the exact solution $\mathcal{R}_{i}[u]=\mathcal{R}_{s}[u]=\mathcal{R}_{t}[u]=0$.
Hence, within the PINNs algorithm, one seeks to find a neural network $u_{\theta}$, for which all residuals are simultaneously minimized, e.g. by minimizing the quantity,
$$
\begin{aligned}
\mathcal{E}_{G}(\theta)^{2} & =\int_{D \times[0, T]}\left|\mathcal{R}_{i}\left[u_{\theta}\right](x, t)\right|^{2} d x d t+\int_{\partial D \times[0, T]}\left|\mathcal{R}_{s}\left[u_{\theta}\right](x, t)\right|^{2} d s(x) d t \\
& +\int_{D}\left|\mathcal{R}_{t}\left[u_{\theta}\right](x)\right|^{2} d x
\end{aligned}
$$
However, the quantity $\mathcal{E}_{G}(\theta)$, often referred to as the population risk or generalization error [29] of the neural network $u_{\theta}$, involves integrals and can therefore not be directly minimized in practice. Instead, the integrals in (1.3) are approximated by a suitable numerical quadrature (see section 2.3 for details), resulting in,
$$
\begin{aligned}
\mathcal{E}_{T}^{i}\left(\theta, \mathcal{S}_{i}\right)^{2} & =\sum_{n=1}^{N_{i}} w_{i}^{n}\left|\mathcal{R}_{i}\left[u_{\theta}\right]\left(x_{i}^{n}, t_{i}^{n}\right)\right|^{2}, \\
\mathcal{E}_{T}^{s}\left(\theta, \mathcal{S}_{s}\right)^{2} & =\sum_{n=1}^{N_{s}} w_{s}^{n}\left|\mathcal{R}_{s}\left[u_{\theta}\right]\left(x_{s}^{n}, t_{s}^{n}\right)\right|^{2}, \quad \mathcal{E}_{T}^{t}\left(\theta, \mathcal{S}_{t}\right)^{2}=\sum_{n=1}^{N_{t}} w_{t}^{n}\left|\mathcal{R}_{t}\left[u_{\theta}\right]\left(x_{t}^{n}\right)\right|^{2} \\
\mathcal{E}_{T}(\theta, \mathcal{S})^{2} & =\mathcal{E}_{T}^{i}\left(\theta, \mathcal{S}_{i}\right)^{2}+\mathcal{E}_{T}^{s}\left(\theta, \mathcal{S}_{s}\right)^{2}+\mathcal{E}_{T}^{t}\left(\theta, \mathcal{S}_{t}\right)^{2}
\end{aligned}
$$
with quadrature points in space-time constituting data sets $\mathcal{S}_{i}=\left\{\left(x_{i}^{n}, t_{i}^{n}\right)\right\}_{n=1}^{N_{i}}$, $\mathcal{S}_{s}=\left\{\left(x_{s}^{n}, t_{s}^{n}\right)\right\}_{n=1}^{N_{s}}$ and $\mathcal{S}_{t}=\left\{x_{t}^{n}\right\}_{n=1}^{N_{t}}$, and $w_{q}^{n}$ are suitable quadrature weights for $q=i, t, s$.
Thus, the underlying essence of PINNs is to minimize the training error $\mathcal{E}_{T}(\theta, \mathcal{S})^{2}$ over the neural network parameters $\theta$. This procedure immediately raises the following key theoretical questions (see also [7]), starting with

Q1. Given a tolerance $\varepsilon>0$, do there exist neural networks $\hat{u}=u_{\hat{\theta}}, \widetilde{u}=u_{\widetilde{\theta}}$, parametrized by $\hat{\theta}, \widetilde{\theta} \in \Theta$, such that the corresponding generalization error $\mathcal{E}_{G}(\hat{\theta})$ (1.3) and training error $\mathcal{E}_{T}(\widetilde{\theta}, \mathcal{S})$ (1.4) are small, i.e., $\mathcal{E}_{G}(\hat{\theta}), \mathcal{E}_{T}(\widetilde{\theta}, \mathcal{S})<\varepsilon$? As the aim of the PINNs algorithm is to minimize the training error (and indirectly the generalization error), an affirmative answer to this question is of vital importance, as it ensures that the loss (PDE residual) being minimized can be made small. However, minimizing the PDE residual does not necessarily imply that the overall error (the difference between the exact solution of the PDE (1.1) and its PINN approximation) is small. This leads to the second key question,
Q2. Given a PINN $\hat{u}$ with small generalization error, is the corresponding total error $\|u-\hat{u}\|$ small, i.e., is $\|u-\hat{u}\|<\delta(\varepsilon)$ for some $\delta(\varepsilon) \sim \mathcal{O}(\varepsilon)$, in some suitable norm $\|\cdot\|$, with $u$ being the solution of the PDE (1.1)?
An affirmative answer to Q2 (and Q1) certifies that, in principle, there exists a (physics informed) neural network, corresponding to the parameter $\hat{\theta}$, such that the PDE residual and, consequently, the overall error in approximating the solution of the PDE (1.1), are small. However, in practice, we minimize the training error $\mathcal{E}_{T}$ (1.4) and this leads to another key question,
Q3. Given a small training error $\mathcal{E}_{T}\left(\theta^{*}, \mathcal{S}\right)$ and a sufficiently large training set $\mathcal{S}$, is the corresponding generalization error $\mathcal{E}_{G}\left(\theta^{*}\right)$ also proportionately small?
An affirmative answer to question Q3, together with question Q2, will imply that the trained PINN $u_{\theta^{*}}$ is an accurate approximation of the solution $u$ of the underlying PDE (1.1). Thus, answering the above three questions affirmatively will constitute a comprehensive theoretical investigation of PINNs and provide a rationale for their very successful empirical performance.
Given this context, we examine how far the literature has come in answering these key questions on the theory for PINNs. In [29, 30], the authors leverage the stability of solutions of the underlying PDE (1.1) to bound the total error in terms of the generalization error (question Q2). Similarly, they use the accuracy of quadrature rules to bound the generalization error in terms of the training error (question Q3). This approach is implemented for forward problems corresponding to a variety of PDEs such as the semi-linear and quasi-linear parabolic equations and the incompressible Euler and Navier-Stokes equations [29], radiative transfer equations [31], nonlinear dispersive PDEs such as the KdV equations [2], and for the unique continuation (data assimilation) inverse problem for many linear elliptic, parabolic and hyperbolic PDEs [30]. However, Q1 was not answered in these papers. Moreover, the authors imposed rather stringent assumptions on the weights and biases of the trained PINN, which may not hold in practice.
In [7], the authors answered the key questions Q1, Q2 and Q3 in the case of a large class of linear parabolic PDEs, namely the Kolmogorov PDEs, which include the heat equation and the Black-Scholes equation of option pricing as special examples. Thus, they provided a rigorous and comprehensive error analysis of PINNs for these PDEs. Moreover, they also showed that PINNs overcome the curse of dimensionality in the context of very high-dimensional Kolmogorov equations.
The authors of [7] utilized the linearity of the underlying Kolmogorov equations heavily in their analysis. It is natural to ask if analogous error estimates can be shown for PINN approximations of nonlinear PDEs. This consideration sets the stage for the current paper, where we carry out a thorough error analysis for PINNs approximating a prototypical nonlinear PDE and answer Q1, Q2 and Q3 affirmatively. The nonlinear PDE that we consider is the incompressible Navier-Stokes equations, the fundamental mathematical model governing the flow of incompressible Newtonian fluids [42].
We show the following results on the PINN approximation of the incompressible Navier-Stokes equations:
\begin{itemize}
\item We show that there exist neural networks that approximate the classical solutions of the Navier-Stokes equations such that the PINN generalization error (1.3) and the PINN training error (1.4) can be made arbitrarily small. Moreover, we provide explicit bounds on the number of neurons as well as the weights of the network in terms of the error tolerance and Sobolev norms of the underlying solution of the Navier-Stokes equations. This analysis is also extended to the XPINN approximation [12] of the Navier-Stokes equations, answering Q1 affirmatively for both PINNs and XPINNs.
\item We bound the total error of the PINN (and XPINN) approximation of the Navier-Stokes equations in terms of the PDE residual (the generalization error (1.3)). Consequently, a small PDE residual implies a small total error, answering Q2 affirmatively.
\item We bound the generalization error (1.3) in terms of the training error (1.4) and the number of quadrature points using a midpoint quadrature rule. This affirmatively answers question Q3 and establishes that a small training error and a sufficient number of quadrature points suffice to yield a small total error for the PINN (and XPINN) approximation of the Navier-Stokes equations.
\item We present numerical experiments to illustrate our theoretical results.
\end{itemize}
The rest of the paper is organized as follows: In section 2, we collect preliminary information on the Navier-Stokes equations and neural networks and present the PINN and XPINN algorithms. The error analysis is carried out in section 3 and numerical experiments are presented in section 4.
\section{PRELIMINARIES}
In this section, we collect preliminary information on concepts used in the rest of the paper. We start with the form of the Navier-Stokes equations.
\subsection{The incompressible Navier-Stokes equations}
We consider the well-known incompressible Navier-Stokes equations (see [42] and the references therein),
\begin{eqnarray*}
\begin{cases}u_{t}+u \cdot \nabla u+\nabla p=\nu \Delta u & \text { in } D \times[0, T] \\ \operatorname{div}(u)=0 & \text { in } D \times[0, T] \\ u(t=0)=u_{0} & \text { in } D .\end{cases}
\end{eqnarray*}
where $u: D \times[0, T] \rightarrow \mathbb{R}^{d}$ is the fluid velocity, $p: D \times[0, T] \rightarrow \mathbb{R}$ is the pressure and $u_{0}: D \rightarrow \mathbb{R}^{d}$ is the initial fluid velocity.
The viscosity is denoted by $\nu \geq 0$. For the rest of the paper, we consider the Navier-Stokes equations (2.1) on the $d$-dimensional torus $D=\mathbb{T}^{d}=[0,1)^{d}$ with periodic boundary conditions.
The existence and regularity of the solution to (2.1) depends on the regularity of $u_{0}$, as is stated by the following well-known theorem [27, Theorem 3.4]. Other regularity results with different boundary conditions can be found in e.g. [42].
Theorem 2.1.
If $u_{0} \in H^{r}\left(\mathbb{T}^{d}\right)$ with $r>\frac{d}{2}+2$ and $\operatorname{div}\left(u_{0}\right)=0$, then there exist $T>0$ and a classical solution $u$ to the Navier-Stokes equation such that $u(t=0)=u_{0}$ and $u \in C\left([0, T] ; H^{r}\left(\mathbb{T}^{d}\right)\right) \cap C^{1}\left([0, T] ; H^{r-2}\left(\mathbb{T}^{d}\right)\right)$.
Based on this result, we prove that $u$ is Sobolev regular, i.e., $u \in H^{k}(D \times[0, T])$ for some $k \in \mathbb{N}$, provided that $r$ is sufficiently large.
Corollary 2.2.
If $k \in \mathbb{N}$ and $u_{0} \in H^{r}\left(\mathbb{T}^{d}\right)$ with $r>\frac{d}{2}+2 k$ and $\operatorname{div}\left(u_{0}\right)=0$, then there exist $T>0$ and a classical solution $u$ to the Navier-Stokes equation such that
\begin{eqnarray}
u \in H^{k}\left(\mathbb{T}^{d} \times[0, T]\right) \\
\nabla p \in H^{k-1}\left(\mathbb{T}^{d} \times[0, T]\right) \\ u(t=0)=u_{0}
\end{eqnarray}
% Proof. The corollary follows directly from Theorem $2.1$ for $k=1$. Therefore, let $k \geq 2$ be arbitary and assume that $r>\frac{d}{2}+2 k$ and $u_{0} \in H^{r}\left(\mathbb{T}^{d}\right)$ and $\operatorname{div}\left(u_{0}\right)=0$. By Theorem 2.1 there exists $T>0$ and a classical solution $u$ to the Navier-Stokes equation such that $u(t=0)=u_{0}$ and $u \in C\left([0, T] ; H^{r}\left(\mathbb{T}^{d}\right)\right) \cap C^{1}\left([0, T] ; H^{r-2}\left(\mathbb{T}^{d}\right)\right)$.
% Following [27, Section 1.8], we find that the pressure $p$ satisfies the equation
% \begin{eqnarray}
% -\Delta p=\operatorname{Trace}\left((\nabla u)^{2}\right)=\sum_{i, j} u_{x_{j}}^{i} u_{x_{i}}^{j} .
% \end{eqnarray}
% As $r>\frac{d}{2}+1, H^{r-1}\left(\mathbb{T}^{d}\right)$ is a Banach algebra (see Lemma A.1), it holds that $\Delta p \in C\left([0, T] ; H^{r-1}\left(\mathbb{T}^{d}\right)\right)$ and accordingly $\nabla p \in C\left([0, T] ; H^{r}\left(\mathbb{T}^{d}\right)\right)$. Since $u \in$ $C^{1}\left([0, T] ; H^{r-2}\right)$, we can take the time derivative of equation $2.2$ to find that $\Delta p_{t} \in C\left([0, T] ; H^{r-3}\right)$, since the conditions for $H^{r-3}\left(\mathbb{T}^{d}\right)$ to be a Banach algebra are met. As a result we find that $\nabla p_{t} \in C\left([0, T] ; H^{r-2}\right)$. Taking the time derivative of the Navier-Stokes equations $2.1$, we find that $u_{t t} \in C\left([0, T] ; H^{r-4}\right)$ and therefore $u \in C^{2}\left([0, T] ; H^{r-4}\right)$. Repeating these steps, one can prove that $u \in \cap_{\ell=0}^{k} C^{\ell}\left([0, T] ; H^{r-2 \ell}\left(\mathbb{T}^{d}\right)\right)$. The statement of the corollary then follows from this observation since $\ell+r-2 \ell \geq k$ for all $0 \leq \ell \leq k$ if $r>\frac{d}{2}+2 k$. Similarly, one can prove that $\nabla p \in \cap_{\ell=0}^{k-1} C^{\ell}\left([0, T] ; H^{r-2 \ell}\left(\mathbb{T}^{d}\right)\right)$.
\subsection{Neural networks}
As our objective is to approximate the solution of the incompressible Navier-Stokes equations (2.1) with neural networks, we here formally introduce our definition of a neural network and the related terminology.
Definition 2.3.
Let $R \in(0, \infty], L, W \in \mathbb{N}$ and $l_{0}, \ldots, l_{L} \in \mathbb{N}$. Let $\sigma: \mathbb{R} \rightarrow \mathbb{R}$ be a twice differentiable activation function and define
$$
\Theta=\Theta_{L, W, R}:=\bigcup_{L^{\prime} \in \mathbb{N}, L^{\prime} \leq L} \; \bigcup_{l_{0}, \ldots, l_{L^{\prime}} \in\{1, \ldots, W\}} \; \mathop{\times}_{k=1}^{L^{\prime}}\left([-R, R]^{l_{k} \times l_{k-1}} \times[-R, R]^{l_{k}}\right)
$$
For $\theta \in \Theta_{L, W, R}$, we define $\theta_{k}:=\left(\mathcal{W}_{k}, b_{k}\right)$ and $\mathcal{A}_{k}^{\theta}: \mathbb{R}^{l_{k-1}} \rightarrow \mathbb{R}^{l_{k}}: x \mapsto \mathcal{W}_{k} x+b_{k}$ for $1 \leq k \leq L$, and we define $f_{k}^{\theta}: \mathbb{R}^{l_{k-1}} \rightarrow \mathbb{R}^{l_{k}}$ by
$$
f_{k}^{\theta}(z)= \begin{cases}\mathcal{A}_{L}^{\theta}(z) & k=L \\ \left(\sigma \circ \mathcal{A}_{k}^{\theta}\right)(z) & 1 \leq k<L\end{cases}
$$
We denote by $u_{\theta}: \mathbb{R}^{l_{0}} \rightarrow \mathbb{R}^{l_{L}}$ the function that satisfies for all $z \in \mathbb{R}^{l_{0}}$ that
$$
u_{\theta}(z)=\left(f_{L}^{\theta} \circ f_{L-1}^{\theta} \circ \cdots \circ f_{1}^{\theta}\right)(z),
$$
where in the setting of approximating the Navier-Stokes equation (2.1) we set $l_{0}=$ $d+1$ and $z=(x, t)$. We refer to $u_{\theta}$ as the realization of the neural network associated to the parameter $\theta$ with $L$ layers and widths $\left(l_{0}, l_{1}, \ldots, l_{L}\right)$. We refer to the first $L-1$ layers as hidden layers. For $1 \leq k \leq L$, we say that layer $k$ has width $l_{k}$ and we refer to $\mathcal{W}_{k}$ and $b_{k}$ as the weights and biases corresponding to layer $k$. The width of $u_{\theta}$ is defined as $\max \left(l_{0}, \ldots, l_{L}\right)$. If $L=2$, we say that $u_{\theta}$ is a shallow neural network; if $L \geq 3$, we say that $u_{\theta}$ is a deep neural network.
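To make the composition $u_{\theta}=f_{L}^{\theta} \circ \cdots \circ f_{1}^{\theta}$ concrete, the following short sketch (in Python with NumPy, chosen here purely for illustration; the widths and random parameters are arbitrary and not part of the analysis) evaluates the realization $u_{\theta}$ of Definition 2.3 for $\sigma=\tanh$ with two hidden layers.
\begin{verbatim}
import numpy as np

def realize(theta, z):
    # Evaluate u_theta(z) = (f_L o ... o f_1)(z) as in Definition 2.3.
    # theta: list of (W_k, b_k) with W_k of shape (l_k, l_{k-1}), b_k of shape (l_k,).
    # z: input of shape (l_0,), e.g. z = (x, t) with l_0 = d + 1.
    L = len(theta)
    for k, (W, b) in enumerate(theta, start=1):
        z = W @ z + b              # affine map A_k(z) = W_k z + b_k
        if k < L:                  # hidden layers apply the activation sigma
            z = np.tanh(z)
    return z                       # the output layer is purely affine

# Illustration: d = 2 (so l_0 = 3), two hidden layers of width 16, scalar output.
rng = np.random.default_rng(0)
widths = [3, 16, 16, 1]
theta = [(rng.standard_normal((widths[k + 1], widths[k])), np.zeros(widths[k + 1]))
         for k in range(len(widths) - 1)]
print(realize(theta, np.array([0.3, 0.7, 0.1])))
\end{verbatim}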
\subsection{Quadrature rules}
In the following sections, we will need to approximate integrals of functions. For this reason, we introduce some notation and recall well-known results on numerical quadrature rules.
Given $\Lambda \subset \mathbb{R}^{d}$ and $f \in L^{1}(\Lambda)$, we will be interested in approximating $\int_{\Lambda} f(y) d y$, with $d y$ denoting the $d$-dimensional Lebesgue measure. A numerical quadrature rule provides such an approximation by choosing quadrature points $y_{m} \in \Lambda$ and quadrature weights $w_{m}>0$ for $1 \leq m \leq M$, and considering the approximation
$$
\frac{1}{M} \sum_{m=1}^{M} w_{m} f\left(y_{m}\right) \approx \int_{\Lambda} f(y) d y .
$$
The accuracy of this approximation depends on the chosen quadrature rule, the number of quadrature points $M$ and the regularity of $f$. Whereas in very high dimensions random training points or low-discrepancy training points [32] are needed, the relatively low-dimensional setting of the Navier-Stokes equations, i.e., $d \leq 4$, allows the use of standard deterministic numerical quadrature points. In order to obtain explicit rates, we will focus on the midpoint rule, but our analysis also holds for general deterministic numerical quadrature rules.
We briefly recall the midpoint rule. For $N \in \mathbb{N}$, we partition $\Lambda$ into $M \sim N^{d}$ cubes of edge length $1 / N$ and we denote by $\left\{y_{m}\right\}_{m=1}^{M}$ the midpoints of these cubes. The formula and accuracy of the midpoint rule $\mathcal{Q}_{M}^{\Lambda}$ are then given by,
$$
\mathcal{Q}_{M}^{\Lambda}[f]:=\frac{1}{M} \sum_{m=1}^{M} f\left(y_{m}\right), \quad\left|\int_{\Lambda} f(y) d y-\mathcal{Q}_{M}^{\Lambda}[f]\right| \leq C_{f} M^{-2 / d},
$$
where $C_{f} \lesssim\|f\|_{C^{2}}$.
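As a brief illustration of this error bound, the following sketch (Python with NumPy; the integrand is an arbitrary smooth test function, not taken from the paper) evaluates $\mathcal{Q}_{M}^{\Lambda}$ on $\Lambda=[0,1]^{2}$ and exhibits the expected $\mathcal{O}\left(M^{-2 / d}\right)=\mathcal{O}\left(M^{-1}\right)$ decay of the quadrature error.
\begin{verbatim}
import numpy as np

def midpoint_rule(f, d, N):
    # Midpoint rule Q_M on [0,1]^d with M = N^d cubes of edge length 1/N.
    pts = (np.arange(N) + 0.5) / N                      # midpoints per coordinate
    grid = np.stack(np.meshgrid(*([pts] * d), indexing="ij"), axis=-1)
    y = grid.reshape(-1, d)                             # the M = N^d midpoints y_m
    return np.mean(f(y))                                # (1/M) * sum_m f(y_m)

# Smooth test integrand on [0,1]^2; its exact integral is (1 - cos(1))^2.
f = lambda y: np.sin(y[:, 0]) * np.sin(y[:, 1])
exact = (1.0 - np.cos(1.0)) ** 2
for N in (4, 8, 16, 32):
    err = abs(midpoint_rule(f, 2, N) - exact)
    print(N ** 2, err)                                  # error decays roughly like M^{-1}
\end{verbatim}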
\subsection{Physics informed neural networks (PINNs)}
We seek deep neural networks $u_{\theta}: D \times[0, T] \rightarrow \mathbb{R}^{d}$ and $p_{\theta}: D \times[0, T] \rightarrow \mathbb{R}$ (cf. Definition 2.3), parameterized by $\theta \in \Theta$, constituting the weights and biases, that approximate the solution $u$ of (2.1). To this end, the key idea behind PINNs is to consider pointwise residuals, defined in the setting of the Navier-Stokes equations (2.1) for any sufficiently smooth $v: D \times[0, T] \rightarrow \mathbb{R}^{d}$ and $q: D \times[0, T] \rightarrow \mathbb{R}$ as
$$
\begin{aligned}
\mathcal{R}_{\mathrm{PDE}}[(v, q)](x, t) & =\left(v_{t}+v \cdot \nabla v+\nabla q-\nu \Delta v\right)(x, t), \\
\mathcal{R}_{\mathrm{div}}[v](x, t) & =\operatorname{div}(v)(x, t) \\
\mathcal{R}_{s}[v](y, t) & =\mathcal{B} v(y, t)-\psi(y, t), \\
\mathcal{R}_{t}[v](x) & =v(x, 0)-\varphi(x)
\end{aligned}
$$
for $x \in D, y \in \partial D, t \in[0, T]$. In the above, $\mathcal{B}$ is the boundary operator, $\psi: \partial D \times[0, T] \rightarrow \mathbb{R}^{d}$ specifies the (spatial) boundary condition and $\varphi: D \rightarrow \mathbb{R}^{d}$ is the initial condition. Using these residuals, one measures how well a pair of functions $(v, q)$ satisfies, respectively, the PDE, the boundary condition and the initial condition of (2.1). Note that for the exact solution to the Navier-Stokes equations (2.1) it holds that $\mathcal{R}_{\mathrm{PDE}}[(u, p)]=\mathcal{R}_{\operatorname{div}}[u]=\mathcal{R}_{s}[u]=\mathcal{R}_{t}[u]=0$.
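To illustrate how these residuals are evaluated in practice, the following sketch computes $\mathcal{R}_{\mathrm{PDE}}$ and $\mathcal{R}_{\mathrm{div}}$ at a batch of collocation points with automatic differentiation. It is written in Python with PyTorch purely for concreteness (the paper does not prescribe an implementation), and \texttt{u\_net} and \texttt{p\_net} stand for arbitrary candidate networks $(v, q)$.
\begin{verbatim}
import torch

def ns_residuals(u_net, p_net, x, t, nu):
    # Pointwise residuals R_PDE and R_div of (2.8) for candidates
    # u_net: R^{d+1} -> R^d and p_net: R^{d+1} -> R, at points x: (n, d), t: (n, 1).
    x = x.requires_grad_(True)
    t = t.requires_grad_(True)
    z = torch.cat([x, t], dim=1)
    u, p = u_net(z), p_net(z)
    d = x.shape[1]
    # Per-sample gradients via the sum trick (each output row depends only on its row).
    grad = lambda f, wrt: torch.autograd.grad(f.sum(), wrt, create_graph=True)[0]
    grad_p = grad(p, x)                              # spatial gradient of the pressure
    r_pde, div = [], 0.0
    for j in range(d):
        uj = u[:, j:j + 1]
        uj_t = grad(uj, t)                           # time derivative of u_j
        uj_x = grad(uj, x)                           # spatial gradient of u_j
        lap = sum(grad(uj_x[:, i:i + 1], x)[:, i:i + 1] for i in range(d))
        conv = (u * uj_x).sum(dim=1, keepdim=True)   # (u . grad) u_j
        r_pde.append(uj_t + conv + grad_p[:, j:j + 1] - nu * lap)
        div = div + uj_x[:, j:j + 1]                 # accumulate div(u)
    return torch.cat(r_pde, dim=1), div
\end{verbatim}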
Hence, within the PINNs algorithm, one seeks to find a neural network $\left(u_{\theta}, p_{\theta}\right)$, for which all residuals are simultaneously minimized, e.g. by minimizing the quantity
$$
\begin{aligned}
\mathcal{E}_{G}(\theta)^{2}= & \int_{D \times[0, T]}\left\|\mathcal{R}_{\operatorname{PDE}}\left[\left(u_{\theta}, p_{\theta}\right)\right](x, t)\right\|_{\mathbb{R}^{d}}^{2} d x d t+\int_{D \times[0, T]}\left|\mathcal{R}_{\operatorname{div}}\left[u_{\theta}\right](x, t)\right|^{2} d x d t \\
& +\int_{\partial D \times[0, T]}\left\|\mathcal{R}_{s}\left[u_{\theta}\right](x, t)\right\|_{\mathbb{R}^{d}}^{2} d s(x) d t+\int_{D}\left\|\mathcal{R}_{t}\left[u_{\theta}\right](x)\right\|_{\mathbb{R}^{d}}^{2} d x
\end{aligned}
$$
The different terms of (2.9) are often rescaled using some weights. For simplicity, we set all these weights to one. The quantity $\mathcal{E}_{G}(\theta)$, often referred to as the population risk or generalization error of the neural network $u_{\theta}$, involves integrals and can therefore not be directly minimized in practice. Instead, the integrals in (2.9) are approximated by a numerical quadrature, as introduced in Section 2.3. As a result, we define the (squared) training loss for PINNs $\theta \mapsto \mathcal{E}_{T}(\theta, \mathcal{S})^{2}$ as follows,
$$
\begin{aligned}
\mathcal{E}_{T}(\theta, \mathcal{S})^{2}= & \mathcal{E}_{T}^{\mathrm{PDE}}\left(\theta, \mathcal{S}_{\mathrm{int}}\right)^{2}+\mathcal{E}_{T}^{\mathrm{div}}\left(\theta, \mathcal{S}_{\mathrm{int}}\right)^{2}+\mathcal{E}_{T}^{s}\left(\theta, \mathcal{S}_{s}\right)^{2}+\mathcal{E}_{T}^{t}\left(\theta, \mathcal{S}_{t}\right)^{2} \\
= & \sum_{n=1}^{N_{\mathrm{int}}} w_{\mathrm{int}}^{n}\left\|\mathcal{R}_{\mathrm{PDE}}\left[\left(u_{\theta}, p_{\theta}\right)\right]\left(t_{\mathrm{int}}^{n}, x_{\mathrm{int}}^{n}\right)\right\|_{\mathbb{R}^{d}}^{2}+\sum_{n=1}^{N_{\mathrm{int}}} w_{\mathrm{int}}^{n}\left|\mathcal{R}_{\mathrm{div}}\left[u_{\theta}\right]\left(t_{\mathrm{int}}^{n}, x_{\mathrm{int}}^{n}\right)\right|^{2} \\
& +\sum_{n=1}^{N_{s}} w_{s}^{n}\left\|\mathcal{R}_{s}\left[u_{\theta}\right]\left(t_{s}^{n}, x_{s}^{n}\right)\right\|_{\mathbb{R}^{d}}^{2}+\sum_{n=1}^{N_{t}} w_{t}^{n}\left\|\mathcal{R}_{t}\left[u_{\theta}\right]\left(x_{t}^{n}\right)\right\|_{\mathbb{R}^{d}}^{2}
\end{aligned}
$$
where the training data set $\mathcal{S}=\left(\mathcal{S}_{\text {int }}, \mathcal{S}_{s}, \mathcal{S}_{t}\right)$ is chosen as quadrature points with respect to the relevant domain (resp. $D \times[0, T], \partial D \times[0, T]$ and $D$ ) and where the $w_{*}^{n}$ are corresponding quadrature weights.
A trained PINN $u^{*}=u_{\theta^{*}}$ is then defined as a (local) minimum of the optimization problem,
$$
\theta^{*}(\mathcal{S})=\arg \min _{\theta \in \Theta} \mathcal{E}_{T}(\theta, \mathcal{S})^{2},
$$
with loss function (2.10) (possibly with additional data and weight regularization terms), found by a (stochastic) gradient descent algorithm such as ADAM or LBFGS.
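A possible implementation of the training loss (2.10) with midpoint quadrature (so that all weights equal $1/M$) is sketched below. The code assumes the hypothetical \texttt{ns\_residuals} helper from the previous sketch as well as user-supplied collocation sets; the ADAM step in the final comments indicates how the minimization over $\theta$ would then be carried out.
\begin{verbatim}
import torch

def training_loss(u_net, p_net, s_int, s_s, s_t, u0, nu):
    # s_int = (x, t) interior points, s_s = (x, t) boundary points on the faces x_i = 0,
    # s_t = x initial-time points; u0 maps x to the initial velocity u_0(x).
    x_int, t_int = s_int
    r_pde, r_div = ns_residuals(u_net, p_net, x_int, t_int, nu)
    loss = (r_pde ** 2).sum(dim=1).mean() + (r_div ** 2).mean()
    # Periodicity residual u_theta(x, t) - u_theta(x + 1, t) at the boundary points.
    x_s, t_s = s_s
    loss = loss + ((u_net(torch.cat([x_s, t_s], dim=1))
                    - u_net(torch.cat([x_s + 1.0, t_s], dim=1))) ** 2).sum(dim=1).mean()
    # Initial-condition residual u_theta(x, 0) - u_0(x).
    x_t = s_t
    z0 = torch.cat([x_t, torch.zeros(x_t.shape[0], 1)], dim=1)
    loss = loss + ((u_net(z0) - u0(x_t)) ** 2).sum(dim=1).mean()
    return loss

# opt = torch.optim.Adam(list(u_net.parameters()) + list(p_net.parameters()), lr=1e-3)
# opt.zero_grad(); training_loss(u_net, p_net, S_int, S_s, S_t, u0, nu).backward(); opt.step()
\end{verbatim}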
\subsection{Extended physics informed neural networks (XPINNs)}
In many applications, it happens that the computational domain has a very complicated shape or that the PDE solution shows localized features. In such cases, it is beneficial to decompose the computational domain into non-overlapping regions and deploy different neural networks to approximate the PDE solution in the different sub-regions. This idea was first presented in [12], where the authors proposed to decompose the domain into $\mathcal{N}$ closed subdomains $\Omega_{q}$ with non-overlapping interiors and to deploy PINNs $u_{\theta_{q}}$ to approximate the exact solution $u$ in each of those subdomains. Patching together the PINNs for all the subnetworks yields the final approximation $u_{\theta}$, termed extended physics informed neural network (XPINN), defined as,
$$
u_{\theta}(z)=\sum_{q=1}^{\mathcal{N}} \chi_{q}(z) u_{\theta_{q}}(z),
$$
for $z=(x, t)$ and where the weight function $\chi_{q}$ is given by,
$$
\chi_{q}(z)= \begin{cases}0 & z \notin \Omega_{q}, \\ \frac{1}{\#\left\{n: z \in \Omega_{n}\right\}} & z \in \Omega_{q},\end{cases}
$$
where $\#\left\{n: z \in \Omega_{n}\right\}$ denotes the number of subdomains to which $z$ belongs. Hence $\sum_{q} \chi_{q}(z)=1$ for all $z$. One can define neural networks $p_{\theta}$ and $p_{\theta_{q}}$ in an analogous way. It is clear that minimizing the standard PINN loss (2.10) for an XPINN (2.12) would not be a suitable approach; additional terms, in the form of interface conditions, need to be added to the loss function. For this purpose, we define for every $q$ the following residual in addition to the standard PINN residuals (2.8),
$$
\mathcal{R}_{u}[f](y, t)=f(y, t)-u_{\theta}(y, t),
$$
where $y \in \partial \Omega_{q} \backslash \partial D$ and $t \in[0, T]$. The squared generalization error of an XPINN $u_{\theta}$ is then given by
$$
\begin{aligned}
\mathcal{E}_{G}(\theta)^{2}= & \int_{D \times[0, T]}\left\|\mathcal{R}_{\operatorname{PDE}}\left[\left(u_{\theta}, p_{\theta}\right)\right](x, t)\right\|_{\mathbb{R}^{d}}^{2} d x d t+\int_{D \times[0, T]}\left|\mathcal{R}_{\operatorname{div}}\left[u_{\theta}\right](x, t)\right|^{2} d x d t \\
& +\int_{\partial D \times[0, T]}\left\|\mathcal{R}_{s}\left[u_{\theta}\right](x, t)\right\|_{\mathbb{R}^{d}}^{2} d s(x) d t+\int_{D}\left\|\mathcal{R}_{t}\left[u_{\theta}\right](x)\right\|_{\mathbb{R}^{d}}^{2} d x \\
& +\sum_{q=1}^{\mathcal{N}} \int_{\left(\partial \Omega_{q} \backslash \partial D\right) \times[0, T]}\left\|\mathcal{R}_{u}\left[u_{\theta_{q}}\right](x, t)\right\|_{\mathbb{R}^{d}}^{2} d s(x) d t \\
& +\sum_{q=1}^{\mathcal{N}} \int_{\left(\partial \Omega_{q} \backslash \partial D\right) \times[0, T]}\left\|\mathcal{R}_{\operatorname{PDE}}\left[\left(u_{\theta_{q}}, p_{\theta_{q}}\right)\right](x, t)-\mathcal{R}_{\operatorname{PDE}}\left[\left(u_{\theta}, p_{\theta}\right)\right](x, t)\right\|_{\mathbb{R}^{d}}^{2} d s(x) d t
\end{aligned}
$$
The interface conditions on the last two lines of (2.15) enforce the continuity, and possibly even higher regularity, of the XPINN at the interface of neighbouring subdomains.

The XPINN training loss can then be defined by replacing the integrals in (2.15) by numerical quadratures, in the same way the standard PINN training loss (2.10) was derived from (2.9).
\section{ERROR ANALYSIS}
In this section, we will obtain rigorous bounds on the PINN and XPINN approximations of the solutions of the incompressible Navier-Stokes equations. We start with bounds on the PINN residuals below.
\subsection{Bound on the PINN residuals}
From the definition of the interior PINN residuals (2.8), it is clear that if we can find a neural network $\hat{u}$ such that $\|u-\hat{u}\|_{H^{2}(D \times[0, T])}$ is small, then the interior PINN residual will be small as well.
The approximation (in Sobolev norm) of Sobolev-regular functions by tanh neural networks is discussed in Appendix B. The main ingredients are a piecewise polynomial approximation, the existence of which is guaranteed by the Bramble-Hilbert lemma, and the ability of tanh neural networks to efficiently approximate polynomials, the multiplication operator and an approximate partition of unity. The main result of Appendix B is Theorem B.7, which is a variant of [6, Theorem 5.1]. It proves that a tanh neural network with two hidden layers suffices to make $\|u-\hat{u}\|_{H^{2}(D \times[0, T])}$ arbitrarily small and provides explicit bounds on the needed network width. Using this theorem, we can prove the following upper bound on the PINN residual.
Theorem 3.1.
Let $d, r, k \in \mathbb{N}$, with $k \geq 3$, and let $u_{0} \in H^{r}\left(\mathbb{T}^{d}\right)$ with $r>\frac{d}{2}+2 k$ and $\operatorname{div}\left(u_{0}\right)=0$. It holds that:
\begin{itemize}
\item there exist $T>0$ and a classical solution $u$ to the Navier-Stokes equations such that $u \in H^{k}(\Omega), \nabla p \in H^{k-1}(\Omega), \Omega=\mathbb{T}^{d} \times[0, T]$, and $u(t=0)=u_{0}$,
\item for every $N \in \mathbb{N}$, there exist tanh neural networks $\hat{u}_{j}, 1 \leq j \leq d$, and $\widehat{p}$, each with two hidden layers, of widths $3\left\lceil\frac{k}{2}\right\rceil\left(\begin{array}{c}d+k-1 \\ d\end{array}\right)+\lceil T N\rceil+d N$ and $3 d\left(\begin{array}{c}2 d+1 \\ d\end{array}\right)\lceil T N\rceil N^{d}$, such that for every $1 \leq j \leq d$
\end{itemize}
\begin{eqnarray}
\left\|\left(\hat{u}_{j}\right)_{t}+\hat{u} \cdot \nabla \hat{u}_{j}+(\nabla \hat{p})_{j}-\nu \Delta \hat{u}_{j}\right\|_{L^{2}(\Omega)} & \leq C_{1} \ln ^{2}(\beta N) N^{-k+2}, \\
\|\operatorname{div}(\hat{u})\|_{L^{2}(\Omega)} & \leq C_{2} \ln (\beta N) N^{-k+1} \\
\left\|\left(u_{0}\right)_{j}-\hat{u}_{j}(t=0)\right\|_{L^{2}\left(\mathbb{T}^{d}\right)} & \leq C_{3} \ln (\beta N) N^{-k+1}
\end{eqnarray}
where the constants $\beta, C_{1}, C_{2}, C_{3}$ are explicitly defined in the proof and can depend on $k, d, T, u$ and $p$ but not on $N$.
The weights of the networks can be bounded by $\mathcal{O}\left(N^{(d+k)^{2}}\right)$.
Proof. Let $N>5$. By Corollary $2.2$ it holds that $u \in H^{k}\left(\mathbb{T}^{d} \times[0, T]\right)$ and $\nabla p \in$ $H^{k-1}\left(\mathbb{T}^{d} \times[0, T]\right)$, hence also $p \in H^{k-1}\left(\mathbb{T}^{d} \times[0, T]\right)$. As a result of Theorem B.7 there then exists for every $1 \leq j \leq d$ a tanh neural network $\hat{u}_{j}:=\hat{u}_{j}^{N}$ with two hidden layers and widths $3\left\lceil\frac{k}{2}\right\rceil\left(\begin{array}{c}d+k-1 \\ d\end{array}\right)+\lceil T N\rceil+d N$ and $3 d\left(\begin{array}{c}2 d+1 \\ d\end{array}\right)\lceil T N\rceil N^{d}$ such that for every $0 \leq \ell \leq 2$
$$
\left\|u_{j}-\hat{u}_{j}\right\|_{H^{\ell}(\Omega)} \leq C_{\ell, k, d+1, u_{j}} \lambda_{\ell}(N) N^{-k+\ell},
$$
where $\lambda_{\ell}(N)=2^{\ell+1} 3^{d}(1+\delta) \ln ^{\ell}\left(\beta_{\ell, d+1, u_{j}} N^{d+k+2}\right), \delta=\frac{1}{100}$, and the definition of the other constants can be found in Theorem B.7. The weights can be bounded by $\mathcal{O}\left(N^{(d+k)^{2}}\right)$. We write $\hat{u}=\left(\hat{u}_{1}, \ldots, \hat{u}_{d}\right)$. Moreover, by Theorem B.7 there also exists a tanh neural network $\widehat{p}:=\widehat{p}^{N}$ with two hidden layers and the same widths as before such that
$$
\left\|(\nabla p)_{j}-(\nabla \widehat{p})_{j}\right\|_{L^{2}(\Omega)} \leq\|p-\widehat{p}\|_{H^{1}(\Omega)} \leq C_{1, k-1, d+1, p} \lambda_{1}(N) N^{-k+2} .
$$
It is now straightforward to bound the PINN residual.
$$
\left\|\left(u_{j}\right)_{t}-\left(\hat{u}_{j}\right)_{t}\right\|_{L^{2}(\Omega)} \leq\left|u_{j}-\hat{u}_{j}\right|_{H^{1}(\Omega)} .
$$
By the Sobolev embedding theorem (Lemma A.2) it follows from $u \in C^{1}\left([0, T], H^{r-2}\left(\mathbb{T}^{d}\right)\right)$ that $u \in C^{1}(\Omega)$, and hence
$$
\begin{aligned}
\left\|u \cdot \nabla u_{j}-\hat{u} \cdot \nabla \hat{u}_{j}\right\|_{L^{2}(\Omega)} & \leq\left\|u \cdot \nabla u_{j}-\hat{u} \cdot \nabla u_{j}\right\|_{L^{2}(\Omega)}+\left\|\hat{u} \cdot \nabla u_{j}-\hat{u} \cdot \nabla \hat{u}_{j}\right\|_{L^{2}(\Omega)} \\
& \leq \sqrt{d}\left\|u_{j}\right\|_{C^{1}} \max _{i}\left\|u_{i}-\hat{u}_{i}\right\|_{L^{2}(\Omega)}+\sqrt{d} \max _{i}\left\|\hat{u}_{i}\right\|_{C^{0}}\left|u_{j}-\hat{u}_{j}\right|_{H^{1}(\Omega)}
\end{aligned}
$$
and finally also
$$
\begin{aligned}
\left\|\Delta u_{j}-\Delta \hat{u}_{j}\right\|_{L^{2}(\Omega)} & \leq \sqrt{d}\left\|u_{j}-\hat{u}_{j}\right\|_{H^{2}(\Omega)} \\
\|\operatorname{div}(u)-\operatorname{div}(\hat{u})\|_{L^{2}(\Omega)} & \leq \sqrt{d} \max _{i}\left|u_{i}-\hat{u}_{i}\right|_{H^{1}(\Omega)}
\end{aligned}
$$
Hence, we find that for $1 \leq j \leq d$
$$
\begin{aligned}
& \left\|\left(\hat{u}_{j}\right)_{t}+\hat{u} \cdot \nabla \hat{u}_{j}+(\nabla \widehat{p})_{j}-\nu \Delta \hat{u}_{j}\right\|_{L^{2}(\Omega)} \leq C_{1, k-1, d+1, p} \lambda_{1}(N) N^{-k+2} \\
& \quad+C_{1, k, d+1, u_{j}} \lambda_{1}(N)\left(1+\sqrt{d} \max _{i}\left\|\hat{u}_{i}\right\|_{C^{0}}\right) N^{-k+1} \\
& \quad+\sqrt{d} \lambda_{0}(N)\left\|u_{j}\right\|_{C^{1}} C_{0, k, d+1, u_{j}} N^{-k}+\nu \sqrt{d} C_{2, k, d+1, u_{j}} \lambda_{2}(N) N^{-k+2}
\end{aligned}
$$
and also
$$
\|\operatorname{div}(\hat{u})\|_{L^{2}(\Omega)} \leq \sqrt{d} C_{1, k, d+1, u_{1}} \lambda_{1}(N) N^{-k+1} .
$$
Finally, we find from the multiplicative trace theorem (Lemma A.3) that
$$
\begin{aligned}
\left\|\left(u_{0}\right)_{j}-\hat{u}_{j}(t=0)\right\|_{L^{2}\left(\mathbb{T}^{d}\right)} & \leq\left\|u_{j}-\hat{u}_{j}\right\|_{L^{2}(\partial \Omega)} \\
& \leq \sqrt{\frac{2 \max \left\{2 h_{\Omega}, d+1\right\}}{\rho_{\Omega}}}\left\|u_{j}-\hat{u}_{j}\right\|_{H^{1}(\Omega)} \\
& \leq \sqrt{\frac{2 \max \left\{2 h_{\Omega}, d+1\right\}}{\rho_{\Omega}}} C_{1, k, d+1, u_{1}} \lambda_{1}(N) N^{-k+1},
\end{aligned}
$$
where $h_{\Omega}$ is the diameter of $\Omega$ and $\rho_{\Omega}$ is the radius of the largest $(d+1)$-dimensional ball that can be inscribed into $\Omega$. This concludes the proof.
\begin{figure}
\includegraphics[width=\textwidth]{./2022_12_07_6eb6e893739c47f2dcc9g-10}
\caption{Needed neural network size according to Theorem 3.1 such that (3.1), (3.2) and (3.3) are all smaller than $1\%$, for varying regularity $r$ of the initial condition, i.e., $u_{0} \in H^{r}\left(\mathbb{T}^{d}\right)$.}
\end{figure}
Thus, the bounds (3.1), (3.2) and (3.3) clearly show that by choosing $N$ sufficiently large, we can make the PINN residuals (2.8) and consequently the generalization error arbitrarily small. This affirmatively answers Q1 from the introduction.
To further illustrate the bounds of Theorem 3.1, we look for a suitable neural network such that (3.1), (3.2) and (3.3) are all smaller than 1\%. Using the notation of the proof, we set $T=1$ and $\nu=\frac{1}{1000}$ and make the simplification that $\|u\|_{H^{k}(\Omega)}=1$ for all $k$. The results are shown for $d=2,3$ in Figure 1 for varying regularity $r$ of the initial condition, i.e., $u_{0} \in H^{r}\left(\mathbb{T}^{d}\right)$. In particular, for $d=2$ we find that the minimal network size of every sub-network $\hat{u}_{j}$ is $54 \cdot 10^{3}$ neurons. Although it is certainly possible to reach this level of accuracy with smaller networks, see e.g. [16], the networks that follow from Theorem 3.1 are not unreasonably large, even in three space dimensions.
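The hidden-layer widths appearing in Theorem 3.1 are fully explicit and can be tabulated directly. The snippet below (Python; the values of $N$ are illustrative only and do not reproduce the tolerance computation behind Figure 1) evaluates both width formulas for $d=2$ and $k=3$.
\begin{verbatim}
from math import ceil, comb

def hidden_widths(d, k, N, T=1.0):
    # Widths of the two hidden layers in Theorem 3.1:
    #   3*ceil(k/2)*C(d+k-1, d) + ceil(T*N) + d*N   and   3*d*C(2d+1, d)*ceil(T*N)*N^d
    w1 = 3 * ceil(k / 2) * comb(d + k - 1, d) + ceil(T * N) + d * N
    w2 = 3 * d * comb(2 * d + 1, d) * ceil(T * N) * N ** d
    return w1, w2

for N in (5, 10, 20):
    print(N, hidden_widths(2, 3, N))
\end{verbatim}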
Remark 3.2. One can easily prove that the XPINN loss of the network constructed in the proof of Theorem 3.1 will be small as well.
\subsection{Bound on the total error}
Next, we show that neural networks for which the (X)PINN residuals are small provide a good $L^{2}$-approximation of the true solution $u: \Omega=D \times[0, T] \rightarrow \mathbb{R}^{d}, p: \Omega \rightarrow \mathbb{R}$ of the Navier-Stokes equations (2.1) on the torus $D=\mathbb{T}^{d}=[0,1)^{d}$ with periodic boundary conditions. Our analysis can be readily extended to other boundary conditions, such as no-slip boundary conditions, i.e., $u(x, t)=0$ for all $(x, t) \in \partial D \times[0, T]$, and no-penetration boundary conditions, i.e., $u(x, t) \cdot \hat{n}_{D}=0$ for all $(x, t) \in \partial D \times[0, T]$.
For neural networks $\left(u_{\theta}, p_{\theta}\right)$, we define the following PINN-related residuals,
$$
\begin{aligned}
& \mathcal{R}_{\mathrm{PDE}}=\partial_{t} u_{\theta}+\left(u_{\theta} \cdot \nabla\right) u_{\theta}+\nabla p_{\theta}-\nu \Delta u_{\theta}, \quad \mathcal{R}_{\text {div }}=\operatorname{div}\left(u_{\theta}\right), \\
& \mathcal{R}_{s, u}(x)=u_{\theta}(x)-u_{\theta}(x+1), \quad \mathcal{R}_{s, p}(x)=p_{\theta}(x)-p_{\theta}(x+1), \\
& \mathcal{R}_{s, \nabla u}(x)=\nabla u_{\theta}(x)-\nabla u_{\theta}(x+1), \quad \mathcal{R}_{s}=\left(\mathcal{R}_{s, u}, \mathcal{R}_{s, p}, \mathcal{R}_{s, \nabla u}\right), \\
& \mathcal{R}_{t}=u_{\theta}(t=0)-u(t=0),
\end{aligned}
$$
where we drop the $\theta$-dependence in the definition of the residuals for notational convenience.
\begin{figure}
\includegraphics[width=\textwidth]{./2022_12_07_6eb6e893739c47f2dcc9g-11}
\caption{Visualization of the set-up for the XPINN framework with two subdomains.}
\end{figure}
We will also extend our analysis to the XPINN framework for two subdomains (the extension to more subdomains is straightforward). For this reason, we assume that $D=D_{a} \cup D_{b}$, where $D_{a}$ and $D_{b}$ are closed with non-overlapping interiors, $\mathring{D}_{a} \cap \mathring{D}_{b}=\emptyset$, and common boundary $\Gamma=D_{a} \cap D_{b}$, which we assume to be suitably smooth. We define $\hat{n}_{\Gamma}$ to point outwards of $D_{a}$. Figure 2 provides a visualization of this set-up.
Following (2.12), the XPINN solution is then defined as
$$
u_{\theta}=\left\{\begin{array}{ll}
u_{\theta}^{a} & \text { in } D_{a} \backslash \Gamma, \\
u_{\theta}^{b} & \text { in } D_{b} \backslash \Gamma, \\
\frac{1}{2}\left(u_{\theta}^{a}+u_{\theta}^{b}\right) & \text { in } \Gamma,
\end{array} \quad p_{\theta}= \begin{cases}p_{\theta}^{a} & \text { in } D_{a} \backslash \Gamma, \\
p_{\theta}^{b} & \text { in } D_{b} \backslash \Gamma, \\
\frac{1}{2}\left(p_{\theta}^{a}+p_{\theta}^{b}\right) & \text { in } \Gamma,\end{cases}\right.
$$
where $u_{\theta}^{a}, u_{\theta}^{b}, p_{\theta}^{a}, p_{\theta}^{b}$ are neural networks. In addition to the PINN-related residuals, the following XPINN-related residuals need to be defined,
$$
\begin{aligned}
& \mathcal{R}_{u}=\max _{j}\left|\left(u_{\theta}^{a}\right)_{j}-\left(u_{\theta}^{b}\right)_{j}\right|, \quad \mathcal{R}_{\nabla u}=\max _{i, j}\left|\partial_{i}\left(u_{\theta}^{a}\right)_{j}-\partial_{i}\left(u_{\theta}^{b}\right)_{j}\right|, \\
& \mathcal{R}_{p}=\max _{i, j}\left|p_{\theta}^{a}-p_{\theta}^{b}\right| .
\end{aligned}
$$
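For concreteness, the patched evaluation (3.14) and the interface residual $\mathcal{R}_{u}$ of (3.15) can be sketched as follows (Python with NumPy; the subnetworks \texttt{u\_a} and \texttt{u\_b} are arbitrary callables, and the choice of interface $\Gamma=\left\{x_{1}=1 / 2\right\}$ is hypothetical and made only for the example).
\begin{verbatim}
import numpy as np

def xpinn_eval(u_a, u_b, z):
    # Patched XPINN (3.14): u_a on D_a = {x_1 <= 1/2}, u_b on D_b = {x_1 >= 1/2},
    # and the average of both subnetworks on the interface Gamma = {x_1 = 1/2}.
    x1 = z[:, 0]
    va, vb = u_a(z), u_b(z)
    out = np.where((x1 < 0.5)[:, None], va, vb)
    on_gamma = np.isclose(x1, 0.5)
    out[on_gamma] = 0.5 * (va[on_gamma] + vb[on_gamma])
    return out

def interface_residual_u(u_a, u_b, z_gamma):
    # R_u of (3.15): componentwise maximum of |(u_a)_j - (u_b)_j| at interface points.
    return np.abs(u_a(z_gamma) - u_b(z_gamma)).max(axis=1)
\end{verbatim}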
The following theorem then bounds the $L^{2}$-error of the (X)PINN in terms of the residuals defined above; see also [29, 3] for versions of the stability argument used below. We write $|\partial D|,|\Gamma|$ for the $(d-1)$-dimensional Lebesgue measure of $\partial D$ and $\Gamma$, respectively, and $|D|$ for the $d$-dimensional Lebesgue measure of $D$.
Theorem 3.3. Let $d \in \mathbb{N}, D=\mathbb{T}^{d}$ and $u \in C^{1}(D \times[0, T])$ be the classical solution of the Navier-Stokes equation (2.1). Let $\left(u_{\theta}, p_{\theta}\right)$ be a PINN/XPINN with parameters $\theta$, then the resulting $L^{2}$-error is bounded as follows,
$$
\int_{\Omega}\left\|u(x, t)-u_{\theta}(x, t)\right\|_{2}^{2} d x d t \leq \mathcal{C} T \exp \left(T\left(2 d^{2}\|\nabla u\|_{L^{\infty}(\Omega)}+1\right)\right),
$$
where the constant $\mathcal{C}$ is defined as,
$$
\begin{aligned}
\mathcal{C}= & \left\|\mathcal{R}_{t}\right\|_{L^{2}(D)}^{2}+\left\|\mathcal{R}_{\mathrm{PDE}}\right\|_{L^{2}(\Omega)}^{2}+C_{1} \sqrt{T}\left[\sqrt{|D|}\left\|\mathcal{R}_{\mathrm{div}}\right\|_{L^{2}(\Omega)}+(1+\nu) \sqrt{|\partial D|}\left\|\mathcal{R}_{s}\right\|_{L^{2}(\partial D \times[0, T])}\right. \\
& \left.+\sqrt{|\Gamma|}\left((1+\nu)\left\|\mathcal{R}_{u}\right\|_{L^{2}(\Gamma \times[0, T])}+\nu\left\|\mathcal{R}_{\nabla u}\right\|_{L^{2}(\Gamma \times[0, T])}+\left\|\mathcal{R}_{p}\right\|_{L^{2}(\Gamma \times[0, T])}\right)\right]
\end{aligned}
$$
and $C_{1}=C_{1}\left(\|u\|_{C^{1}},\|\hat{u}\|_{C^{1}},\|p\|_{C^{0}},\|\hat{p}\|_{C^{0}}\right)<\infty$. For PINNs, it holds that $\mathcal{R}_{u}=\mathcal{R}_{\nabla u}=\mathcal{R}_{p}=0$.
Proof. Let $\hat{u}=u_{\theta}-u$ and $\hat{p}=p_{\theta}-p$ denote the difference between the solution of the Navier-Stokes equations and a PINN with parameter vector $\theta$. Using the Navier-Stokes equations 2.1 and the definitions of the different residuals, we find after a straightforward calculation that,
$\mathcal{R}_{\mathrm{PDE}}=\hat{u}_{t}+(\hat{u} \cdot \nabla) \hat{u}+(u \cdot \nabla) \hat{u}+(\hat{u} \cdot \nabla) u+\nabla \hat{p}-\nu \Delta \hat{u}$
$\mathcal{R}_{\operatorname{div}}=\operatorname{div}(\hat{u}), \quad \mathcal{R}_{s}(x)=u_{\theta}(x)-u_{\theta}(x+1), \quad \mathcal{R}_{t}=\hat{u}(t=0)$,
$\mathcal{R}_{u}=\max _{j}\left|\left(u_{\theta}^{a}\right)_{j}-\left(u_{\theta}^{b}\right)_{j}\right|, \quad \mathcal{R}_{\nabla u}=\max _{i, j}\left|\partial_{i}\left(u_{\theta}^{a}\right)_{j}-\partial_{i}\left(u_{\theta}^{b}\right)_{j}\right|, \quad \mathcal{R}_{p}=\max _{i, j}\left|p_{\theta}^{a}-p_{\theta}^{b}\right|$.
Next, we recall the following vector equalities,
$$
\hat{u} \cdot \hat{u}_{t}=\frac{1}{2} \partial_{t}\|\hat{u}\|_{2}^{2}, \quad \hat{u} \cdot((\hat{u} \cdot \nabla) \hat{u})=\frac{1}{2}(\hat{u} \cdot \nabla)\|\hat{u}\|_{2}^{2}, \quad \hat{u} \cdot((u \cdot \nabla) \hat{u})=\frac{1}{2}(u \cdot \nabla)\|\hat{u}\|_{2}^{2}
$$
We take the inner product of the above expression for $\mathcal{R}_{\mathrm{PDE}}$ with $\hat{u}$, and use the previous vector equalities to obtain, $(3.20)$
$\frac{1}{2} \partial_{t}\|\hat{u}\|_{2}^{2}+\frac{1}{2}(\hat{u} \cdot \nabla)\|\hat{u}\|_{2}^{2}+\frac{1}{2}(u \cdot \nabla)\|\hat{u}\|_{2}^{2}+\hat{u} \cdot((\hat{u} \cdot \nabla) u)+(\hat{u} \cdot \nabla) \hat{p}-\nu \hat{u} \cdot \Delta \hat{u}=\hat{u} \cdot \mathcal{R}_{\mathrm{PDE}}$
Now let $\Lambda \subset D$ be such that $\partial \Lambda$ is piecewise smooth with outward normal vector $\hat{n}_{\Lambda}$. Denote by $T_{\Lambda}$ the corresponding trace operator. Integrating (3.20) over $\Lambda$ and integrating by parts yields,
$$
\begin{aligned}
\frac{d}{d t} \int_{\Lambda}\|\hat{u}\|_{2}^{2} d x= & \int_{\Lambda} \mathcal{R}_{\operatorname{div}}\left(\|\hat{u}\|_{2}^{2}+2 \hat{p}\right) d x-\int_{\partial \Lambda} T_{\Lambda}(\hat{u}) \cdot \hat{n}_{\Lambda}\left(\|\hat{u}\|_{2}^{2}+2 \hat{p}\right) d s(x) \\
& -2 \int_{\Lambda} \hat{u} \cdot((\hat{u} \cdot \nabla) u) d x-2 \nu \sum_{j=1}^{d} \int_{\Lambda}\left\|\nabla \hat{u}_{j}\right\|_{2}^{2} d x \\
& +2 \nu \sum_{j=1}^{d} \int_{\partial \Lambda} T_{\Lambda}\left(\hat{u}_{j}\right)\left(\hat{n}_{\Lambda} \cdot T_{\Lambda}\left(\nabla \hat{u}_{j}\right)\right) d s(x)+2 \int_{\Lambda} \hat{u} \cdot \mathcal{R}_{\mathrm{PDE}} d x
\end{aligned}
$$
The use of the trace operator $T_{\Lambda}$ is necessary since the trace of $\hat{u}$ on $\partial \Lambda$ might not agree with the actual definition of $\hat{u}$ as in (3.14). We then find
$$
\begin{aligned}
& \sum_{i=a}^{b} \int_{\partial D_{i}} T_{D_{i}}\left(\hat{u}^{i}\right) \cdot \hat{n}_{D_{i}}\left(\left\|\hat{u}^{i}\right\|_{2}^{2}+2 \hat{p}^{i}\right) d s(x)-\int_{\partial D} \hat{u} \cdot \hat{n}_{D}\left(\|\hat{u}\|_{2}^{2}+2 \hat{p}\right) d s(x) \\
& =\int_{\Gamma} \hat{u}^{a} \cdot \hat{n}_{\Gamma}\left(\left\|\hat{u}^{a}\right\|_{2}^{2}+2 \hat{p}^{a}\right) d s(x)-\int_{\Gamma} \hat{u}^{b} \cdot \hat{n}_{\Gamma}\left(\left\|\hat{u}^{b}\right\|_{2}^{2}+2 \hat{p}^{b}\right) d s(x) \\
& =\int_{\Gamma}\left(u_{\theta}^{a}-u_{\theta}^{b}\right) \cdot \hat{n}_{\Gamma}\left(\left\|\hat{u}^{a}\right\|_{2}^{2}+2 \hat{p}^{a}\right) d s(x)+\int_{\Gamma} \hat{u}^{b} \cdot \hat{n}_{\Gamma}\left(\left\|\hat{u}^{a}\right\|_{2}^{2}-\left\|\hat{u}^{b}\right\|_{2}^{2}+2\left(p_{\theta}^{a}-p_{\theta}^{b}\right)\right) d s(x)
\end{aligned}
$$
And similarly,
$$
\begin{aligned}
& \sum_{i=a}^{b} \int_{\partial D_{i}} T_{D_{i}}\left(\hat{u}_{j}^{i}\right)\left(\hat{n}_{D_{i}} \cdot T_{D_{i}}\left(\nabla \hat{u}_{j}^{i}\right)\right) d s(x)-\int_{\partial D} \hat{u}_{j}\left(\hat{n}_{D} \cdot \nabla \hat{u}_{j}\right) d s(x) \\
& =\int_{\Gamma} \hat{u}_{j}^{a}\left(\hat{n}_{\Lambda} \cdot \nabla \hat{u}_{j}^{a}\right) d s(x)-\int_{\Gamma} \hat{u}_{j}^{b}\left(\hat{n}_{\Lambda} \cdot \nabla \hat{u}_{j}^{b}\right) d s(x) \\
& =\int_{\Gamma}\left(\left(u_{\theta}^{a}\right)_{j}-\left(u_{\theta}^{b}\right)_{j}\right)\left(\hat{n}_{\Lambda} \cdot \nabla \hat{u}_{j}^{a}\right) d s(x)-\int_{\Gamma} \hat{u}_{j}^{b}\left(\hat{n}_{\Lambda} \cdot \nabla\left(\left(u_{\theta}^{a}\right)_{j}-\left(u_{\theta}^{b}\right)_{j}\right)\right) d s(x)
\end{aligned}
$$
Moreover, we calculate that for a constant $C_{1}\left(\|u\|_{C^{1}},\|\hat{u}\|_{C^{1}},\|p\|_{C^{0}},\|\hat{p}\|_{C^{0}}\right)$ it holds that,
$$
\begin{aligned}
& -\int_{D} \hat{u} \cdot((\hat{u} \cdot \nabla) u) d x \leq d^{2}\|\nabla u\|_{L^{\infty}(\Omega)} \int_{D}\|\hat{u}\|_{2}^{2} d x \\
& \left|\int_{\partial D} \hat{u} \cdot \hat{n}_{D}\left(\|\hat{u}\|_{2}^{2}+2 \hat{p}\right) d s(x)\right| \leq C_{1}\left(\left\|\mathcal{R}_{s, u}\right\|_{L^{1}(\partial D)}+\left\|\mathcal{R}_{s, p}\right\|_{L^{1}(\partial D)}\right) \\
& \int_{\partial D} \hat{u}_{j}\left(\hat{n}_{D} \cdot \nabla \hat{u}_{j}\right) d s(x) \leq C_{1}\left(\left\|\mathcal{R}_{s, u}\right\|_{L^{1}(\partial D)}+\left\|\mathcal{R}_{s, \nabla u}\right\|_{L^{1}(\partial D)}\right)
\end{aligned}
$$
where $\Omega=D \times[0, T]$. Now, summing (3.20) over the different $\Lambda=D_{i}$, integrating over the interval $[0, \tau] \subset[0, T]$ and using (3.21), (3.22), (3.23), we find that,
$$
\begin{aligned}
\int_{D}\|\hat{u}(x, \tau)\|_{2}^{2} d x \leq & \left\|\mathcal{R}_{t}\right\|_{L^{2}(D)}^{2}+C_{1} \sqrt{T|D|}\left\|\mathcal{R}_{\operatorname{div}}\right\|_{L^{2}(\Omega)}+C_{1}(1+\nu) \sqrt{T|\partial D|}\left\|\mathcal{R}_{s}\right\|_{L^{2}(\partial D \times[0, T])} \\
& +C_{1}(1+\nu) \sqrt{T|\Gamma|} \max _{j}\left\|\left(u_{\theta}^{a}\right)_{j}-\left(u_{\theta}^{b}\right)_{j}\right\|_{L^{2}(\Gamma \times[0, T])} \\
& +C_{1} \sqrt{T|\Gamma|}\left\|p_{\theta}^{a}-p_{\theta}^{b}\right\|_{L^{2}(\Gamma \times[0, T])}+2 d^{2}\|\nabla u\|_{L^{\infty}(\Omega)} \int_{D \times[0, \tau]}\|\hat{u}(x, t)\|_{2}^{2} d x d t \\
& +C_{1} \nu \sqrt{T|\Gamma|} \max _{i, j}\left\|\partial_{i}\left(u_{\theta}^{a}\right)_{j}-\partial_{i}\left(u_{\theta}^{b}\right)_{j}\right\|_{L^{2}(\Gamma \times[0, T])} \\
& +\left\|\mathcal{R}_{\operatorname{PDE}}\right\|_{L^{2}(\Omega)}^{2}+\int_{D \times[0, \tau]}\|\hat{u}(x, t)\|_{2}^{2} d x d t
\end{aligned}
$$
where $\mathcal{R}_{s}=\left(\mathcal{R}_{s, u}, \mathcal{R}_{s, p}, \mathcal{R}_{s, \nabla u}\right)$ as in (3.13). Using Grönwall's inequality and integrating over $[0, T]$, we find that,
$$
\int_{\Omega}\|\hat{u}(x, t)\|_{2}^{2} d x d t \leq \mathcal{C} T \exp \left(T\left(2 d^{2}\|\nabla u\|_{L^{\infty}(\Omega)}+1\right)\right)
$$
where the constant $\mathcal{C}$ is defined as,
$$
\begin{aligned}
\mathcal{C}= & \left\|\mathcal{R}_{t}\right\|_{L^{2}(D)}^{2}+\left\|\mathcal{R}_{\mathrm{PDE}}\right\|_{L^{2}(\Omega)}^{2}+C_{1} \sqrt{T}\left[\sqrt{|D|}\left\|\mathcal{R}_{\operatorname{div}}\right\|_{L^{2}(\Omega)}+(1+\nu) \sqrt{|\partial D|}\left\|\mathcal{R}_{s}\right\|_{L^{2}(\partial D \times[0, T])}\right. \\
& \left.+\sqrt{|\Gamma|}\left((1+\nu)\left\|\mathcal{R}_{u}\right\|_{L^{2}(\Gamma \times[0, T])}+\nu\left\|\mathcal{R}_{\nabla u}\right\|_{L^{2}(\Gamma \times[0, T])}+\left\|\mathcal{R}_{p}\right\|_{L^{2}(\Gamma \times[0, T])}\right)\right]
\end{aligned}
$$
Remark 3.4. Although the existence of a $C^{1}$ solution of the Navier-Stokes equations is guaranteed by Theorem 2.1, it is still possible that $\|\nabla u\|_{L^{\infty}(\Omega)}$ becomes very large, e.g. for complicated solutions characterized by strong vorticity [29]. In such a case, Theorem 3.3 indicates that the generalization error might be large.

Remark 3.5. For PINNs, the $L^{2}$-error is bounded uniquely in terms of residuals that are a part of the PINN generalization error (2.9). This implies that for neural networks with a small PINN loss the corresponding $L^{2}$-error will be small as well, provided that the $C^{1}$-norm of the network does not blow up. This affirmatively answers question Q2. For XPINNs, we can see that the XPINN-specific residuals $\mathcal{R}_{u}$ and $\mathcal{R}_{p}$ (as defined in (3.15)) are equivalent to the $\mathcal{R}_{u}$ residual in the XPINN generalization error (2.15). The residual $\mathcal{R}_{\nabla u}$, however, does not show up in the original XPINN framework, and should therefore be added to the XPINN loss function (2.15) to theoretically guarantee a small $L^{2}$-error.

Remark 3.6. Variants of Theorem 3.3 for different kinds of boundary conditions can be proven in the same way as above. For example, the statement of Theorem 3.3 still holds for no-slip boundary conditions, i.e., $u(x, t)=0$ for all $(x, t) \in \partial D \times[0, T]$, if one defines the spatial boundary residual as $\mathcal{R}_{s}=u_{\theta}$.

Remark 3.7. Although the focus in this paper lies on solving the Navier-Stokes equations for the velocity, we note that one can also prove a stability result for $\left\|p-p_{\theta}\right\|_{L^{2}(\Omega)}$ in a similar spirit to Theorem 3.3. The main steps consist of taking the divergence of the Navier-Stokes equations, using the identity (2.2) and rewriting the result in terms of the different residuals.
The existence of a PINN (XPINN) with an arbitrarily small $L^{2}$-error is a simple byproduct of the proof of Theorem 3.1.
For completeness, we show that one can also use Theorem $3.3$ to obtain a quantitative convergence result on the $L^{2}$-error of the PINN approximation of the solution of the Navier-Stokes equation in terms of the number of neurons of the neural network.
Corollary 3.8.
Let $d, r, k \in \mathbb{N}$, where $k \geq 3$ and let $u_{0} \in H^{r}\left(\mathbb{T}^{d}\right)$ with $r>\frac{d}{2}+2 k$ and $\operatorname{div}\left(u_{0}\right)=0$. It holds that:
\begin{itemize}
\item there exist $T>0$ and a classical solution $u$ to the Navier-Stokes equations such that $u \in H^{k}(\Omega), \nabla p \in H^{k-1}(\Omega), \Omega=\mathbb{T}^{d} \times[0, T]$, and $u(t=0)=u_{0}$,
\item there exist constants $C, \beta>0$ such that for every $N \in \mathbb{N}$, there exist tanh neural networks $\hat{u}_{j}, 1 \leq j \leq d$, and $\widehat{p}$, each with two hidden layers, of widths $3\left\lceil\frac{k}{2}\right\rceil\left(\begin{array}{c}d+k-1 \\ d\end{array}\right)+\lceil T N\rceil+d N$ and $3 d\left(\begin{array}{c}2 d+1 \\ d\end{array}\right)\lceil T N\rceil N^{d}$, such that for every $1 \leq j \leq d$
\end{itemize}
$$
\left\|u_{j}-\hat{u}_{j}\right\|_{L^{2}(\Omega)} \leq C \ln ^{\kappa}(\beta N) N^{\frac{-k+1}{2}} .
$$
The value of $C>0$ follows from the proof, $\beta>0$ is as in Theorem 3.1, and $\kappa=2$ for $k=3$ and $\kappa=\frac{1}{2}$ for $k \geq 4$.
Proof. The corollary is a direct consequence of Theorem 3.3 and Theorem 3.1 and its proof.
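To make the rate in Corollary 3.8 concrete, the following short Python snippet (added here purely as an illustration; the constants $C$ and $\beta$ are only guaranteed to exist, so placeholder values are used) tabulates the bound for a few values of $N$.
\begin{verbatim}
import math

def corollary_3_8_rate(N, k, C=1.0, beta=1.0):
    """Bound C * ln^kappa(beta N) * N^(-(k-1)/2) from Corollary 3.8.

    C and beta are placeholders: the corollary only guarantees that they exist.
    """
    kappa = 2.0 if k == 3 else 0.5   # kappa = 2 for k = 3, kappa = 1/2 for k >= 4
    return C * math.log(beta * N) ** kappa * N ** (-(k - 1) / 2)

# For k = 4 the bound decays roughly like N^(-3/2), up to the logarithmic factor.
for N in (10, 20, 40, 80):
    print(N, corollary_3_8_rate(N, k=4))
\end{verbatim}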
\section{Bounds on the total error in terms of training error}
Next, we answer question Q3, raised in the introduction, by providing a bound on the generalization error of $u_{\theta^{*}(\mathcal{S})}$, the PINN that minimizes the training loss, in terms of the training error and the size of the training set $\mathcal{S}$.
Combined with Theorem 3.3, this enables us to bound the total error (the $L^{2}$-mismatch between the exact solution of (2.1) and the trained PINN) in terms of the training error and the size of the training set.
As already announced in Section 2.3, we will focus on training sets obtained using the midpoint rule $\mathcal{Q}_{M}$ for simplicity. For $f \in\left\{\mathcal{R}_{\mathrm{PDE}}^{2}, \mathcal{R}_{\mathrm{div}}^{2}\right\}$ and $\Lambda=\Omega=D \times[0, T]$ we obtain the quadrature $\mathcal{Q}_{M}^{\mathrm{int}}$, for $f=\mathcal{R}_{t}^{2}$ and $\Lambda=D$ we obtain the quadrature $\mathcal{Q}_{M}^{t}$, and for $f=\mathcal{R}_{s}^{2}$ and $\Lambda=\partial D \times[0, T]$ we obtain the quadrature $\mathcal{Q}_{M}^{s}$. For XPINNs, one additionally needs to consider the quadrature $\mathcal{Q}_{M}^{\Gamma}$ obtained for $f \in\left\{\mathcal{R}_{u}^{2}, \mathcal{R}_{\nabla u}^{2}, \mathcal{R}_{p}^{2}\right\}$ and $\Lambda=\Gamma$.
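As an illustration of this quadrature notation, a minimal Python sketch of the midpoint rule on a hyperrectangle and of the assembly of the training loss is given below; it is not taken from the implementation used later, and the residual callables appearing in the comment are hypothetical placeholders.
\begin{verbatim}
import numpy as np

def midpoint_rule(bounds, n_per_dim):
    """Midpoint-rule points and weights on a hyperrectangle prod_i [a_i, b_i].

    Returns M = prod(n_per_dim) cell midpoints and the uniform weight |Lambda| / M.
    """
    axes = [np.linspace(a, b, n, endpoint=False) + (b - a) / (2 * n)
            for (a, b), n in zip(bounds, n_per_dim)]
    pts = np.stack(np.meshgrid(*axes, indexing="ij"), axis=-1).reshape(-1, len(bounds))
    weight = np.prod([b - a for a, b in bounds]) / pts.shape[0]
    return pts, weight

def Q_M(residual_sq, pts, weight):
    """Midpoint quadrature Q_M applied to a squared residual R_q^2."""
    return weight * float(np.sum(residual_sq(pts)))

# With hypothetical callables R_PDE_sq, R_div_sq, R_s_sq, R_t_sq evaluated at the
# trained PINN, the training loss would be assembled as
#   Q_M(R_PDE_sq, pts_int, w_int) + Q_M(R_div_sq, pts_int, w_int)
#   + Q_M(R_s_sq, pts_s, w_s) + Q_M(R_t_sq, pts_t, w_t)
\end{verbatim}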
This notation allows us to write the PINN loss (2.10) in a compact manner,
$$
\begin{aligned}
\mathcal{E}_{T}(\mathcal{S})^{2} & =\mathcal{E}_{T}^{\mathrm{PDE}}\left(\theta, \mathcal{S}_{\text {int }}\right)^{2}+\mathcal{E}_{T}^{\mathrm{div}}\left(\mathcal{S}_{\mathrm{int}}\right)^{2}+\mathcal{E}_{T}^{s}\left(\mathcal{S}_{s}\right)^{2}+\mathcal{E}_{T}^{t}\left(\mathcal{S}_{t}\right)^{2}, \\
& =\mathcal{Q}_{M_{\mathrm{int}}}^{\mathrm{int}}\left[\mathcal{R}_{\mathrm{PDE}}^{2}\right]+\mathcal{Q}_{M_{\mathrm{int}}}^{\mathrm{int}}\left[\mathcal{R}_{\mathrm{div}}^{2}\right]+\mathcal{Q}_{M_{s}}^{s}\left[\mathcal{R}_{s}^{2}\right]+\mathcal{Q}_{M_{t}}^{t}\left[\mathcal{R}_{t}^{2}\right],
\end{aligned}
$$
where all residuals are evaluated at the trained PINN $u_{\theta^{*}(\mathcal{S})}$. Using this notation and Theorem 3.3 from the previous section, we obtain the following theorem that bounds the $L^{2}$-error of the PINN in terms of the training loss and the number of training points.
\\
Theorem 3.9. Let $T>0, d \in \mathbb{N}$, let $(u, p) \in C^{4}\left(\mathbb{T}^{d} \times[0, T]\right)$ be the classical solution of the Navier-Stokes equations (2.1) and let $\left(u_{\theta}, p_{\theta}\right)$ be a PINN with parameters $\theta \in \Theta_{L, W, R}$ (cf. Definition 2.3). Then the following error bound holds,
$$
\begin{aligned}
\int_{\Omega}\left\|u(x, t)-u_{\theta}(x, t)\right\|_{2}^{2} d x d t & \leq \mathcal{C}(M) T \exp \left(T\left(2 d^{2}\|\nabla u\|_{L^{\infty}(\Omega)}+1\right)\right) \\
& =\mathcal{O}\left(\mathcal{E}_{T}(\mathcal{S})+M_{t}^{-\frac{2}{d}}+M_{\mathrm{int}}^{-\frac{1}{d+1}}+M_{s}^{-\frac{1}{d}}\right) .
\end{aligned}
$$
In the above formula, the constant $\mathcal{C}(M)$ is defined as,
$$
\begin{aligned}
\mathcal{C}(M)= & \mathcal{E}_{T}^{t}\left(\mathcal{S}_{t}\right)^{2}+C_{t} M_{t}^{-\frac{2}{d}}+\mathcal{E}_{T}^{\mathrm{PDE}}\left(\theta, \mathcal{S}_{\mathrm{int}}\right)^{2}+C_{\mathrm{PDE}} M_{\mathrm{int}}^{-\frac{2}{d+1}} \\
& +C_{1} T^{\frac{1}{2}}\left[\mathcal{E}_{T}^{\operatorname{div}}\left(\mathcal{S}_{\mathrm{int}}\right)+C_{\mathrm{div}} M_{\mathrm{int}}^{-\frac{1}{d+1}}+(1+\nu)\left(\mathcal{E}_{T}^{s}\left(\mathcal{S}_{s}\right)+C_{s} M_{s}^{-\frac{1}{d}}\right)\right]
\end{aligned}
$$
and where,
$$
\begin{aligned}
C_{1} & \lesssim\|u\|_{C^{1}}+\|p\|_{C^{0}}+\|\hat{u}\|_{C^{1}}+\|\hat{p}\|_{C^{0}} \\
& \lesssim\|u\|_{C^{1}}+\|p\|_{C^{0}}+(d+1)^{2}\left(16 e^{2} W^{3} R\|\sigma\|_{C^{1}}\right)^{L}, \\
C_{t} & \lesssim\|u\|_{C^{2}}^{2}+\|\hat{u}\|_{C^{2}}^{2} \lesssim\|u\|_{C^{2}}^{2}+\left(e^{2} 2^{6} W^{3} R^{2}\|\sigma\|_{C^{2}}\right)^{2 L}, \\
C_{\mathrm{PDE}} & \lesssim\left\|\hat{u}_{j}\right\|_{C^{4}}^{2} \lesssim\left(2 e^{2} 4^{4} W^{3} R^{4}\|\sigma\|_{C^{4}}\right)^{4 L}, \\
C_{\mathrm{div}}, C_{s} & \lesssim\left\|\hat{u}_{j}\right\|_{C^{3}} \lesssim\left(4 e^{2} 3^{4} W^{3} R^{3}\|\sigma\|_{C^{3}}\right)^{3 L / 2} .
\end{aligned}
$$
Proof. The main error estimate of the theorem follows directly from combining Theorem $3.3$ with the quadrature error formula (2.7). The complexity of $C_{1}$ follows from Theorem $3.3$ and Lemma C.1, which states that
$$
\left\|\hat{u}_{j}\right\|_{C^{n}} \leq 16^{L}(d+1)^{2 n}\left(e^{2} n^{4} W^{3} R^{n}\|\sigma\|_{C^{n}}\right)^{n L}
$$
for $n \in \mathbb{N}$, all $j$ and similarly for $\hat{p}$. The complexities of the other constants then follow from this formula and the observation that for every residual $\mathcal{R}_{q}$ it holds that $\left\|\mathcal{R}_{q}^{2}\right\|_{C^{n}} \leq 2^{n}\left\|\mathcal{R}_{q}\right\|_{C^{n}}^{2}$ (from the general Leibniz rule). For instance, we obtain in this way that
$$
C_{t} \lesssim\left\|\mathcal{R}_{t}^{2}\right\|_{C^{2}} \lesssim\|u\|_{C^{2}}^{2}+\|\hat{u}\|_{C^{2}}^{2} \lesssim\|u\|_{C^{2}}^{2}+\left(e^{2} 2^{6} W^{3} R^{2}\|\sigma\|_{C^{2}}\right)^{2 L}
$$
In a similar way, one can calculate that
$$
C_{\mathrm{PDE}} \lesssim\left\|\mathcal{R}_{\mathrm{PDE}}^{2}\right\|_{C^{2}} \lesssim\|\hat{u}\|_{C^{4}}^{2} \lesssim\left(2 e^{2} 4^{4} W^{3} R^{4}\|\sigma\|_{C^{4}}\right)^{4 L}
$$
Finally, we find that
$$
C_{s}^{2}, C_{\operatorname{div}}^{2} \lesssim\left\|\mathcal{R}_{s}^{2}\right\|_{C^{2}},\left\|\mathcal{R}_{\operatorname{div}}^{2}\right\|_{C^{2}} \lesssim\|\hat{u}\|_{C^{3}}^{2} \lesssim\left(4 e^{2} 3^{4} W^{3} R^{3}\|\sigma\|_{C^{3}}\right)^{3 L} .
$$
Remark 3.10. For XPINNs, an entirely analogous result can be proven using the same approach.
Remark 3.11. Note that the upper bounds on the constants in (3.32) depend polynomially on the network width $W$ but exponentially on the network depth $L$. These bounds seem to suggest that one might expect a smaller $L^{2}$-error for a rather shallow (but wide) network than for a very deep network. In Theorem 3.1, we have already proven explicit error bounds for a neural network with only two hidden layers.
\\
Remark 3.12. If we assume that the optimization algorithm used to minimize the training loss, i.e., to solve (2.11), finds a global minimum, then one can prove that the training error in Theorem 3.9 is small if the training set and hypothesis space are large enough. To see this, first fix an error tolerance $\epsilon$ and observe that for the network $\hat{u}$ constructed in Theorem 3.1 all relevant PINN residuals, and therefore also the generalization error $\mathcal{E}_{G}\left(\theta_{\hat{u}}\right)$ (2.9), are of order $\mathcal{O}(\epsilon)$. If one then constructs the training set such that $M_{\mathrm{int}} \sim \epsilon^{-\frac{d+1}{2}}$ and $M_{t} \sim M_{s} \sim \epsilon^{-\frac{d}{2}}$, then it holds that $\mathcal{E}_{T}^{q}\left(\theta_{\hat{u}}\right) \leq \epsilon+\mathcal{E}_{G}^{q}\left(\theta_{\hat{u}}\right)$ for $q \in\{s, t, \operatorname{div}, \mathrm{PDE}\}$ and, as a consequence, $\mathcal{E}_{T}\left(\theta_{\hat{u}}, \mathcal{S}\right)=\mathcal{O}(\epsilon)$. If the optimization algorithm reaches a global minimum, the training loss of $u_{\theta^{*}(\mathcal{S})}$ will be upper bounded by that of $\hat{u}$. Therefore it also holds that $\mathcal{E}_{T}(\mathcal{S})=\mathcal{O}(\epsilon)$.
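To illustrate Remark 3.11 numerically, the sketch below evaluates the upper bound on $C_{\mathrm{PDE}}$ from (3.32) for a few widths and depths. The values chosen for $R$ (the weight bound in $\Theta_{L,W,R}$) and for the $C^{4}$-norm of the activation are rough placeholders, so only the relative growth in $W$ and $L$ is meaningful.
\begin{verbatim}
import math

def c_pde_upper_bound(W, L, R=1.0, sigma_c4=4.0):
    """Upper bound C_PDE <~ (2 e^2 4^4 W^3 R^4 ||sigma||_{C^4})^{4L} from (3.32).

    R and sigma_c4 (a bound on the C^4-norm of the activation) are placeholders.
    """
    base = 2.0 * math.e ** 2 * 4 ** 4 * W ** 3 * R ** 4 * sigma_c4
    return base ** (4 * L)

# Doubling the width multiplies the bound by 2^(12 L) (polynomial growth in W),
# whereas increasing the depth by one multiplies it by base^4 (exponential in L).
for W, L in [(80, 2), (160, 2), (80, 3)]:
    print(W, L, c_pde_upper_bound(W, L))
\end{verbatim}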
\section{Numerical experiments}
In this section, we illustrate the bounds on the error of the PINN and XPINN approximations of the Navier-Stokes equations (2.1) empirically with a numerical experiment.
To this end, we consider the Navier-Stokes equations in two space dimensions with initial data corresponding to the Taylor-Green vortex test case, an unsteady flow of decaying vortices. The exact closed-form solution of the Taylor-Green vortex problem is given by
$$
\begin{aligned}
& u(t, x, y)=-\cos (\pi x) \sin (\pi y) \exp \left(-2 \pi^{2} \nu t\right) \\
& v(t, x, y)=\sin (\pi x) \cos (\pi y) \exp \left(-2 \pi^{2} \nu t\right) \\
& p(t, x, y)=-\frac{\rho}{4}[\cos (2 \pi x)+\cos (2 \pi y)] \exp \left(-4 \pi^{2} \nu t\right)
\end{aligned}
$$
The spatio-temporal domain is $(x, y) \in[0.5,4.5]^{2}$ and $t \in[0,1]$.
The Taylor-Green vortex serves two key requirements in our context. First, it provides an analytical solution of the Navier-Stokes equations and enables us to evaluate $L^{2}$-errors with respect to this exact solution and without having to consider further (numerical) approximations. Second, the underlying solution is clearly smooth enough to fit the regularity criteria of all our error estimates, presented in the previous section.
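For reference, the closed-form solution above can be transcribed directly into code; the following Python function (the name and signature are ours) is used only to evaluate errors against the exact solution.
\begin{verbatim}
import numpy as np

def taylor_green(t, x, y, nu, rho=1.0):
    """Closed-form Taylor-Green vortex solution (u, v, p); rho = 1 in the experiments."""
    decay = np.exp(-2.0 * np.pi ** 2 * nu * t)
    u = -np.cos(np.pi * x) * np.sin(np.pi * y) * decay
    v = np.sin(np.pi * x) * np.cos(np.pi * y) * decay
    p = -0.25 * rho * (np.cos(2.0 * np.pi * x) + np.cos(2.0 * np.pi * y)) * decay ** 2
    return u, v, p
\end{verbatim}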
We will approximate the Taylor-Green vortex with PINNs and XPINNs. In the case of XPINNs, we decompose the domain into two subdomains along the $x$-axis ($x \geq 2.5$ and $x<2.5$), where separate neural networks are employed. On the common interface we use 300 points for stitching these two subdomains together. The density is set to $\rho=1$. An ensemble training procedure is performed to find the correlation between the total error $(\mathcal{E})$ and the training error $\left(\mathcal{E}_{T}\right)$ for different values of $\nu$. For the neural network training we use full-batch training with the Adam optimizer for the first 20000 iterations, followed by the L-BFGS optimizer [4] for another 60000 iterations or until convergence. The number of hidden layers in both the PINN and the XPINN is 2 (as suggested by the theory) with 80 neurons in each layer, and the $27 \mathrm{~K}$ quadrature points are obtained using the midpoint rule. The learning rate is $8 \times 10^{-4}$, and the activation function is the hyperbolic tangent in both cases.
We train the networks 80 times with different initializations of the weights and biases.
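For concreteness, a minimal PyTorch sketch of the setup just described is given below: a tanh network with two hidden layers of 80 neurons mapping $(t,x,y)$ to $(u,v,p)$, and the interior residuals $\mathcal{R}_{\mathrm{PDE}}$ and $\mathcal{R}_{\mathrm{div}}$ assembled with automatic differentiation. This is an illustrative reconstruction under the stated assumptions, not the code used for the experiments.
\begin{verbatim}
import torch

# Tanh network with two hidden layers of 80 neurons mapping (t, x, y) -> (u, v, p).
net = torch.nn.Sequential(
    torch.nn.Linear(3, 80), torch.nn.Tanh(),
    torch.nn.Linear(80, 80), torch.nn.Tanh(),
    torch.nn.Linear(80, 3),
)

def grads(f, z):
    """All first derivatives of a scalar field f (shape (M, 1)) w.r.t. the inputs z."""
    return torch.autograd.grad(f, z, grad_outputs=torch.ones_like(f), create_graph=True)[0]

def interior_residuals(z, nu):
    """PDE and divergence residuals at interior points z with columns (t, x, y)."""
    z = z.requires_grad_(True)
    u, v, p = net(z).split(1, dim=1)
    du, dv, dp = grads(u, z), grads(v, z), grads(p, z)
    u_t, u_x, u_y = du[:, 0:1], du[:, 1:2], du[:, 2:3]
    v_t, v_x, v_y = dv[:, 0:1], dv[:, 1:2], dv[:, 2:3]
    p_x, p_y = dp[:, 1:2], dp[:, 2:3]
    lap_u = grads(u_x, z)[:, 1:2] + grads(u_y, z)[:, 2:3]
    lap_v = grads(v_x, z)[:, 1:2] + grads(v_y, z)[:, 2:3]
    r_pde = torch.cat([u_t + u * u_x + v * u_y + p_x - nu * lap_u,
                       v_t + u * v_x + v * v_y + p_y - nu * lap_v], dim=1)
    r_div = u_x + v_y
    return r_pde, r_div

# The training loss is the midpoint-rule quadrature of the squared residuals and is
# minimized with Adam (learning rate 8e-4) followed by L-BFGS, as described above.
\end{verbatim}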
Figure 3 shows the log of the training error vs. the log of the total error for each parameter configuration during ensemble training with three different values of the viscosity $\nu$. As seen from this figure, the total error $\mathcal{E}=\left\|u-u^{*}\right\|_{L^{2}}$ and the training error $\mathcal{E}_{T}$ (3.29) are very tightly correlated (along the diagonal in Figure 3). In particular, and consistent with the estimates in Theorem 3.9, a small training error implies a small total error. Moreover, we see from Figure 3 that the total error $\mathcal{E}$ approximately scales as the square root of the training error, i.e., $\mathcal{E} \lesssim \sqrt{\mathcal{E}_{T}}$, which is also consistent with the bounds in Theorem 3.9.
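This scaling can be checked directly from the ensemble data by a least-squares fit in log-log coordinates, where a slope close to $1/2$ corresponds to $\mathcal{E} \lesssim \sqrt{\mathcal{E}_{T}}$. A minimal sketch follows; the error arrays are whatever was recorded during ensemble training, and no values are reproduced here.
\begin{verbatim}
import numpy as np

def loglog_slope(train_err, total_err):
    """Least-squares slope of log(total error) vs. log(training error).

    Applied to the ensemble results of Figure 3, a slope near 1/2 reflects the
    observed scaling E <~ sqrt(E_T). The input arrays are placeholders for the
    recorded ensemble data; no values are hard-coded here.
    """
    slope, _ = np.polyfit(np.log(np.asarray(train_err)), np.log(np.asarray(total_err)), 1)
    return float(slope)
\end{verbatim}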
\begin{figure}
\includegraphics[width=1.0\linewidth]{./2022_12_07_6eb6e893739c47f2dcc9g-17}
\caption{Log of the training error vs. log of the total error for each parameter (weights and biases) configuration during ensemble training. We used $\nu=0$, $0.01$ and $0.1$.}
\end{figure}
Next, we investigate the behavior of the total and training errors by varying the number of quadrature points. To this end, we train both PINNs and XPINNs 20 times with different parameter initializations and plot the mean and standard deviation of the errors, as shown in Figure 4. All results are for a neural network architecture with 2 hidden layers, 80 neurons in each layer, and the hyperbolic tangent activation function. Moreover, the learning rate is the same as before.
We see from Figure 4 that both the training and total errors decay with respect to the number of quadrature points until they saturate at around $27 \mathrm{~K}$ quadrature points and do not decay any further.
\begin{figure}
\includegraphics[width=1.0\linewidth]{./2022_12_07_6eb6e893739c47f2dcc9g-18}
\caption{Training and total errors for different numbers of quadrature points (residual points).}
\end{figure}
To further illustrate the error estimates derived in the previous section, we revisit the error estimate (3.30).
Given the elaboration of the appearing constants in (3.32) and the fact that we have access to the exact solution for the Taylor-Green vortex as well as to (the derivatives of) the PINN, we can explicitly compute a theoretical bound on the total error in (3.30). This bound depends on the number of quadrature points as well as on the particular weights of the trained PINN. This theoretical bound is also depicted in Figure 4. We see from this figure that the computed theoretical bound closely tracks the qualitative as well as quantitative behavior of the total error for all cases considered here.
The rates of decay of both the error and the bound are very similar. However, the bound is not quantitatively sharp, as there is approximately one order of magnitude difference in its amplitude vis-à-vis the total error.
Such non-sharp bounds on the error are common in theoretical machine learning, see [1] for instance. Even in the case of PINNs, they were already seen in [29], where the authors observed at least two to three orders of magnitude discrepancy between their theoretical bounds and the realized total error.
Given this context, an order of magnitude discrepancy between the bound in (3.30) and the observed total error is quite satisfactory.
\begin{figure}
\includegraphics[width=1.0\linewidth]{./2022_12_07_6eb6e893739c47f2dcc9g-19}
\caption{Training and total errors for different numbers of neurons.}
\end{figure}
Finally, we study the behavior of the error as the number of neurons is increased. Given our theoretical considerations, where the relevant error estimates were shown for tanh neural networks with two hidden layers, we restrict ourselves to this setting by only varying the network width and keeping the number of hidden layers fixed at two.
We again train the PINN and XPINN networks 20 times with different parameter initializations.
The learning rate has the same value as in the previous numerical experiments and the number of quadrature points is fixed at $64 \mathrm{~K}$.
The resulting training and total errors are presented in Figure 5 and show that the total error decreases with the number of neurons in each layer until it saturates. Moreover, the computable upper bound (3.30) is also depicted and we see from this figure that the bound (3.30) follows the same decaying trend, until saturation, as the total and training errors in this particular example.
\section{Discussion}
Physics informed neural networks have been very successful in the numerical approximation of the solutions of forward as well as inverse problems for various classes of PDEs. However, there is a significant paucity of theoretical results on the resulting errors. Following the framework of a recent paper [7], we revisit the key theoretical questions Q1 (on the smallness of the PDE residual in the class of neural networks), Q2 (a small residual implying a small total error) and Q3 (small training errors implying small total errors for a sufficient number of quadrature points), raised in the introduction. We have answered these questions affirmatively for the incompressible Navier-Stokes equations in this paper. The incompressible Navier-Stokes equations constitute a very important example of nonlinear PDEs, and PINNs have already been used to approximate them before [16], but without much theoretical justification.
Summarizing our theoretical results, we have shown in this paper that
\begin{itemize}
\item For sufficiently smooth (Sobolev regular) initial data, there exist neural networks, with the tanh activation function and with two hidden layers, such that the resulting PDE residuals can be made arbitrarily small. Moreover, in Theorem 3.1 we obtain very precise quantitative estimates on the sizes of the resulting neural networks, in terms of the regularity of the underlying classical solution. The proof of this approximation result relies heavily on the smoothness of the solutions of the Navier-Stokes equations and on the approximation of smooth functions by neural networks in sufficiently high Sobolev norms.
\item In Theorem 3.3, we show that the total $L^{2}$-error of the PINN (and XPINN) approximations is bounded by the PDE residuals for the incompressible Navier-Stokes equations. Moreover, the underlying constants in the bound are clearly quantified in terms of the underlying classical solution as well as the approximating neural networks. This result leverages the stability (or rather coercivity) of classical solutions of the Navier-Stokes equations. Thus, we answer question Q2 affirmatively by showing that a small PDE residual implies a small total error.
\item In Theorem 3.9, we answer question Q3 by proving a bound (3.30) on the total error in terms of the training error and the number of quadrature points. Thus, if one reaches a global minimum of the underlying optimization problem (2.11), one can show that the training error, and consequently the total error, can be made as small as desired if a sufficient number of quadrature points is considered.
\end{itemize}
Taken together, the above theorems constitute the first comprehensive theoretical analysis of PINNs (and XPINNs) for a prototypical nonlinear PDE, the Navier-Stokes equations. We also illustrate the bounds in a simple numerical experiment, demonstrating a qualitative as well as quantitative agreement between the rigorous bounds and the empirical results.
Given this account of the strengths of our results, it is also fair to point out possible limitations and highlight avenues for future investigation. These include
\begin{itemize}
\item Our estimates do not bound the training error $\mathcal{E}_{T}$, except under the assumption that one finds a global minimum for the optimization problem (2.11). In practice, it is well known that (stochastic) gradient descent algorithms converge to local minima. In such cases, there is no guarantee on the smallness of the training error. Thus, one needs to find new techniques to estimate training errors for PINNs. On the other hand, our and other numerical results, see [16] for instance, indicate that the training error can be made small. Then a bound like (3.30) clearly indicates that the overall error will be small. This is indeed borne out in numerical experiments (see Figure 5).
\item Our estimates rely heavily on the regularity of the underlying solutions of the Navier-Stokes equations (2.1). There are two caveats in this context. First, in two space dimensions, one knows that the underlying solution will be sufficiently regular if the corresponding initial data is regular enough [42]. However, in three space dimensions, such results are part of the millennium prize problems and are incredibly hard to obtain. On the more practical level, it is clear from Theorems 3.3 and 3.9 that the errors will grow if the $C^{1}$-norms of the underlying exact solutions are large. This is clearly the case, particularly in three space dimensions, where solution gradients grow as vortices are stretched. Hence, one can expect the PINN errors to grow too, and this is indeed seen in practice (see Section 5 of [29] for instance). However, traditional numerical methods such as finite element methods and spectral viscosity methods also suffer from the same issue and it is not expected to be different for PINNs. In this context, it would be interesting to investigate whether approaches based on weak formulations of PDE residuals might lead to better estimates and numerical results.
\end{itemize}
Finally, only the forward problem is considered here. It would be interesting to extend the theoretical tools and bounds in the paper to inverse problems for the Navier-Stokes equations (see [36]) as well as to physics informed operator learning [44].
\section{References}
[1] S. Arora, R. Ge, B. Neyshabur, and Y. Zhang. Stronger generalization bounds for deep nets via a compression approach. In Proceedings of the 35th International Conference on Machine Learning, ICML, volume 80 of Proceedings of Machine Learning Research, pages 254-263, 2018.
[2] G. Bai, U. Koley, S. Mishra, and R. Molinaro. Physics informed neural networks (PINNs) for approximating nonlinear dispersive PDEs. arXiv preprint arXiv:2104.05584, 2021.
[3] A. Biswas, J. Tian, and S. Ulusoy. Error estimates for deep learning methods in fluid dynamics. arXiv preprint arXiv:2008.02844v1, 2020.
[4] R. H. Byrd, P. Lu, J. Nocedal, and C. Zhu. A limited memory algorithm for bound constrained optimization. SIAM Journal on scientific computing, 16(5):1190-1208, 1995.
[5] T. Chen and H. Chen. Universal approximation to nonlinear operators by neural networks with arbitrary activation functions and its application to dynamical systems. IEEE Transactions on Neural Networks, 6(4):911-917, 1995.
[6] T. De Ryck, S. Lanthaler, and S. Mishra. On the approximation of functions by tanh neural networks. Neural Networks, 143:732-750, 2021.
[7] T. De Ryck and S. Mishra. Error analysis for physics informed neural networks (PINNs) approximating Kolmogorov PDEs. Preprint, available from arXiv:2106.14473, 2021.
[8] M. Dissanayake and N. Phan-Thien. Neural-network-based approximations for solving partial differential equations. Communications in Numerical Methods in Engineering, 1994.
[9] W. E, J. Han, and A. Jentzen. Deep learning-based numerical methods for high-dimensional parabolic partial differential equations and backward stochastic differential equations. Communications in Mathematics and Statistics, 5(4):349-380, 2017.
[10] R. Hiptmair and C. Schwab. Numerical Methods for Elliptic and Parabolic Boundary Value Problems. ETH Zürich, 2008.
[11] Z. Hu, A. D. Jagtap, G. E. Karniadakis, and K. Kawaguchi. When do extended physics-informed neural networks (XPINNs) improve generalization? arXiv preprint arXiv:2109.09444, 2021.
[12] A. D. Jagtap and G. E. Karniadakis. Extended physics-informed neural networks (XPINNs): A generalized space-time domain decomposition based deep learning framework for nonlinear partial differential equations. Communications in Computational Physics, 28(5):2002-2041, 2020.
[13] A. D. Jagtap, E. Kharazmi, and G. E. Karniadakis. Conservative physics-informed neural networks on discrete domains for conservation laws: Applications to forward and inverse problems. Computer Methods in Applied Mechanics and Engineering, 365:113028, 2020.
[14] A. D. Jagtap, Z. Mao, N. Adams, and G. E. Karniadakis. Physics-informed neural networks for inverse problems in supersonic flows. arXiv preprint arXiv:2202.11821, 2022.
[15] A. D. Jagtap, D. Mitsotakis, and G. E. Karniadakis. Deep learning of inverse water waves problems using multi-fidelity data: Application to Serre-Green-Naghdi equations. Ocean Engineering, 248:110775, 2022.
[16] X. Jin, S. Cai, H. Li, and G. E. Karniadakis. NSFnets (Navier-Stokes flow nets): Physics-informed neural networks for the incompressible Navier-Stokes equations. Journal of Computational Physics, 426:109951, 2021.
[17] N. Kovachki, S. Lanthaler, and S. Mishra. On universal approximation and error bounds for Fourier Neural Operators. Journal of Machine Learning Research, 22:1-76, 2021.
[18] G. Kutyniok, P. Petersen, M. Raslan, and R. Schneider. A theoretical analysis of deep neural networks and parametric PDEs. Constructive Approximation, pages 1-53, 2021.
[19] I. E. Lagaris, A. Likas, and D. G. Papageorgiou. Neural-network methods for boundary value problems with irregular boundaries. IEEE Transactions on Neural Networks, 11:1041-1049, 2000.
[20] I. E. Lagaris, A. Likas, and D. I. Fotiadis. Artificial neural networks for solving ordinary and partial differential equations. IEEE Transactions on Neural Networks, 9(5):987-1000, 1998.
[21] S. Lanthaler, S. Mishra, and G. E. Karniadakis. Error estimates for DeepONets: A deep learning framework in infinite dimensions. Transactions of Mathematics and Its Applications, 6(1), 2022.
[22] Y. LeCun, Y. Bengio, and G. Hinton. Deep learning. Nature, 521(7553):436-444, 2015.
[23] Z. Li, N. Kovachki, K. Azizzadenesheli, B. Liu, K. Bhattacharya, A. Stuart, and A. Anandkumar. Fourier neural operator for parametric partial differential equations, 2020.
[24] L. Lu, P. Jin, and G. E. Karniadakis. DeepONet: Learning nonlinear operators for identifying differential equations based on the universal approximation theorem of operators. arXiv preprint arXiv:1910.03193, 2019.
[25] K. O. Lye, S. Mishra, and D. Ray. Deep learning observables in computational fluid dynamics. Journal of Computational Physics, page 109339, 2020.
[26] K. O. Lye, S. Mishra, D. Ray, and P. Chandrashekar. Iterative surrogate model optimization (ISMO): An active learning algorithm for PDE constrained optimization with deep neural networks. Computer Methods in Applied Mechanics and Engineering, 374:113575, 2021.
[27] A. J. Majda, A. L. Bertozzi, and A. Ogawa. Vorticity and Incompressible Flow. Cambridge Texts in Applied Mathematics. Appl. Mech. Rev., 55(4):B77-B78, 2002.
[28] Z. Mao, A. D. Jagtap, and G. E. Karniadakis. Physics-informed neural networks for high-speed flows. Computer Methods in Applied Mechanics and Engineering, 360:112789, 2020.
[29] S. Mishra and R. Molinaro. Estimates on the generalization error of physics informed neural networks (PINNs) for approximating PDEs. arXiv preprint arXiv:2006.16144, 2020.
[30] S. Mishra and R. Molinaro. Estimates on the generalization error of physics-informed neural networks for approximating a class of inverse problems for PDEs. IMA Journal of Numerical Analysis, 2021.
[31] S. Mishra and R. Molinaro. Physics informed neural networks for simulating radiative transfer. Journal of Quantitative Spectroscopy and Radiative Transfer, 270:107705, 2021.
[32] S. Mishra and T. K. Rusch. Enhancing accuracy of deep learning algorithms by training with low-discrepancy sequences. SIAM Journal on Numerical Analysis, 59(3):1811-1834, 2021.
[33] G. Pang, L. Lu, and G. E. Karniadakis. fPINNs: Fractional physics-informed neural networks. SIAM Journal on Scientific Computing, 41:A2603-A2626, 2019.
[34] M. Raissi and G. E. Karniadakis. Hidden physics models: Machine learning of nonlinear partial differential equations. Journal of Computational Physics, 357:125-141, 2018.
[35] M. Raissi, P. Perdikaris, and G. E. Karniadakis. Physics-informed neural networks: A deep learning framework for solving forward and inverse problems involving nonlinear partial differential equations. Journal of Computational Physics, 378:686-707, 2019.
[36] M. Raissi, A. Yazdani, and G. E. Karniadakis. Hidden fluid mechanics: A Navier-Stokes informed deep learning framework for assimilating flow visualization data. arXiv preprint arXiv:1808.04327, 2018.
[37] C. Schwab and J. Zech. Deep learning in high dimension: Neural network expression rates for generalized polynomial chaos expansions in UQ. Analysis and Applications, 17(01):19-55, 2019.
[38] Y. Shin, J. Darbon, and G. E. Karniadakis. On the convergence and generalization of physics informed neural networks. arXiv preprint arXiv:2004.01806, 2020.
[39] Y. Shin, Z. Zhang, and G. E. Karniadakis. Error estimates of residual minimization using neural networks for linear equations. arXiv preprint arXiv:2010.08019, 2020.
[40] K. Shukla, A. D. Jagtap, J. L. Blackshire, D. Sparkman, and G. E. Karniadakis. A physics-informed neural network for quantifying the microstructural properties of polycrystalline nickel using ultrasound data: A promising approach for solving inverse problems. IEEE Signal Processing Magazine, 39(1):68-77, 2021.
[41] K. Shukla, A. D. Jagtap, and G. E. Karniadakis. Parallel physics-informed neural networks via domain decomposition. Journal of Computational Physics, 447:110683, 2021.
[42] R. Temam. Navier-Stokes equations: theory and numerical analysis, volume 343. American Mathematical Soc., 2001.
[43] R. Verfürth. A note on polynomial approximation in Sobolev spaces. ESAIM: Mathematical Modelling and Numerical Analysis, 33(4):715-719, 1999.
[44] S. Wang and P. Perdikaris. Long-time integration of parametric evolution equations with physics-informed DeepONets. Preprint, available from arXiv:2106.05384, 2021.
[45] L. Yang, X. Meng, and G. E. Karniadakis. B-PINNs: Bayesian physics-informed neural networks for forward and inverse PDE problems with noisy data. Journal of Computational Physics, 425:109913, 2021.
\section{Appendix A. Notation and auxiliary results}
This section provides an overview of the notation used in the paper and recalls some basic results on Sobolev spaces.
A.1. Multi-index notation. For $d \in \mathbb{N}$, we call a $d$-tuple of non-negative integers $\alpha \in \mathbb{N}_{0}^{d}$ a multi-index. We write $|\alpha|=\sum_{i=1}^{d} \alpha_{i}, \alpha !=\prod_{i=1}^{d} \alpha_{i} !$ and, for $x \in \mathbb{R}^{d}$, we denote by $x^{\alpha}=\prod_{i=1}^{d} x_{i}^{\alpha_{i}}$ the corresponding multinomial. Given two multi-indices $\alpha, \beta \in \mathbb{N}_{0}^{d}$, we say that $\alpha \leq \beta$ if, and only if, $\alpha_{i} \leq \beta_{i}$ for all $i=1, \ldots, d$. For a multi-index $\alpha$, we define the following multinomial coefficient
$$
\left(\begin{array}{c}
|\alpha| \\
\alpha
\end{array}\right)=\frac{|\alpha| !}{\alpha !},
$$
and, given $\alpha \leq \beta$, we define a corresponding multinomial coefficient by
$$
\left(\begin{array}{l}
\beta \\
\alpha
\end{array}\right)=\prod_{i=1}^{d}\left(\begin{array}{l}
\beta_{i} \\
\alpha_{i}
\end{array}\right)=\frac{\beta !}{\alpha !(\beta-\alpha) !} .
$$
For $\Omega \subseteq \mathbb{R}^{d}$ and a function $f: \Omega \rightarrow \mathbb{R}$ we denote by
$$
D^{\alpha} f=\frac{\partial^{|\alpha|} f}{\partial x_{1}^{\alpha_{1}} \cdots \partial x_{d}^{\alpha_{d}}}
$$
the classical or distributional (i.e. weak) derivative of $f$.
We will also encounter the set $P_{n, d}=\left\{\alpha \in \mathbb{N}_{0}^{d}:|\alpha|=n\right\}$, for which it holds that $\left|P_{n, d}\right|=\left(\begin{array}{c}n+d-1 \\ n\end{array}\right)$.
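For example, for $d=2$ and $n=2$ we have $P_{2,2}=\{(2,0),(1,1),(0,2)\}$, so that $\left|P_{2,2}\right|=3=\left(\begin{array}{c}3 \\ 2\end{array}\right)$.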
A.2. Sobolev spaces. Let $d \in \mathbb{N}, k \in \mathbb{N}_{0}, 1 \leq p \leq \infty$ and let $\Omega \subseteq \mathbb{R}^{d}$ be open. We denote by $L^{p}(\Omega)$ the usual Lebesgue space and we define the Sobolev space $W^{k, p}(\Omega)$ as
$$
W^{k, p}(\Omega)=\left\{f \in L^{p}(\Omega): D^{\alpha} f \in L^{p}(\Omega) \text { for all } \alpha \in \mathbb{N}_{0}^{d} \text { with }|\alpha| \leq k\right\}
$$
For $p<\infty$, we define the following seminorms on $W^{k, p}(\Omega)$,
$$
|f|_{W^{m, p}(\Omega)}=\left(\sum_{|\alpha|=m}\left\|D^{\alpha} f\right\|_{L^{p}(\Omega)}^{p}\right)^{1 / p} \quad \text { for } m=0, \ldots, k,
$$
and for $p=\infty$ we define
$$
|f|_{W^{m, \infty}(\Omega)}=\max _{|\alpha|=m}\left\|D^{\alpha} f\right\|_{L^{\infty}(\Omega)} \quad \text { for } m=0, \ldots, k .
$$
Based on these seminorms, we can define the following norm for $p<\infty$,
$$
\|f\|_{W^{k, p}(\Omega)}=\left(\sum_{m=0}^{k}|f|_{W^{m, p}(\Omega)}^{p}\right)^{1 / p}
$$
and for $p=\infty$ we define the norm
$$
\|f\|_{W^{k, \infty}(\Omega)}=\max _{0 \leq m \leq k}|f|_{W^{m, \infty}(\Omega)} .
$$
The space $W^{k, p}(\Omega)$ equipped with the norm $\|\cdot\|_{W^{k, p}(\Omega)}$ is a Banach space.
We denote by $C^{k}(\Omega)$ the space of functions that are $k$ times continuously differentiable and equip this space with the norm $\|f\|_{C^{k}(\Omega)}=\|f\|_{W^{k, \infty}(\Omega)}$.
We define the Hilbertian Sobolev spaces for $k \in \mathbb{N}_{0}$ as $H^{k}(\Omega)=W^{k, 2}(\Omega)$ with corresponding norms $\|\cdot\|_{H^{k}(\Omega)}=\|\cdot\|_{W^{k, 2}(\Omega)}$ and seminorms $|\cdot|_{H^{m}(\Omega)}=|\cdot|_{W^{m, 2}(\Omega)}$ for integers $m$ with $0 \leq m \leq k$. If $k$ is large enough, the space $H^{k}(\Omega)$ is a Banach algebra. We also recall a version of the Sobolev embedding theorem and a multiplicative trace inequality.
Lemma A.1. For $d, k \in \mathbb{N}$ with $k>\frac{d}{2}, H^{k}(\Omega)$ is a Banach algebra i.e., there exists $c_{k}>0$ such that
$$
\forall u, v \in H^{k}(\Omega):\|u v\|_{H^{k}(\Omega)} \leq c_{k}\|u\|_{H^{k}(\Omega)}\|v\|_{H^{k}(\Omega)} .
$$
Lemma A.2. Let $d \in \mathbb{N}, k, \ell \in \mathbb{N}_{0}$ with $k>\ell+\frac{d}{2}$ and $\Omega \subset \mathbb{R}^{d}$ an open set. Every function $f \in H^{k}(\Omega)$ has a continuous representative belonging to $C^{\ell}(\Omega)$.
Lemma A.3 (Multiplicative trace inequality, e.g. Theorem 3.10.1 in [10]). Let $d \geq 2, \Omega \subset \mathbb{R}^{d}$ be a Lipschitz domain and let $\gamma_{0}: H^{1}(\Omega) \rightarrow L^{2}(\partial \Omega):\left.u \mapsto u\right|_{\partial \Omega}$ be the trace operator. Denote by $h_{\Omega}$ the diameter of $\Omega$ and by $\rho_{\Omega}$ the radius of the largest d-dimensional ball that can be inscribed into $\Omega$. Then it holds that
$$
\left\|\gamma_{0} u\right\|_{L^{2}(\partial \Omega)} \leq \sqrt{\frac{2 \max \left\{2 h_{\Omega}, d\right\}}{\rho_{\Omega}}}\|u\|_{H^{1}(\Omega)}
$$
Next, we recall the Bramble-Hilbert lemma, which quantifies the accuracy of polynomial approximations of functions in Sobolev spaces. We present a variant of the Bramble-Hilbert lemma for Hilbertian Sobolev spaces proven in [43].
Lemma A.4. Let $\Omega$ be a bounded convex open domain in $\mathbb{R}^{d}, d \geq 2$, with diameter $h$. For every $f \in H^{m}(\Omega)$ there exists a polynomial $p$ of degree at most $m-1$ such that for all $0 \leq j \leq m-1$ it holds that
$$
|f-p|_{H^{j}(\Omega)} \leq c_{m, j} h^{m-j}|f|_{H^{m}(\Omega)}
$$
where
$$
c_{m, j}=\pi^{j-m}\left(\begin{array}{c}
d+j-1 \\
j
\end{array}\right)^{1 / 2} \frac{((m-j) !)^{1 / 2}}{\left(\left[\frac{m-j}{d}\right] !\right)^{d / 2}} .
$$
We proceed by stating a corollary of the general Leibniz rule for Sobolev regular functions.
Lemma A.5. Let $d \in \mathbb{N}, k \in \mathbb{N}_{0}, \Omega \subset \mathbb{R}^{d}$ and $f \in H^{k}(\Omega)$ and $g \in W^{k, \infty}(\Omega)$. Then it holds that
$$
\|f g\|_{H^{k}} \leq 2^{k}\|f\|_{H^{k}}\|g\|_{W^{k, \infty}} .
$$
Finally, we present a result on the Sobolev norm of the composition of two $n$ times continuously differentiable functions [6, Lemma A.7].
Lemma A.6. Let $d, m, n \in \mathbb{N}, \Omega_{1} \subset \mathbb{R}^{d}, \Omega_{2} \subset \mathbb{R}^{m}, f \in C^{n}\left(\Omega_{1} ; \Omega_{2}\right)$ and $g \in$ $C^{n}\left(\Omega_{2} ; \mathbb{R}\right)$. Then it holds that
$$
\|g \circ f\|_{W^{n, \infty}\left(\Omega_{1}\right)} \leq 16\left(e^{2} n^{4} m d^{2}\right)^{n}\|g\|_{W^{n, \infty}\left(\Omega_{2}\right)} \max _{1 \leq i \leq m}\left\|(f)_{i}\right\|_{W^{n, \infty}\left(\Omega_{1}\right)}^{n} .
$$
\section{Appendix B. Function approximation by tanh neural networks}
In this section, we prove that for every $f \in H^{m}(\Omega)$, $m \geq 3$, and every $\epsilon>0$ there exists a tanh neural network $\hat{f}$ with two hidden layers such that $\|f-\hat{f}\|_{H^{2}(\Omega)} \leq \epsilon$. In particular, the width of $\hat{f}$ in terms of $\epsilon$ will be calculated explicitly. We will prove this result following the approach of [6]. First, we divide the domain $\Omega$ into cubes of edge length $1 / N$, with $N \in \mathbb{N}$ large enough. On each of these cubes, $f$ can be approximated in Sobolev norm by a polynomial, by virtue of the Bramble-Hilbert lemma. A global approximation can then be constructed by multiplying each polynomial with the indicator function of the corresponding cube and summing over all cubes. We then prove that replacing these polynomials, multiplications and indicator functions with suitable tanh neural networks results in a new approximation with approximately the same accuracy. We first list some auxiliary results.
Lemma B.1 (Approximation of multivariate monomials, Corollary 3.6 in [6]). Let $d, s \in \mathbb{N}, k \in \mathbb{N}_{0}$ and $M>0$. Then for every $\epsilon>0$, there exists a shallow tanh neural network $\Phi_{s, d}:[-M, M]^{d} \rightarrow \mathbb{R}^{\left|P_{s, d+1}\right|}$ of width $3\left\lceil\frac{s+1}{2}\right\rceil\left|P_{s, d+1}\right|$ such that
$$
\max _{\beta \in P_{s, d+1}}\left\|x^{\beta}-\left(\Phi_{s, d}(x)\right)_{\iota(\beta)}\right\|_{W^{2, \infty}\left([-M, M]^{d}\right)} \leq \epsilon
$$
where $\iota: P_{s, d+1} \rightarrow\left\{1, \ldots,\left|P_{s, d+1}\right|\right\}$ is a bijection. Furthermore, the weights of the network scale as $O\left(\epsilon^{-s / 2}\right)$ for small $\epsilon$.
Lemma B.2 (Shallow approximation of multiplication of $d$ numbers, Corollary 3.7 in [6]). Let $d \in \mathbb{N}, k \in \mathbb{N}_{0}$ and $M>0$. Then for every $\epsilon>0$, there exists a shallow tanh neural network $\widehat{x}_{d}^{\epsilon}:[-M, M]^{d} \rightarrow \mathbb{R}$ of width $3\left\lceil\frac{d+1}{2}\right\rceil\left|P_{d, d}\right|$ such that
$$
\left\|\widehat{x}_{d}^{\epsilon}(x)-\prod_{i=1}^{d} x_{i}\right\|_{W^{k, \infty}} \leq \epsilon
$$
Furthermore, the weights of the network scale as $O\left(\epsilon^{-d / 2}\right)$ for small $\epsilon$.
Lemma B.3. It holds that $\max \left\{|\sigma(x)|,\left|\sigma^{\prime}(x)\right|,\left|\sigma^{\prime \prime}(x)\right|\right\} \leq 1$ for all $x \in \mathbb{R}$.
Next, we summarize the construction of an approximate partition of unity of a domain $\Omega=\prod_{i=1}^{d}\left[0, b_{i}\right]$, as in [6, Section 4]. We divide the domain into cubes of edge length $1 / N$ and denote the corresponding index set by
$$
\mathcal{N}^{N}=\left\{j \in \mathbb{N}^{d}: j_{i} \leq N b_{i} \text { for all } 1 \leq i \leq d\right\}
$$
We can then define the cubes for every $j \in \mathcal{N}^{N}$ as,
$$
I_{j}^{N}=\prod_{i=1}^{d}\left(\left(j_{i}-1\right) / N, j_{i} / N\right)
$$
Observe that $\left|\sigma^{\prime}\right|$ and $\left|\sigma^{\prime \prime}\right|$ are monotonically decreasing on $[1, \infty)$. Given $\epsilon>0$, we first find an $\alpha=\alpha(N, \epsilon)$ large enough such that
$$
\alpha / N \geq 1, \quad 1-\sigma(\alpha / N) \leq \epsilon, \quad \alpha^{m}\left|\sigma^{(m)}(\alpha / N)\right| \leq \epsilon \text { for } m=1,2
$$
A suitable choice of $\alpha$ is given by the following lemma.
Lemma B.4. The conditions stated in (B.5) for $0<\epsilon<1$ are satisfied if
$$
\alpha=N \ln \left(\frac{4 N^{2}}{e^{2} \epsilon}\right)
$$
Proof. This is an adaptation of Lemma A.5 in [6] for $k=2$. The proof is as in [6], except that one can use Lemma B.3 instead of [6, Lemma A.4].
\\
For $y \in \mathbb{R}$, we then define
$$
\begin{aligned}
& \rho_{1}^{N}(y)=\frac{1}{2}-\frac{1}{2} \sigma\left(\alpha\left(y-\frac{1}{N}\right)\right) \\
& \rho_{j}^{N}(y)=\frac{1}{2} \sigma\left(\alpha\left(y-\frac{j-1}{N}\right)\right)-\frac{1}{2} \sigma\left(\alpha\left(y-\frac{j}{N}\right)\right) \quad \text { for } 2 \leq j \leq N-1 \\
& \rho_{N}^{N}(y)=\frac{1}{2} \sigma\left(\alpha\left(y-\frac{N-1}{N}\right)\right)+\frac{1}{2}
\end{aligned}
$$
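As a sanity check of this construction (an illustration added here, with $\sigma$ the hyperbolic tangent and $\alpha$ as in Lemma B.4, on the unit interval), note that the one-dimensional functions $\rho_{j}^{N}$ telescope so that they sum to one exactly, while each $\rho_{j}^{N}$ is close to one on its own interval and close to zero away from its neighbours; the following Python sketch verifies this numerically.
\begin{verbatim}
import numpy as np

def rho(j, y, N, eps):
    """The 1D functions rho_j^N above, with sigma = tanh and alpha from Lemma B.4."""
    alpha = N * np.log(4.0 * N ** 2 / (np.e ** 2 * eps))
    s = lambda z: np.tanh(alpha * (y - z / N))
    if j == 1:
        return 0.5 - 0.5 * s(1)
    if j == N:
        return 0.5 * s(N - 1) + 0.5
    return 0.5 * s(j - 1) - 0.5 * s(j)

N, eps = 8, 1e-3
y = np.linspace(0.0, 1.0, 1001)
print(np.max(np.abs(sum(rho(j, y, N, eps) for j in range(1, N + 1)) - 1.0)))  # ~ 0
print(rho(3, np.array([2.5 / N]), N, eps))  # close to 1 at the midpoint of I_3^N
print(rho(3, np.array([6.5 / N]), N, eps))  # close to 0 far away from I_3^N
\end{verbatim}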
Finally, we define for $D \leq d$ the functions
$$
\Phi_{j}^{N, D}(x)=\prod_{i=1}^{D} \rho_{j_{i}}^{N_{i}}\left(x_{i}\right)
$$
and the sets $\mathcal{V}_{D}=\left\{v \in \mathbb{Z}^{d}: \max _{1 \leq i \leq D}\left|v_{i}\right| \leq 1\right.$ and $\left.v_{D+1}=\cdots=v_{d}=0\right\}$. The functions $\Phi_{j}^{N, d}$ approximate a partition of unity in the sense that for every $j$ it holds on $I_{j}^{N}$ that,
$$
\sum_{v \in \mathcal{V}_{d}} \Phi_{j+v}^{N, d} \approx 1 \text { and } \sum_{\substack{v \notin \mathcal{V}_{d}, j+v \in\{1, \ldots, N\}^{d}}} \Phi_{j+v}^{N, d} \approx 0
$$
This is made exact in the following lemmas.
Lemma B.5 (Lemma $4.1$ in [6]). If $k \in \mathbb{N}_{0}$ and $0<\epsilon<1 / 4$, then