\section{Approximation for Products of Two Real Numbers}
We will build up the tools necessary to approximate $e^x$ via neural networks in the framework described in the previous sections. While much of the foundation comes from, e.g., \cite{grohs2019spacetime}, we will, along the way, encounter neural networks not seen in the literature, such as the $\tay$, $\pwr$, $\tun$, and finally a neural network approximant for $e^x$. For each of these neural networks, we will be concerned with at least the following:
\begin{enumerate}[label = (\roman*)]
\item whether their instantiations under the ReLU activation function are continuous,
\item whether their depths are bounded at most polynomially in the prescribed accuracy $\ve$,
\item whether their parameter counts are bounded at most polynomially in the prescribed accuracy $\ve$, and
\item the accuracy of the resulting approximations.
\end{enumerate}
\subsection{The squares of real numbers in $\lb0,1\rb$}
\begin{lemma}\label{lem:mathfrak_i}
Let $d\in\N$ and let $\mathfrak{i}_d\in\neu$ be given by $\mathfrak{i}_d=\lp\lp\mathbb{I}_d,\mymathbb{0}_d\rp,\lp\mathbb{I}_d,\mymathbb{0}_d\rp\rp$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\mathfrak{i}_d\rp\in C \lp\R^d, \R^d\rp$.
\item$\lay\lp\mathfrak{i}_d\rp=\lp d,d,d\rp$
\item$\param\lp\mathfrak{i}_d\rp=2d^2+2d$
\end{enumerate}
\end{lemma}
\begin{proof}
Item (i) is straightforward from the fact that for all $d \in\N$ it is the case that $\real_{\rect}\lp\mathfrak{i}_d\rp=\mathbb{I}_d\lp\real_{\rect}\lp\lb\mathbb{I}_d\rb_*\rp+\mymathbb{0}_d\rp+\mymathbb{0}_d$. Item (ii) is straightforward from the fact that $\mathbb{I}_d \in\R^{d \times d}$. Item (iii) follows by direct counting.
\end{proof}
\begin{lemma}\label{lem:6.1.1}\label{lem:phi_k}
Let $\lp c_k \rp_{k \in\N}\subseteq\R$, $\lp A_k \rp_{k \in\N}\subseteq\R^{4\times4}$, $B\in\R^{4\times1}$, and $\lp C_k \rp_{k\in\N}\subseteq\R^{1\times4}$ satisfy for all $k \in\N$ that:
\begin{align}\label{(6.0.1)}
A_k = \begin{bmatrix}
2 & -4 &2 & 0 \\
2 & -4 & 2 & 0\\
2 & -4 & 2 & 0\\
-c_k & 2c_k & -c_k & 1
\end{bmatrix}\quad B=\begin{bmatrix}
0 \\ -\frac{1}{2}\\ -1 \\ 0
\end{bmatrix}\quad C_k = \begin{bmatrix}
-c_k & 2c_k &-c_k & 1
\end{bmatrix}
\end{align}
and that:
\begin{align}
c_k = 2^{1-2k}
\end{align}
Let $\Phi_k \in\neu$, $k\in\N$, satisfy that for all $d \in\N$, $\mathfrak{i}_d =\lp\lp\mathbb{I}_d, \mymathbb{0}_d \rp, \lp\mathbb{I}_d, \mymathbb{0}_d \rp\rp$, that $\Phi_1=\lp\aff_{C_1,0}\bullet\mathfrak{i}_4\rp\bullet\aff_{\mymathbb{e}_4,B}$, and that for all $k \in\lb2,\infty\rp\cap\N$:
\begin{align}
\Phi_k = \lp\aff_{C_k,0}\bullet\mathfrak{i}_4 \rp\bullet\lp\aff_{A_{k-1},B}\bullet\mathfrak{i}_4 \rp\bullet\cdots\bullet\lp\aff_{A_1,B}\bullet\mathfrak{i}_4 \rp\bullet\aff_{\mymathbb{e}_4,B}
\end{align}
It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item for all $k \in\N$ we have $\real_{\rect}\lp\Phi_k\rp\in C \lp\R, \R\rp$
\item for all $k \in\N$ we have $\lay\lp\Phi_k \rp=\lp1,4,4,...,4,1\rp\in\N^{k+2}$
\item for all $k \in\N$, $x \in\R\setminus\lb0,1\rb$ that $\lp\real_{\rect}\lp\Phi_k \rp\rp\lp x \rp=\rect\lp x \rp$
\item for all $k \in\N$, $x \in\lb0,1\rb$, we have $\left| x^2-\lp\real_{\rect}\lp\Phi_k \rp\rp\lp x \rp\right| \les2^{-2k-2}$, and
\item for all $k \in\N$, we have that $\param\lp\Phi_k \rp=20k-7$
\end{enumerate}
\end{lemma}
\begin{proof}
Let $g_k: \R\rightarrow\lb0,1\rb$, $k \in\N$, be the functions which satisfy for all $k \in\N$, $x \in\R$ that:
\begin{align}\label{(6.0.3)}
g_1 \lp x \rp&= \begin{cases}
2x & : x \in\lb 0,\frac{1}{2}\rp\\
2-2x &: x\in\lb\frac{1}{2},1\rb\\
0 &: x \in\R\setminus\lb 0,1 \rb
\end{cases}\\
g_{k+1}&= g_1(g_{k}) \nonumber
\end{align}
and let $f_k: \lb0,1\rb\rightarrow\lb0,1\rb$, $k \in\N_0$, be the functions satisfying for all $k \in\N_0$, $n \in\{0,1,...,2^k-1\}$, $x \in\lb\frac{n}{2^k}, \frac{n+1}{2^k}\rp$ that $f_k(1)=1$ and:
\begin{align}\label{(6.0.4)}
f_k\lp x\rp = \lp\frac{2n+1}{2^k}\rp x-\frac{n^2+n}{2^{2k}}
\end{align}
so that, in particular, it holds for all $k\in\N_0$, $x\in\lb0,1\rb$ that:
\begin{align}\label{(6.0.4.2)}
f_k\lp x\rp = x-\lb\sum^{k}_{j=1}2^{-2j}g_j\lp x\rp\rb
\end{align}
and let $r_k =\lp r_{1,k},r_{2,k},r_{3,k},r_{4,k}\rp: \R\rightarrow\R^4$, $k \in\N$, be the functions which satisfy for all $x \in\R$, $k \in\N$ that:
\begin{align}\label{(6.0.5)}
r_1\lp x\rp = \rect\lp\mymathbb{e}_4x+B\rp\quad\text{and}\quad r_{k+1}\lp x\rp=\rect\lp A_kr_k\lp x\rp+B\rp
\end{align}
where $\rect$ is applied component-wise.
Note that since it is the case for all $x \in\R$ that $\rect(x)=\max\{x,0\}$, (\ref{(6.0.3)}) and (\ref{(6.0.5)}) show that it holds for all $x \in\R$ that:
\begin{align}\label{6.0.6}
2r_{1,1}\lp x\rp-4r_{2,1}\lp x\rp+2r_{3,1}\lp x\rp = 2\rect\lp x\rp-4\rect\lp x-\tfrac{1}{2}\rp+2\rect\lp x-1\rp = g_1\lp x\rp
\end{align}
Note also that (\ref{(6.0.4.2)}), combined with the fact that for all $x\in\lb0,1\rb$ it holds that $f_0(x)= x =\max\{x,0\}$, tells us that for all $x \in\R$:
\begin{align}\label{6.0.7}
r_{4,1}(x) = \max\{x,0\} = \begin{cases}
f_0(x) & :x\in [0,1] \\
\max\{x,0\}& :x \in\R\setminus\lb 0,1\rb
\end{cases}
\end{align}
We next claim that for all $k \in\N$, it is the case that:
\begin{align}\label{6.0.8}
\lp\forall x \in\R : 2r_{1,k}(x)-4r_{2,k}(x) + 2r_{3,k}(x) =g_k(x) \rp
\end{align}
and that:
\begin{align}\label{6.0.9}
\lp\forall x \in\R: r_{4,k} (x) = \begin{cases}
f_{k-1}(x) & :x \in\lb 0,1 \rb\\
\max\{x,0\}& : x \in\R\setminus\lb 0,1\rb
\end{cases}\rp
\end{align}
We prove (\ref{6.0.8}) and (\ref{6.0.9}) by induction. The base case of $k=1$ is proved by (\ref{6.0.6}) and (\ref{6.0.7}). For the induction step $\N\ni k \rightarrow k+1$, assume that there exists a $k \in\N$ such that for all $x \in\R$ it is the case that:
\begin{align}
2r_{1,k}(x) - 4r_{2,k}(x) + 2r_{3,k}(x) = g_k(x)
\end{align}
and:
\begin{align}\label{6.0.11}
r_{4,k}(x) = \begin{cases}
f_{k-1}(x) & : x \in [0,1] \\
\max\{x,0\}&: x \in\R\setminus\lb 0,1 \rb
\end{cases}
\end{align}
Note that (\ref{(6.0.3)}), (\ref{(6.0.5)}), and (\ref{6.0.6}) then tell us that for all $x \in\R$ it is the case that:
\begin{align}\label{6.0.12}
g_{k+1}\lp x \rp&= g_1(g_k(x)) = g_1\lp 2r_{1,k}(x)-4r_{2,k}(x) + 2r_{3,k}(x)\rp \nonumber\\
&= 2\rect\lp g_k\lp x\rp\rp-4\rect\lp g_k\lp x\rp-\tfrac{1}{2}\rp+2\rect\lp g_k\lp x\rp-1\rp\nonumber\\
&= 2r_{1,k+1}\lp x\rp-4r_{2,k+1}\lp x\rp+2r_{3,k+1}\lp x\rp
\end{align}
In addition, note that (\ref{(6.0.4.2)}), (\ref{(6.0.5)}), and (\ref{6.0.7}) tell us that for all $x \in\R$:
\begin{align}\label{6.0.13}
r_{4,k+1}(x) &= \rect\lp\lp -2 \rp^{3-2 \lp k+1 \rp}r_{1,k}\lp x \rp + 2^{4-2 \lp k+1 \rp}r_{2,k}\lp x \rp + \lp -2 \rp^{3-2\lp k+1\rp}r_{3,k}\lp x \rp + r_{4,k}\lp x\rp\rp\nonumber\\
&= \rect\lp\lp -2 \rp^{1-2k}r_{1,k}\lp x \rp + 2^{2-2k}r_{2,k}\lp x \rp + \lp -2 \rp^{1-2k}r_{3,k}\lp x \rp + r_{4,k}\lp x \rp\rp\nonumber\\
&=\rect\lp 2^{-2k}\lb -2r_{1,k}\lp x \rp + 2^2r_{2,k}\lp x \rp -2r_{3,k}\lp x \rp\rb +r_{4,k}\lp x \rp\rp\nonumber\\
&= \rect\lp - \lb 2^{-2k}\rb\lb 2r_{1,k}\lp x \rp -4r_{2,k}\lp x \rp +2r_{3,k}\lp x \rp\rb +r_{4,k}\lp x \rp\rp\nonumber\\
&= \rect\lp -\lb 2^{-2k}\rb g_k \lp x \rp +r_{4,k}\lp x \rp\rp
\end{align}
This, the fact that for all $x\in\R$ it is the case that $\rect\lp x \rp=\max\{x,0\}$, the fact that for all $x\in\lb0 ,1\rb$ it is the case that $f_k \lp x \rp\ges0$, and (\ref{6.0.11}) show that for all $x \in\lb0,1\rb$ it holds that:
\begin{align}\label{6.0.14}
r_{4,k+1}\lp x \rp&= \rect\lp -\lb 2^{-2k}\rb g_k\lp x\rp + f_{k-1}\lp x \rp\rp = \rect\lp -\lp 2^{-2k}g_k \lp x \rp\rp +x-\lb\sum^{k-1}_{j=1}\lp 2^{-2j}g_j \lp x \rp\rp\rb\rp\nonumber\\
&= \rect\lp x - \lb\sum^k_{j=1}2^{-2j}g_j \lp x \rp\rb\rp = \rect\lp f_k \lp x \rp\rp =f_k \lp x \rp
\end{align}
Note next that (\ref{6.0.11}) and (\ref{6.0.13}) then tell us that for all $x\in\R\setminus\lb0,1\rb$:
\begin{align}
r_{4,k+1}\lp x \rp = \max\left\{ -\lp 2^{-2k}g_k \lp x \rp\rp + r_{4,k}\lp x \rp,0\right\} = \max\{\max\{x,0\},0\} = \max\{x,0\}
\end{align}
Combining (\ref{6.0.12}), (\ref{6.0.14}), and the preceding display completes the induction, proving (\ref{6.0.8}) and (\ref{6.0.9}). Note then that (\ref{(6.0.1)}) and (\ref{6.0.8}) assure that for all $k\in\N$, $x\in\R$ it holds that $\real_{\rect}\lp\Phi_k \rp\in C \lp\R,\R\rp$ and that:
\begin{align}\label{(6.0.17)}
&\lp\real_{\rect}\lp\Phi_k \rp\rp\lp x \rp\nonumber\\
&= \lp\real_{\rect}\lp\lp\aff_{C_k,0}\bullet\mathfrak{i}_4 \rp\bullet\lp\aff_{A_{k-1},B}\bullet\mathfrak{i}_4 \rp\bullet\cdots\bullet\lp\aff_{A_1,B}\bullet\mathfrak{i}_4 \rp\bullet\aff_{\mymathbb{e}_4,B}\rp\rp\lp x \rp\nonumber\\
&= \lp -2\rp^{1-2k}r_{1,k}\lp x \rp + 2^{2-2k} r_{2,k}\lp x \rp + \lp -2 \rp^{1-2k} r_{3,k}\lp x \rp + r_{4,k}\lp x \rp\nonumber\\
&=\lp -2 \rp^{2-2k}\lp\lb\frac{r_{1,k}\lp x \rp +r_{3,k}\lp x \rp}{-2}\rb + r_{2,k}\lp x \rp\rp +r_{4,k}\lp x \rp\nonumber\\
&=2^{2-2k}\lp\lb\frac{r_{1,k}\lp x \rp+r_{3,k}\lp x \rp}{-2}\rb + r_{2,k}\lp x \rp\rp +r_{4,k}\lp x \rp\nonumber\\
&=2^{-2k}\lp 4r_{2,k}\lp x \rp -2r_{1,k}\lp x \rp -2r_{3,k}\lp x \rp\rp +r_{4,k}\lp x \rp\nonumber\\
&=-\lb 2^{-2k}\rb\lb 2r_{1,k}\lp x \rp -4r_{2,k}\lp x \rp +2r_{3,k}\lp x \rp\rb +r_{4,k}\lp x \rp = -\lb 2^{-2k}\rb g_k \lp x \rp + r_{4,k}\lp x \rp
\end{align}
This and (\ref{6.0.9}) tell us for all $k\in\N$ and $x\in\lb0,1\rb$ that:
\begin{align}
\lp\real_{\rect}\lp\Phi_k \rp\rp\lp x \rp = - \lp 2^{-2k}g_k \lp x \rp\rp +f_{k-1}\lp x \rp&= -\lp 2^{-2k}g_k \lp x \rp\rp +x-\lb\sum^{k-1}_{j=1} 2^{-2j}g_j \lp x \rp\rb\nonumber\\
&=x-\lb\sum^k_{j=1}2^{-2j}g_j \lp x \rp\rb =f_k\lp x\rp\nonumber
\end{align}
This, since $f_k$ is the piecewise linear interpolant of $x^2$ on a grid of mesh size $2^{-k}$ and the linear interpolation error for $x^2$ on an interval of length $h$ is at most $\frac{h^2}{4}$, then implies for all $k\in\N$, $x \in\lb0,1\rb$ that it holds that:
\begin{align}
\left| x^2-\lp\real_{\rect}\lp\Phi_k \rp\rp\lp x \rp\right|\les 2^{-2k-2}
\end{align}
This, in turn, establishes Items (i) and (iv).
Finally observe that (\ref{(6.0.17)}) then tells us that for all $k\in\N$, $x \in\R\setminus\lb0,1\rb$ it holds that:
\begin{align}
\lp\real_{\rect}\lp\Phi_k \rp\rp\lp x \rp = -2^{-2k}g_k \lp x \rp +r_{4,k}\lp x \rp =r_{4,k}\lp x \rp = \max\{x,0\} = \rect(x)
\end{align}
This establishes Item (iii). Note next that Item (ii) ensures for all $k\in\N$ that $\dep\lp\Phi_k \rp= k+1$ and that:
\begin{align}
\param\lp\Phi_k\rp = \lp4\cdot1+4\rp+\lp k-1\rp\lp4\cdot4+4\rp+\lp1\cdot4+1\rp = 20k-7
\end{align}
This establishes Item (v) and completes the proof.
\end{proof}
For $\ve\in\lp0,\infty\rp$ and $M =\min\lp\N\cap\lb\frac{1}{2}\log_2\lp\ve^{-1}\rp-1,\infty\rp\rp$ we may then define the neural network $\Phi\in\neu$ by:
\begin{align}\label{def:Phi}
\Phi = \begin{cases}
\lp\aff_{C_1,0}\bullet\mathfrak{i}_4\rp\bullet\aff_{\mymathbb{e}_4,B} &: M=1\\
\lb\aff_{C_M,0}\bullet\mathfrak{i}_4\rb\bullet\lb\aff_{A_{M-1},B}\bullet\mathfrak{i}_4 \rb\bullet\cdots\bullet\lb\aff_{A_1,B}\bullet\mathfrak{i}_4\rb\bullet\aff_{\mymathbb{e}_4,B} &: M \in\lb 2,\infty\rp\cap\N
\end{cases}
\end{align}
Items (i)--(iii) are direct consequences of Lemma \ref{lem:6.1.1}, Items (i)--(iii). Note next that the fact that $M =\min\lp\N\cap\lb\frac{1}{2}\log_2\lp\ve^{-1}\rp-1,\infty\rp\rp$ ensures that:
\begin{align}
M = \min\lp\N\cap\lb\frac{1}{2}\log_2\lp\ve^{-1}\rp-1,\infty\rp\rp\ges\max\left\{1,\frac{1}{2}\log_2 \lp\ve^{-1}\rp-1\right\}\ges\frac{1}{2}\log_2 \lp\ve^{-1}\rp-1
\end{align}
This and Item (iv) of Lemma \ref{lem:6.1.1} demonstrate that for all $x\in\lb0,1\rb$ it then holds that:
\begin{align}
\left|x^2-\lp\real_{\rect}\lp\Phi\rp\rp\lp x\rp\right| \les 2^{-2M-2}\les 2^{-\log_2\lp\ve^{-1}\rp} = \ve
\end{align}
Thus establishing Item (iv). The fact that $M =\min\lp\N\cap\lb\frac{1}{2}\log_2\lp\ve^{-1}\rp-1,\infty\rp\rp$ and Item (ii) of Lemma \ref{lem:6.1.1} tell us that:
\begin{align}
\dep\lp\Phi\rp = M+1 \les\max\left\{\frac{1}{2}\log_2\lp\ve^{-1}\rp+1,2\right\}
\end{align}
\begin{figure}[h]
\centering
\caption{Contour plot of the $L^1$ difference between $\Phi$ and $x^2$ over $\lb0,1\rb$ for different values of $\ve$.}
\end{figure}
\begin{remark}
Note that (\ref{def:Phi}) implies that $\dep\lp\Phi\rp\ges4$.
\end{remark}
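Before moving on, it may help to see the construction numerically. The following minimal \texttt{R} sketch (ours, and independent of the listings referenced later in this chapter; the names \texttt{g1} and \texttt{phi} are illustrative) evaluates the realization $\real_{\rect}\lp\Phi_M\rp$ through the sawtooth functions, i.e. as $\rect\lp x\rp-\sum^M_{j=1}2^{-2j}g_j\lp x\rp$, and checks the bound of Item (iv) of Lemma \ref{lem:6.1.1}.
\begin{lstlisting}[language=R]
# Sketch: realize Phi_M as relu(x) - sum_j 2^(-2j) g_j(x) and check
# the error bound |x^2 - f_M(x)| <= 2^(-2M-2) on [0,1].
relu <- function(x) pmax(x, 0)
g1 <- function(x) ifelse(x < 0 | x > 1, 0, ifelse(x < 1/2, 2 * x, 2 - 2 * x))

phi <- function(x, M) {
  g <- g1(x)                      # g_1
  s <- 2^(-2) * g
  if (M > 1) for (j in 2:M) {
    g <- g1(g)                    # g_j = g_1 composed with g_{j-1}
    s <- s + 2^(-2 * j) * g
  }
  relu(x) - s
}

x <- seq(0, 1, length.out = 10001)
max(abs(x^2 - phi(x, M = 4))) <= 2^(-2 * 4 - 2)   # expected: TRUE
phi(-2, M = 4) == relu(-2)                        # exact ReLU off [0,1]: TRUE
\end{lstlisting}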
Now that we have neural networks that perform the squaring operation inside $\lb0,1\rb$, we may extend to all of $\R$. Note that this neural network representation differs somewhat from the ones in \cite{grohs2019spacetime}.
\subsection{The $\sqr$ network}
\begin{lemma}\label{6.0.3}\label{lem:sqr_network}
Let $\delta,\ve,\alpha\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$, $\Phi\in\neu$ satisfy that $\delta=2^{\frac{-2}{q-2}}\ve^{\frac{q}{q-2}}$, $\alpha=\lp\frac{\ve}{2}\rp^{\frac{1}{q-2}}$, $\real_{\rect}\lp\Phi\rp\in C\lp\R,\R\rp$, $\dep(\Phi)\les\max\left\{\frac{1}{2}\log_2(\delta^{-1})+1,2\right\}$, $\param(\Phi)\les\max\left\{10\log_2\lp\delta^{-1}\rp-7,13\right\}$, $\sup_{x \in\R\setminus\lb0,1\rb}\left| \lp\real_{\rect}\lp\Phi\rp\rp\lp x\rp-\rect(x)\right| =0$, and $\sup_{x\in\lb0,1\rb} |x^2-\lp\real_{\rect}\lp\Phi\rp\rp\lp x\rp | \les\delta$, and let $\Psi\in\neu$ be the neural network given by:
\begin{align}
\Psi = \lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{\alpha,0}\rp\oplus\lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{-\alpha,0}\rp
\end{align}
It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item it holds that $\real_{\rect}\lp\Psi\rp\in C \lp\R,\R\rp$.
\item it holds that $\lp\real_{\rect}\lp\Psi\rp\rp\lp0\rp=0$
\item it holds for all $x\in\R$ that $0\les\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\les\ve+ |x|^2$
\item it holds for all $x \in\R$ that $|x^2-\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp |\les\ve\max\{1,|x|^q\}$
\item it holds that $\dep(\Psi)\les\max\left\{1+\frac{1}{q-2}+\frac{q}{2(q-2)}\log_2\lp\ve^{-1}\rp,2\right\}$, and
\item it holds that $\param\lp\Psi\rp\les\max\left\{\lb\frac{40q}{q-2}\rb\log_2\lp\ve^{-1}\rp+\frac{80}{q-2}-28,52\right\}$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that for all $x\in\R$ it is the case that:
\begin{align}\label{6.0.21}
\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp&= \lp\real_{\rect}\lp\lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{\alpha,0}\rp\oplus\lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{-\alpha,0}\rp\rp\rp\lp x \rp\nonumber\\
&= \alpha^{-2}\lp\lb\real_{\rect}\lp\Phi\rp\rb\lp\alpha x\rp + \lb\real_{\rect}\lp\Phi\rp\rb\lp-\alpha x\rp\rp
\end{align}
This, the assumption that $\real_{\rect}\lp\Phi\rp\in C\lp\R, \R\rp$, and the assumption that $\sup_{x\in\R\setminus\lb0,1\rb} | \lp\real_{\rect}\lp\Phi\rp\rp\lp x \rp-\rect\lp x\rp | =0$ tell us that $\real_{\rect}\lp\Psi\rp\in C\lp\R,\R\rp$ and that:
\begin{align}
\lp\real_{\rect}\lp\Psi\rp\rp\lp0\rp = \alpha^{-2}\lp\lb\real_{\rect}\lp\Phi\rp\rb\lp0\rp+\lb\real_{\rect}\lp\Phi\rp\rb\lp0\rp\rp = 2\alpha^{-2}\rect\lp0\rp = 0
\end{align}
This, in turn, establishes Items (i)--(ii). Observe next that the assumption that $\real_{\rect}\lp\Phi\rp\in C\lp\R,\R\rp$ and the assumption that $\sup_{x\in\R\setminus\lb0,1\rb} | \lp\real_{\rect}\lp\Phi\rp\rp\lp x \rp-\rect(x) |=0$ ensure that for all $x\in\R\setminus\lb-1,1\rb$ it holds that:
\begin{align}\label{6.0.23}
\lb\real_{\rect}\lp\Phi\rp\rb\lp x\rp+\lb\real_{\rect}\lp\Phi\rp\rb\lp-x\rp = \rect\lp x\rp+\rect\lp-x\rp = \left|x\right|
\end{align}
The assumption that $\sup_{x\in\R\setminus\lb0,1\rb}|\lp\real_{\rect}\lp\Phi\rp\rp\lp x\rp-\rect\lp x\rp |=0$ and the assumption that $\sup_{x\in\lb0,1\rb} |x^2-\lp\real_{\rect}\lp\Phi\rp\rp\lp x\rp |\les\delta$ show that:
\begin{align}\label{6.0.24}
&\sup_{x \in\lb -1,1\rb}\left|x^2 - \lp\lb\real_{\rect}\lp\Phi\rp\rb\lp x\rp +\lb\real_{\rect}\lp\Phi\rp\rb\lp -x \rp\rp\right| \nonumber\\
&\les\max\left\{\sup_{x\in\lb0,1\rb}\left|x^2-\lb\real_{\rect}\lp\Phi\rp\rb\lp x\rp\right|,\ \sup_{x\in\lb-1,0\rb}\left|x^2-\lb\real_{\rect}\lp\Phi\rp\rb\lp -x\rp\right|\right\}\les\delta
\end{align}
Next observe that (\ref{6.0.21}) and (\ref{6.0.23}) show that for all $x \in\R\setminus\lb-\lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}, \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\rb$ it holds that:
\begin{align}\label{6.0.25}
0 \les\lb\real_{\rect}\lp\Psi\rp\rb\lp x \rp&= \lp\frac{\ve}{2}\rp^{\frac{-2}{q-2}}\lp\lb\real_{\rect}\lp\Phi\rp\rb\lp\lp\frac{\ve}{2}\rp^{\frac{1}{q-2}}x \rp + \lb\real_{\rect}\lp\Phi\rp\rb\lp -\lp\frac{\ve}{2}\rp^{\frac{1}{q-2}} x\rp\rp\nonumber\\
&= \lp\frac{\ve}{2}\rp^{\frac{-2}{q-2}}\lp\frac{\ve}{2}\rp^{\frac{1}{q-2}}\left|x\right| = \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\left|x\right|
\end{align}
The triangle inequality then tells us that for all $x\in\R\setminus\lb-\lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}, \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\rb$ it holds that:
\begin{align}\label{6.0.25.2}
\left| x^2- \lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\right| &= \left| x^2 - \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\left|x\right| \right| \les\lp\left|x \right|^2 + \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\left| x \right| \rp\nonumber\\
&= \lp\left| x \right|^q \left|x\right|^{-(q-2)} + \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\left| x \right|^q\left| x \right|^{-(q-1)}\rp\nonumber\\
&\les\lp\left| x \right|^q \lp\frac{\ve}{2}\rp^{\frac{q-2}{q-2}} + \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\left| x \right|^q \lp\frac{\ve}{2}\rp^{\frac{q-1}{q-2}}\rp\nonumber\\
&= \lp\frac{\ve}{2}+ \frac{\ve}{2}\rp\left| x \right|^q = \ve\left| x \right|^q \les\ve\max\left\{ 1, \left| x \right|^q \right\}
\end{align}
Note that (\ref{6.0.24}), (\ref{6.0.21}), and the fact that $\delta=2^{\frac{-2}{q-2}}\ve^{\frac{q}{q-2}}$ then tell us that for all $x \in\lb-\lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}, \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\rb$ it holds that:
\begin{align}\label{6.0.26}
\left|x^2-\lp\real_{\rect}\lp\Psi\rp\rp\lp x\rp\right| = \lp\frac{\ve}{2}\rp^{\frac{-2}{q-2}}\left|\lp\alpha x\rp^2-\lp\lb\real_{\rect}\lp\Phi\rp\rb\lp\alpha x\rp+\lb\real_{\rect}\lp\Phi\rp\rb\lp-\alpha x\rp\rp\right| \les\alpha^{-2}\delta = \ve
\end{align}
Now note that this and (\ref{6.0.25.2}) tell us that for all $x\in\R$ it is the case that:
\begin{align}
\left| x^2-\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\right| \les\ve\max\{1,|x|^q \}
\end{align}
This establishes Item (iv). Note that (\ref{6.0.26}) tells us that for all $x \in\lb-\lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}, \lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\rb$ it is the case that:
\begin{align}
\left| \lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\right| \les\left| x^2 - \lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\right| + \left| x \right|^2 \les\ve + \left| x \right| ^2
\end{align}
This, (\ref{6.0.25}), and the fact that $\lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}\left|x\right|\les\left|x\right|^2$ whenever $\left|x\right|\ges\lp\frac{\ve}{2}\rp^{\frac{-1}{q-2}}$ tell us that for all $x\in\R$:
\begin{align}
\left| \lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\right| \les\ve + |x|^2
\end{align}
This establishes Item (iii).
Note next that Corollary \ref{affcor}, Remark \ref{5.3.2}, the hypothesis, and the fact that $\delta=2^{\frac{-2}{q-2}}\ve^{\frac{q}{q-2}}$ tell us that:
\begin{align}
\dep\lp\Psi\rp = \dep\lp\Phi\rp\les\max\left\{\frac{1}{2}\log_2\lp\delta^{-1}\rp+1,2\right\} = \max\left\{1+\frac{1}{q-2}+\frac{q}{2\lp q-2\rp}\log_2\lp\ve^{-1}\rp,2\right\}
\end{align}
This establishes Item (v).
By symmetry note also that $\param\lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{\alpha,0}\rp=\param\lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{-\alpha,0}\rp$ and also that $\lay\lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{\alpha,0}\rp=\lay\lp\aff_{\alpha^{-2},0}\bullet\Phi\bullet\aff_{-\alpha,0}\rp$. Thus Lemma \ref{paramsum}, Corollary \ref{cor:sameparal}, and the hypothesis tell us that:
\begin{align}
\param\lp\Psi\rp\les 4\param\lp\Phi\rp\les 4\max\left\{10\log_2\lp\delta^{-1}\rp-7,13\right\} = \max\left\{\lb\frac{40q}{q-2}\rb\log_2\lp\ve^{-1}\rp+\frac{80}{q-2}-28,52\right\}
\end{align}
This establishes Item (vi) and completes the proof.
\end{proof}
\begin{figure}[h]
\centering
\caption{Left: $\log_{10}$ of depths for a simulation with $q \in\lb2.1, 4\rb$, $\ve\in\lp0.1, 2\rb$, and $x \in\lb-5,5\rb$, all with $50$ mesh-points. Right: the theoretical upper limits over the same range of values.}
\end{figure}
\begin{table}[h]
\begin{tabular}{@{}l|llllll@{}}
\toprule
& Min. & 1\textsuperscript{st} Qu. & Median & Mean & 3\textsuperscript{rd} Qu. & Max. \\
\bottomrule
\end{tabular}
\caption{Theoretical upper bounds for $L^1$ error, experimental $L^1$ error and their forward difference, with $q \in\lb2.1, 4\rb$, $\ve\in\lp0.1, 2\rb$, and $x \in\lb-5,5\rb$, all with $50$ mesh-points.}
\end{table}
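To make the scaling argument concrete, here is a small \texttt{R} sketch (ours; it reuses \texttt{phi} from the earlier sketch, and the choice of $M$ inside is an assumption modeled on the corollary above) of the realization $\lp\real_{\rect}\lp\Psi\rp\rp\lp x\rp=\alpha^{-2}\lp\lb\real_{\rect}\lp\Phi\rp\rb\lp\alpha x\rp+\lb\real_{\rect}\lp\Phi\rp\rb\lp-\alpha x\rp\rp$ with $\alpha=\lp\frac{\ve}{2}\rp^{\frac{1}{q-2}}$.
\begin{lstlisting}[language=R]
# Sketch: the Sqr scaling trick. Compress R into [-1,1] by alpha, square
# there with Phi, and undo the scaling with alpha^(-2).
sqr_inst <- function(q, eps) {
  a <- (eps / 2)^(1 / (q - 2))
  delta <- 2^(-2 / (q - 2)) * eps^(q / (q - 2))
  M <- max(1, ceiling(0.5 * log2(1 / delta) - 1))  # so that 2^(-2M-2) <= delta
  function(x) a^(-2) * (phi(a * x, M) + phi(-a * x, M))
}

sq <- sqr_inst(q = 2.5, eps = 0.01)
x <- seq(-5, 5, length.out = 2001)
max(abs(x^2 - sq(x)) / pmax(1, abs(x)^2.5)) <= 0.01  # error-bound check: TRUE
\end{lstlisting}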
\subsection{The $\prd$ network}
We are finally ready to give neural network representations of arbitrary products of real numbers. However, this representation differs somewhat from those found in the literature, especially \cite{grohs2019spacetime}, where parallelization (stacking) is used instead of neural network sums. This will help us calculate $\wid_1$ and the width of the second to last layer.
\begin{lemma}\label{prd_network}
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$, $A_1,A_2,A_3\in\R^{1\times2}$, $\Psi\in\neu$ satisfy for all $x\in\R$ that $\delta=\ve\lp2^{q-1}+1\rp^{-1}$, $A_1=\lb1\quad1\rb$, $A_2=\lb1\quad0\rb$, $A_3=\lb0\quad1\rb$, $\real_{\rect}\lp\Psi\rp\in C\lp\R, \R\rp$, $\lp\real_{\rect}\lp\Psi\rp\rp\lp0\rp=0$, $0\les\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\les\delta+|x|^2$, $|x^2-\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp |\les\delta\max\{1,|x|^q\}$, $\dep\lp\Psi\rp\les\max\{1+\frac{1}{q-2}+\frac{q}{2(q-2)}\log_2\lp\delta^{-1}\rp ,2\}$, and $\param\lp\Psi\rp\les\max\left\{\lb\frac{40q}{q-2}\rb\log_2\lp\delta^{-1}\rp+\frac{80}{q-2}-28,52\right\}$, then:
\begin{enumerate}[label=(\roman*)]
\item there exists a unique $\Gamma\in\neu$ satisfying:
&=\frac{1}{2}\lp\real_{\rect}\lp\Psi\rp\rp\lp x+y \rp -\frac{1}{2}\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp - \frac{1}{2}\lp\real_{\rect}\lp\Psi\rp\rp\lp y \rp\label{6.0.33}
\end{align}
Note that this, the assumption that $\real_{\rect}\lp\Psi\rp\in C \lp\R, \R\rp$, and the assumption that $\lp\real_{\rect}\lp\Psi\rp\rp\lp0\rp=0$ ensure that:
\begin{align}
\real_{\rect}\lp\Gamma\rp\in C\lp\R^2,\R\rp\quad\text{and}\quad\lp\real_{\rect}\lp\Gamma\rp\rp\lp0,0\rp = 0
\end{align}
Next, observe that since by assumption it is the case for all $x,y\in\R$ that $|x^2-\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp | \les\delta\max\{1,|x|^q\}$ and $xy =\frac{1}{2}|x+y|^2-\frac{1}{2}|x|^2-\frac{1}{2}|y|^2$, the triangle inequality and (\ref{6.0.33}) give us that:
\begin{align}
&\left|xy-\lp\real_{\rect}\lp\Gamma\rp\rp\lp x,y\rp\right|\nonumber\\
&=\left|\frac{1}{2}\lb\lp\real_{\rect}\lp\Psi\rp\rp\lp x + y \rp - \left|x+y\right|^2 \rb - \frac{1}{2}\lb\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp -\left| x \right|^2\rb - \frac{1}{2}\lb\lp\real_{\rect}\lp\Psi\rp\rp\lp y \rp -\left|y\right|^2\rb\right| \nonumber\\
&\les\frac{1}{2}\left|\lp\real_{\rect}\lp\Psi\rp\rp\lp x + y \rp - \left|x+y\right|^2\right| + \frac{1}{2}\left|\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp -\left| x \right|^2\right| + \frac{1}{2}\left|\lp\real_{\rect}\lp\Psi\rp\rp\lp y \rp -\left|y\right|^2\right| \nonumber\\
&\les\frac{\delta}{2}\max\left\{1,\left|x+y\right|^q\right\}+\frac{\delta}{2}\max\left\{1,\left|x\right|^q\right\}+\frac{\delta}{2}\max\left\{1,\left|y\right|^q\right\}
\end{align}
Note also that since for all $\alpha,\beta\in\R$ and $p \in\lb1, \infty\rp$ we have that $|\alpha+\beta|^p \les2^{p-1}\lp |\alpha|^p + |\beta|^p \rp$ we have that:
\begin{align}
&\left| \lp\real_{\rect}\lp\Gamma\rp\rp\lp x,y \rp - xy \right| \nonumber\\
&\les\frac{\delta}{2}\lb 2^q + 2\rb\max\left\{1, \left|x\right|^q, \left| y \right|^q \right\} = \ve\max\left\{ 1,\left| x \right|^q, \left| y \right|^q\right\}\nonumber
\end{align}
This proves Item (iv).
By symmetry it holds that $\param\lp\frac{1}{2}\triangleright\lp\Psi\bullet\aff_{A_1,0}\rp\rp=\param\lp-\frac{1}{2}\triangleright\lp\Psi\bullet\aff_{A_2,0}\rp\rp=\param\lp-\frac{1}{2}\triangleright\lp\Psi\bullet\aff_{A_3,0}\rp\rp$ and further that $\lay\lp\frac{1}{2}\triangleright\lp\Psi\bullet\aff_{A_1,0}\rp\rp=\lay\lp-\frac{1}{2}\triangleright\lp\Psi\bullet\aff_{A_2,0}\rp\rp=\lay\lp-\frac{1}{2}\triangleright\lp\Psi\bullet\aff_{A_3,0}\rp\rp$.
Note also that Corollary \ref{affcor} tells us that for all $i \in\{1,2,3\}$ and $a \in\{\frac{1}{2},-\frac{1}{2}\}$ it is the case that:
\begin{align}
\param\lp a \triangleright\lp\Psi\bullet\aff_{A_i,0}\rp\rp = \param\lp\Psi\rp
\end{align}
This, together with Corollary \ref{corsum}, indicates that:
\begin{align}
\param\lp\Gamma\rp \les 9\param\lp\Psi\rp
\end{align}
Observe next that for all $q\in\lp2,\infty\rp$ and $\ve\in\lp0,\infty\rp$, $\Gamma$ consists of, among other things, three stacked $\lp\Psi\bullet\aff_{A_i,0}\rp$ networks, where $i \in\{1,2,3\}$. Corollary \ref{affcor} therefore tells us that $\wid_1\lp\Gamma\rp=3\cdot\wid_1\lp\Psi\rp$. On the other hand, note that each $\Psi$ network consists of, among other things, two stacked $\Phi$ networks, which by Corollary \ref{affcor} and Lemma \ref{lem:sqr_network} yields that $\wid_1\lp\Gamma\rp=6\cdot\wid_1\lp\Phi\rp$. Finally, from Corollary \ref{cor:phi_network} and Corollary \ref{affcor}, we see that the only thing contributing to $\wid_1\lp\Phi\rp$ is $\wid_1\lp\mathfrak{i}_4\rp$, which was established in Lemma \ref{lem:mathfrak_i} as $4$. Whence we get that $\wid_1\lp\Gamma\rp=6\cdot4=24$ and that $\wid_{\hid\lp\Gamma\rp}\lp\Gamma\rp=24$. This proves Items (vii)--(viii), completing the proof of the lemma.
\end{proof}
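At the level of realizations, the construction of $\Gamma$ is the polarization identity $xy=\frac{1}{2}\lp\lp x+y\rp^2-x^2-y^2\rp$ with each exact square replaced by an approximate one. A minimal \texttt{R} sketch (ours, reusing \texttt{sqr\_inst} from the previous sketch) follows.
\begin{lstlisting}[language=R]
# Sketch: Prd via polarization, xy = ((x+y)^2 - x^2 - y^2) / 2, with the
# squares delegated to sqr_inst() at the reduced accuracy delta.
prd_inst <- function(q, eps) {
  delta <- eps / (2^(q - 1) + 1)
  sq <- sqr_inst(q, delta)
  function(x, y) 0.5 * sq(x + y) - 0.5 * sq(x) - 0.5 * sq(y)
}

pr <- prd_inst(q = 2.5, eps = 0.01)
abs(pr(3, -2) - (3 * -2)) <= 0.01 * max(1, 3^2.5, 2^2.5)  # expected: TRUE
\end{lstlisting}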
\begin{corollary}\label{cor_prd}
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$, $A_1,A_2,A_3\in\R^{1\times2}$, $\Psi\in\neu$ satisfy for all $x\in\R$ that $\delta=\ve\lp2^{q-1}+1\rp^{-1}$, $A_1=\lb1\quad1\rb$, $A_2=\lb1\quad0\rb$, $A_3=\lb0\quad1\rb$, $\real_{\rect}\lp\Psi\rp\in C\lp\R, \R\rp$, $\lp\real_{\rect}\lp\Psi\rp\rp\lp0\rp=0$, $0\les\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp\les\delta+|x|^2$, $|x^2-\lp\real_{\rect}\lp\Psi\rp\rp\lp x \rp |\les\delta\max\{1,|x|^q\}$, $\dep\lp\Psi\rp\les\max\{1+\frac{1}{q-2}+\frac{q}{2(q-2)}\log_2\lp\delta^{-1}\rp ,2\}$, and $\param\lp\Psi\rp\les\max\left\{\lb\frac{40q}{q-2}\rb\log_2\lp\delta^{-1}\rp+\frac{80}{q-2}-28,52\right\}$, and finally let $\Gamma$ be defined as in Lemma \ref{prd_network}, i.e.:
Note that the triangle inequality, the fact that $\delta=\ve\lp2^{q-1}+1\rp^{-1}$, the fact that for all $x,y\in\R$ it is the case that $|x+y|^2\les2\lp |x|^2+|y|^2\rp$ and (\ref{6.0.33}) tell us that:
\begin{align}
\left| \real_{\rect}\lp\Gamma\rp\lp x,y\rp\right| &\les\frac{1}{2}\left| \real_{\rect}\lp\Psi\rp\lp x+y \rp\right| + \frac{1}{2}\left| \real_{\rect}\lp\Psi\rp\lp x \rp\right| + \frac{1}{2}\left| \real_{\rect}\lp\Psi\rp\lp y \rp\right| \nonumber\\
&\les\frac{1}{2}\lp\delta+\left|x+y\right|^2\rp+\frac{1}{2}\lp\delta+\left|x\right|^2\rp+\frac{1}{2}\lp\delta+\left|y\right|^2\rp\nonumber\\
&\les\frac{3\delta}{2}+\frac{3}{2}\left|x\right|^2+\frac{3}{2}\left|y\right|^2 \les\frac{3}{2}\lp\ve+\left|x\right|^2+\left|y\right|^2\rp
\end{align}
\end{proof}
\begin{figure}[h]
\centering
\caption{A neural network diagram of the $\sqr$ network.}
\end{figure}
\section{Higher Approximations}\label{sec_tun}
We take inspiration from the $\sm$ neural network to create the $\prd$ neural network. However, we first need to define a special neural network, called the \textit{tunneling neural network}, to effectively stack two neural networks of unequal depth.
\subsection{The $\tun$ Neural Networks and Their Properties}
\begin{definition}[R\textemdash,2023, The Tunneling Neural Networks]\label{def:tun}
We define the tunneling neural network, denoted as $\tun_n$ for $n\in\N$ by:
\begin{align}
\tun_n = \begin{cases}
\aff_{1,0}&:n= 1 \\
\id_1 &: n=2 \\
\bullet^{n-2}\id_1 &: n \in\N\cap\lb3,\infty\rp
\end{cases}
\end{align}
where $\id_1$ is as in Definition \ref{7.2.1}.
\end{definition}
\begin{remark}
For an \texttt{R} implementation see Listing \ref{Tun}
\end{remark}
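Independent of Listing \ref{Tun}, the following minimal \texttt{R} sketch (ours; the helper names \texttt{tun} and \texttt{inst} are illustrative) builds $\tun_n$ as an explicit list of weight--bias pairs and instantiates it with ReLU; note that the parameter count matches the $7+6\lp n-2\rp$ count established in the lemma below.
\begin{lstlisting}[language=R]
# Sketch: Tun_n as explicit (W, b) layers; Aff_{1,0} for n = 1, Id_1 for
# n = 2, and (n-2)-fold compositions of Id_1 for n >= 3.
relu <- function(x) pmax(x, 0)

tun <- function(n) {
  if (n == 1) return(list(list(W = matrix(1, 1, 1), b = 0)))
  inner <- matrix(c(1, -1, -1, 1), 2, 2)  # merged middle layer of Id_1 . Id_1
  layers <- list(list(W = matrix(c(1, -1), 2, 1), b = c(0, 0)))
  if (n > 2) for (k in 1:(n - 2))
    layers <- c(layers, list(list(W = inner, b = c(0, 0))))
  c(layers, list(list(W = matrix(c(1, -1), 1, 2), b = 0)))
}

inst <- function(net, x) {  # ReLU on all but the last layer
  for (i in seq_along(net)) {
    x <- net[[i]]$W %*% x + net[[i]]$b
    if (i < length(net)) x <- relu(x)
  }
  drop(x)
}

inst(tun(5), -3.7)                                          # -3.7: the identity
sum(sapply(tun(6), function(l) length(l$W) + length(l$b)))  # 31 = 7 + 6*(6-2)
\end{lstlisting}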
\begin{lemma}\label{6.2.2}\label{tun_1}
Let $n\in\N$ and let $\tun_n\in\neu$ be as in Definition \ref{def:tun}. It is then the case for all $x\in\R$ that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\tun_n \rp\in C \lp\R, \R\rp$
\item$\dep\lp\tun_n \rp=n$
\item$\lp\real_{\rect}\lp\tun_n \rp\rp\lp x \rp= x$
\item$\param\lp\tun_n\rp=\begin{cases} 2 &: n=1\\ 7+6\lp n-2\rp &: n\in\N\cap\lb2,\infty\rp\end{cases}$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that $\real_{\rect}\lp\aff_{1,0}\rp\in C \lp\R, \R\rp$ and by Lemma \ref{idprop} we have that $\real_{\rect}\lp\id_1\rp\in C\lp\R, \R\rp$. Finally, since the composition of continuous functions is continuous, $\real_{\rect}\lp\tun_n\rp \in C\lp\R, \R\rp$ for $n \in\N\cap\lb2,\infty\rp$. This proves Item (i).
Note that by Lemma \ref{5.3.2} it is the case that $\dep\lp\aff_{1,0}\rp=1$, and by Lemma \ref{idprop} it is the case that $\dep\lp\id_1\rp=2$. This proves Item (ii) for $n\in\{1,2\}$.
Assume now that for all $n \les N$ it is the case that $\dep\lp\tun_n \rp= n$; then for the inductive step, by Lemma \ref{comp_prop}, we have that:
\begin{align}
\dep\lp\tun_{N+1}\rp = \dep\lp\id_1\bullet\tun_N\rp = \dep\lp\id_1\rp+\dep\lp\tun_N\rp-1 = 2+N-1 = N+1
\end{align}
This proves Item (ii).
Now assume that for all $n\les N\in\N$ it is the case that $\param\lp\tun_n \rp=7+6(n-2)$. For the inductive step, we then have:
\begin{align}
\param\lp\tun_{N+1}\rp = \param\lp\id_1\bullet\tun_N\rp = \param\lp\tun_N\rp+6 = 7+6\lp N-1\rp
\end{align}
This proves Item (iv).
\end{proof}
We may extend the tunneling networks to $d$ dimensions by replacing $\id_1$ with $\id_d$, denoting the result $\tun^d_n$.
\begin{lemma}\label{tun_mult}
Let $d,n\in\N$ and $x\in\R^d$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item $\real_{\rect}\lp\tun^d_n\rp\in C\lp\R^d,\R^d\rp$
\item $\dep\lp\tun^d_n\rp=n$
\item $\lp\real_{\rect}\lp\tun^d_n\rp\rp\lp x\rp=x$
\item for $n\in\N\cap\lb2,\infty\rp$, $\param\lp\tun^d_n\rp=4d^2+3d+\lp n-2\rp\lp4d^2+2d\rp$, and
\item $\param\lp\tun^d_1\rp=d^2+d$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that Items (i)\textendash(iii) are consequences of Lemma \ref{idprop} and Lemma \ref{comp_prop}, respectively. Note now that by observation $\param\lp\tun^d_1\rp= d^2+d$. Next, Lemma \ref{id_param} tells us that $\param\lp\tun^d_2\rp=4d^2+3d$.
Note also that by definition of neural network composition, we have the following:
\begin{align}
\param\lp\tun^d_3\rp &= \param\lp\id_d\bullet\id_d\rp\nonumber\\
&=2d \times d + 2d + 2d\times 2d +2d+2d\times d + d \nonumber\\
&=2d^2+2d+4d^2+2d+2d^2 +d \nonumber\\
&= 8d^2+5d
\end{align}
Suppose now that for all naturals up to and including $n$, it is the case that $\param\lp\tun_n^d\rp=4d^2+3d +\lp n-2\rp\lp4d^2+2d\rp$. For the inductive step, we have the following:
\begin{align}
\param\lp\tun^d_{n+1}\rp = \param\lp\tun^d_n\rp+\lp4d^2+2d\rp = 4d^2+3d+\lp n-1\rp\lp4d^2+2d\rp
\end{align}
This proves Item (iv). Finally, Item (v) is a consequence of Lemma \ref{5.3.2}.
\end{proof}
\subsection{The $\pwr$ Neural Networks and Their Properties}
\begin{definition}[R\textemdash, 2023, The Power Neural Network]\label{def:pwr}
Let $n\in\N_0$. Let $\delta,\ve\in\lp0,\infty\rp$ and $q\in\lp2,\infty\rp$ satisfy that $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. We define the power neural networks $\pwr_n^{q,\ve}\in\neu$, for $n\in\N_0$, as:
\begin{align}
\pwr_n^{q,\ve} = \begin{cases}
\aff_{0,1} &: n=0\\
\prd^{q,\ve}\bullet\lb\tun_{\dep\lp\pwr^{q,\ve}_{n-1}\rp}\boxminus\pwr^{q,\ve}_{n-1}\rb\bullet\cpy_{2,1} &: n\in\N
\end{cases}
\end{align}
\begin{figure}[h]
\centering
\caption{A representation of a typical $\pwr^{q,\ve}_n$ network.}
\end{figure}
\begin{remark}
For an \texttt{R} implementation see Listing \ref{Pwr}
\end{remark}
\begin{remark}
Note that for all $i \in\N$, $q\in\lp2,\infty\rp$, $\ve\in\lp0, \infty\rp$, each $\pwr_i^{q,\ve}$ differs from $\pwr_{i+1}^{q,\ve}$ by at least one $\prd^{q,\ve}$ network.
\end{remark}
\end{definition}
\begin{lemma}\label{6.2.4}
Let $\ve\in\lp0,\infty\rp$ and $q \in\lp2,\infty\rp$. It is then the case for all $x,y \in\R$ that:
Note that since any instance of $\mathfrak{p}_i$ contains an instance of $\mathfrak{p}_{i-1}$ for $i \in\N\cap\lb2,\infty\rp$, we have that $\mathfrak{p}_n \in\mathcal{O}\lp\ve^{2(n-1)}\rp$.
\end{remark}
\begin{lemma}\label{param_pwr_geq_param_tun}
For all $n \in\N$, $q\in\lp2,\infty\rp$, and $\ve\in\lp0,\infty\rp$, it is the case that $\param\lp\tun_{\dep\lp\pwr^{q,\ve}_n\rp}\rp\les\param\lp\pwr^{q,\ve}_n\rp$.
\end{lemma}
\begin{proof}
Note that for all $n \in\N$ it is straightforwardly the case that $\param\lp\pwr_n^{q,\ve}\rp\ges\param\lp\tun_{\dep\lp\pwr^{q,\ve}_{n-1}\rp}\rp$ because for all $n\in\N$, a $\pwr^{q,\ve}_n$ network contains a $\tun_{\dep\lp\pwr^{q,\ve}_{n-1}\rp}$ network. Note now that for all $i \in\N$ we have from Lemma \ref{tun_1} that $5\les\param\lp\tun_{i+1}\rp-\param\lp\tun_i\rp\les6$. Recall from Corollary \ref{cor:phi_network} that every instance of the $\Phi$ network contains at least one $\mathfrak{i}_4$ network, which by Lemma \ref{lem:mathfrak_i} has $40$ parameters, whence the $\prd^{q,\ve}$ network has at least $40$ parameters for all $\ve\in\lp0,\infty\rp$ and $q \in\lp2,\infty\rp$. Note now that for all $i\in\N$, $\pwr^{q,\ve}_{i}$ and $\pwr^{q,\ve}_{i+1}$ differ by at least as many parameters as there are in $\prd^{q,\ve}$, since, indeed, they differ by at least one more $\prd^{q,\ve}$. Thus, for every increment in $i$, $\pwr_i^{q,\ve}$ outgrows $\tun_i$ by at least $40-6=34$ parameters, whence it is the case for all $i \in\N$ that $\param\lp\tun_i\rp\les\param\lp\pwr^{q,\ve}_i\rp$.
\end{proof}
\begin{lemma}\label{power_prop}
Let $\delta,\ve\in\lp0,\infty\rp$ and $q\in\lp2,\infty\rp$ satisfy $\delta=\ve\lp2^{q-1}+1\rp^{-1}$, and let $\pwr_n^{q,\ve} \in\neu$ be as in Definition \ref{def:pwr}. It is then the case for all $n \in\N_0$ and $x \in\R$ that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\pwr_n^{q,\ve}\rp\in C \lp\R, \R\rp$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that Item (ii) of Lemma \ref{5.3.2} ensures that $\real_{\rect}\lp\pwr_0^{q,\ve}\rp=\real_{\rect}\lp\aff_{0,1}\rp\in C \lp\R, \R\rp$. Note next that by Item (v) of Lemma \ref{comp_prop}, with $\Phi_1\curvearrowleft\nu_1, \Phi_2\curvearrowleft\nu_2, a \curvearrowleft\rect$, we have that:
\begin{align}
\lp\real_{\rect}\lp\nu_1 \bullet\nu_2 \rp\rp\lp x \rp = \lp\lp\real_{\rect}\lp\nu_1 \rp\rp\circ\lp\real_{\rect}\lp\nu_2 \rp\rp\rp\lp x \rp
\end{align}
This, together with the fact that the composition of continuous functions is continuous and the fact that the stacking of instantiated continuous neural networks is continuous, tells us that $\real_{\rect}\lp\pwr^{q,\ve}_n \rp\in C \lp\R, \R\rp$ for all $n \in\N$. This establishes Item (i).
Note next that by observation $\dep\lp\pwr_0^{q,\ve}\rp=1$ and by Item (iv) of Lemma \ref{idprop}, it is the case that $\dep\lp\id_1\rp=2$. By Lemmas \ref{dep_cpy} and \ref{depthofcomposition} it is also the case that: $\dep\lp\prd^{q,\ve}\bullet\lb\tun_{\dep(\pwr^{q,\ve}_{n-1})}\boxminus\pwr^{q,\ve}_{n-1}\rb\bullet\cpy\rp=\dep\lp\prd^{q,\ve}\bullet\lb\tun_{\dep(\pwr^{q,\ve}_{n-1})}\boxminus\pwr^{q,\ve}_{n-1}\rb\rp$. Note also that, since $\dep\lp\tun_k\rp=k$ for all $k\in\N$, we have that $\dep\lp\tun_{\dep\lp\pwr^{q,\ve}_{n-1}\rp}\boxminus\pwr^{q,\ve}_{n-1}\rp=\dep\lp\pwr^{q,\ve}_{n-1}\rp$.
This with Lemma \ref{comp_prop} then yields for $n \in\N$ that:
\begin{align}
\dep\lp\pwr^{q,\ve}_n\rp = \dep\lp\prd^{q,\ve}\rp+\dep\lp\pwr^{q,\ve}_{n-1}\rp-1
\end{align}
Note now that $\wid_1\lp\pwr^{q,\ve}_0\rp=\wid_1\lp\aff_{0,1}\rp=1$. Further, Lemma \ref{comp_prop} and Remark \ref{5.3.2} tell us that for all $i,k \in\N$ it is the case that $\wid_i \lp\tun_k\rp\les2$. Observe that since $\cpy_{2,1}$, $\pwr_0^{q,\ve}$, and $\tun_{\dep\lp\pwr_0^{q,\ve}\rp}$ are all affine neural networks, Lemma \ref{aff_effect_on_layer_architecture}, Corollary \ref{affcor}, and Lemma \ref{prd_network} tell us that:
\begin{align}
\wid_1\lp\pwr^{q,\ve}_1\rp = \wid_1\lp\prd^{q,\ve}\rp = 24
\end{align}
This completes the base case. For the inductive case, assume that for all $i$ up to and including $k\in\N$ it is the case that:
\begin{align}
\wid_1\lp\pwr_i^{q,\ve}\rp\les\begin{cases}
1 &: i=0\\
24+2\lp i-1\rp &: i\in\N
\end{cases}
\end{align}
For the inductive step, note that $\wid_1\lp\pwr^{q,\ve}_{k+1}\rp=\wid_1\lp\tun_{\dep\lp\pwr^{q,\ve}_{k}\rp}\rp+\wid_1\lp\pwr^{q,\ve}_{k}\rp\les2+24+2\lp k-1\rp=24+2k$, which completes the induction.
Note that Lemma \ref{paramofparallel}, Lemma \ref{param_pwr_geq_param_tun}, Corollary \ref{cor:sameparal}, Lemma \ref{lem:paramparal_geq_param_sum}, and Corollary \ref{cor:bigger_is_better} tell us that:
Note next that by definition, for all $q\in\lp2,\infty\rp$ and $\ve\in\lp0,\infty\rp$ it is the case that $\wid_{\hid\lp\pwr_0^{q,\ve}\rp}\lp\pwr_0^{q,\ve}\rp=\wid_{\hid\lp\aff_{0,1}\rp}\lp\aff_{0,1}\rp=1$. Now, by Lemma \ref{prd_network}, and by construction of $\pwr_i^{q,\ve}$, we may say that for $i\in\N$ it is the case that:
\begin{align}
\wid_{\hid\lp\pwr^{q,\ve}_i\rp}\lp\pwr^{q,\ve}_i\rp = \wid_{\hid\lp\prd^{q,\ve}\rp}\lp\prd^{q,\ve}\rp = 24
\end{align}
Next note that $\lp\real_{\rect}\lp\pwr^{q,\ve}_0\rp\rp\lp x \rp$ is exactly $1$, which implies that for all $x\in\R$ we have that $|x^0-\lp\real_{\rect}\lp\pwr^{q,\ve}_0\rp\rp\lp x \rp |=0$. Note also that the instantiations of $\tun_n$ and $\cpy_{2,1}$ are exact. Note next that since $\tun_n$ and $\cpy_{2,1}$ are exact, the only sources of error for $\pwr^{q,\ve}_n$ are the $n$ compounding applications of $\prd^{q,\ve}$.
Note that since $\mathfrak{p}_n \in\mathcal{O}\lp\ve^{2(n-1)}\rp$ for $n\in\N\cap\lb2,\infty\rp$, it is then the case for all $x\in\R$ that $\left| x^{n}-\real_{\rect}\lp\pwr^{q,\ve}_n\rp\lp x\rp\right| \in\mathcal{O}\lp\ve^{2q(n-1)}\rp$ for $n \ges2$.
Finally, note that $\wid_{\hid\lp\pwr^{q,\ve}_0\rp}\lp\pwr^{q,\ve}_0\rp=1$ by observation. For $n\in\N$, note that the second-to-last layer of $\pwr^{q,\ve}_n$ is the second-to-last layer of the $\prd^{q,\ve}$ network. Thus Lemma \ref{prd_network} tells us that:
\begin{align}
\wid_{\hid\lp\pwr^{q,\ve}_n\rp}\lp\pwr^{q,\ve}_n\rp = 24
\end{align}
\end{proof}
\begin{remark}
Note that each power network $\pwr_n^{q,\ve}$ is at least as big as the previous power network $\pwr_{n-1}^{q,\ve}$, since one differs from the other by one $\prd^{q,\ve}$ network.
\end{remark}
\subsection{$\pnm_{n,C}^{q,\ve}$ and Neural Network Polynomials.}
\begin{definition}[Neural Network Polynomials]
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. For fixed $q,\ve$, fixed $n \in\N_0$, and for $C =\lp c_0,c_1,\hdots, c_n \rp\in\R^{n+1}$ (the tuple of coefficients), we will define the following objects as neural network polynomials:
\end{definition}
\begin{lemma}\label{nn_poly}
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. It is then the case for all $n\in\N_0$ and $x\in\R$ that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\pnm_{n,C}^{q,\ve}\rp\in C \lp\R, \R\rp$
\end{enumerate}
\end{lemma}
\begin{proof}
Since Lemma \ref{power_prop} tells us that $\real_{\rect}\lp\pwr_n^{q,\ve}\rp\in C \lp\R, \R\rp$ for all $n\in\N_0$, and since a finite sum of continuous functions is continuous, this proves Item (i).
Note that $\pnm_{n,C}^{q,\ve}$ is only as deep as the deepest of the $\pwr^{q,\ve}_i$ networks, which by definition is $\pwr_n^{q,\ve}$, which in turn also has the largest depth bound. Therefore, by Lemma \ref{comp_prop}, Lemma \ref{5.3.3}, Lemma \ref{depth_prop}, and Lemma \ref{power_prop}, we have that:
Note next that for the case of $n=0$, we have that:
\begin{align}
\pnm_{0,C}^{q,\ve} = c_0 \triangleright\pwr_0^{q,\ve}
\end{align}
This then yields $2$ parameters.
Note that each neural network summand in $\pnm_n^{q,\ve}$ consists of a combination of $\tun_k$ and $\pwr_k$ for some $k\in\N$. Each $\pwr_k$ has at least as many parameters as a tunneling neural network of that depth, as Lemma \ref{param_pwr_geq_param_tun} tells us. This, finally, with Lemma \ref{aff_effect_on_layer_architecture}, Corollary \ref{affcor}, and Lemma \ref{power_prop} then implies that:
Finally, note that Lemma \ref{power_prop} and the triangle inequality tell us that it is the case for all $i \in\N$ and $x\in\R$ that:
\begin{align}
\left| x^i - \real_{\rect}\lp\pwr_i^{q,\ve}\rp\lp x \rp\right| &\les\left| x^i-x \cdot\real_{\rect}\lp\pwr_{i-1}^{q,\ve}\rp\lp x\rp\right| + \left| x \cdot\real_{\rect}\lp\pwr_{i-1}^{q,\ve}\rp\lp x\rp -\real_{\rect}\lp\pwr_i^{q,\ve}\rp\lp x \rp\right|
\end{align}
This, Lemma \ref{6.2.9}, and the fact that instantiation of the tunneling neural network leads to the identity function (Lemma \ref{6.2.2} and Lemma \ref{comp_prop}), together with Lemma \ref{scalar_left_mult_distribution}, and the absolute homogeneity condition of norms, then tells us that for all $x\in\R$, and $c_0,c_1,\hdots, c_n \in\R$ it is the case that:
Note however that since for all $x\in\R$ and $i \in\N\cap\lb2, \infty\rp$, Lemma \ref{prd_network} tells us that $\left| x^{i}-\real_{\rect}\lp\pwr^{q,\ve}_i\rp\lp x\rp\right| \in\mathcal{O}\lp\ve^{2q\lp i-1\rp}\rp$, this, and the fact that $f+g \in\mathcal{O}\lp x^a \rp$ if $f \in\mathcal{O}\lp x^a\rp$, $g \in\mathcal{O}\lp x^b\rp$, and $a \ges b$, then implies that:
Note next that in our construction, $\aff_{0,1}$ will require tunneling whenever $i\in\N$ in $\pwr_{i}^{q,\ve}$. Lemma \ref{aff_effect_on_layer_architecture} and Corollary \ref{affcor} then tell us that:
Finally, note that from the definition of $\pnm_{n,C}^{q,\ve}$ it is evident that $\wid_{\hid\lp\pnm_{0,C}^{q,\ve}\rp}\lp\pnm_{0,C}^{q,\ve}\rp=1$, since $\pnm_{0,C}^{q,\ve}=c_0\triangleright\aff_{0,1}$. Other than this network, for all $i \in\N$ the $\pwr_{i}^{q,\ve}$ end in the $\prd^{q,\ve}$ network, and the deepest of the $\pwr_i^{q,\ve}$ networks inside $\pnm_{n,C}^{q,\ve}$ is $\pwr^{q,\ve}_n$; all other $\pwr_i^{q,\ve}$ must end in tunnels. Whence, in the second-to-last layer, Lemma \ref{prd_network} tells us that:
\begin{align}
\wid_{\hid\lp\pnm_{n,C}^{q,\ve}\rp}\lp\pnm_{n,C}^{q,\ve}\rp = 24+2n
\end{align}
This completes the proof.
\end{proof}
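At the level of realizations, a neural network polynomial is just a weighted sum of (tunnel-aligned) power networks; the tunnels only serve to make all summands equally deep and are exact under instantiation. A small \texttt{R} sketch (ours, reusing \texttt{pwr\_inst} from the earlier sketch):
\begin{lstlisting}[language=R]
# Sketch: R(Pnm_{n,C})(x) = sum_i c_i * R(Pwr_i)(x).
pnm_inst <- function(coeffs, prd) {        # coeffs = c(c_0, ..., c_n)
  pows <- lapply(seq_along(coeffs) - 1, pwr_inst, prd = prd)
  function(x) sum(mapply(function(c, p) c * p(x), coeffs, pows))
}

poly <- pnm_inst(c(1, 0, -2), prd = function(x, y) x * y)   # 1 - 2x^2
poly(3)   # -17
\end{lstlisting}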
\subsection{$\xpn_n^{q,\ve}$, $\csn_n^{q,\ve}$, $\sne_n^{q,\ve}$, and Neural Network Approximations of $e^x$, $\cos(x)$, and $\sin(x)$.}
Once we have neural network polynomials, we may take the next leap to transcendental functions. Here, we will explore neural network approximations for three common transcendental functions: $e^x$, $\cos(x)$, and $\sin(x)$.
\begin{lemma}
Let $\nu_1,\nu_2\in\neu$, $f,g \in C \lp\R, \R\rp$, and $\ve_1,\ve_2\in\lp0 ,\infty\rp$ be such that for all $x\in\R$ it holds that $\left| f(x)-\lp\real_{\rect}\lp\nu_1\rp\rp\lp x\rp\right| \les\ve_1$ and $\left| g(x)-\lp\real_{\rect}\lp\nu_2\rp\rp\lp x\rp\right| \les\ve_2$. It is then the case for all $x \in\R$ that:
\begin{align}\label{6.2.14}
\left| \lb f+g \rb\lp x \rp - \real_{\rect}\lp\lb\nu_1 \oplus\nu_2 \rb\rp\lp x \rp\right| \les\ve_1 + \ve_2
\end{align}
\end{lemma}
\begin{proof}
Note that the triangle inequality tells us:
\begin{align}
\left| \lb f+g \rb\lp x \rp - \real_{\rect}\lb\nu_1 \oplus\nu_2 \rb\lp x \rp\right| &= \left| f\lp x \rp +g\lp x \rp -\real_{\rect}\lp\nu_1\rp\lp x \rp -\real_{\rect}\lp\nu_2 \rp\lp x \rp\right|\nonumber\\
&\les\left| f\lp x \rp -\real_{\rect}\lp\nu_1 \rp\lp x \rp\right| + \left| g\lp x \rp - \real_{\rect}\lp\nu_2 \rp\lp x \rp\right| \nonumber\\
&\les\ve_1 + \ve_2 \nonumber
\end{align}
\end{proof}
\begin{lemma}\label{6.2.8}
Let $n\in\N$. Let $\nu_1,\nu_2,...,\nu_n \in\neu$, $\ve_1,\ve_2,...,\ve_n \in\lp0,\infty\rp$, and $f_1,f_2,...,f_n \in C\lp\R, \R\rp$ be such that for all $i \in\{1,2,...,n\}$ and all $x\in\R$ it is the case that $\left| f_i\lp x \rp-\lp\real_{\rect}\lp\nu_i \rp\rp\lp x \rp\right| \les\ve_i$. It is then the case for all $x\in\R$ that:
\begin{align}
\left|\lb\sum^n_{i=1}f_i\rb\lp x\rp-\real_{\rect}\lp\lb\bigoplus^n_{i=1}\nu_i\rb\rp\lp x\rp\right|\les\sum^n_{i=1}\ve_i
\end{align}
\end{lemma}
\begin{proof}
This is a consequence of a finite number of applications of (\ref{6.2.14}).
\end{proof}
\begin{definition}[R\textemdash 2023, $\xpn_n^{q,\ve}$ and the Neural Network Taylor Approximations for $e^x$ around $x=0$]
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$, and let $\pwr_n^{q,\ve}$ be as in Lemma \ref{power_prop}. We define, for all $n\in\N_0$, the family of neural networks $\xpn_n^{q,\ve}$ as:
\end{definition}
\begin{lemma}
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. It is then the case for all $n\in\N_0$ and $x\in\R$ that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\xpn_n^{q,\ve}\rp\in C \lp\R, \R\rp$
\end{enumerate}
\end{lemma}
\begin{proof}
This follows straightforwardly from Lemma \ref{nn_poly} with $c_i \curvearrowleft\frac{1}{i!}$ for all $n \in\N_0$ and $i \in\{0,1,\hdots, n\}$. In particular, Item (iv) benefits from the fact that for all $i \in\N_0$, it is the case that $\frac{1}{i!}\ges0$.
\end{proof}
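Concretely, and reusing \texttt{pnm\_inst} from the earlier sketch (ours; the exact product stands in for $\prd^{q,\ve}$), $\xpn_n^{q,\ve}$ is the neural network polynomial with Taylor coefficients $c_i=\frac{1}{i!}$:
\begin{lstlisting}[language=R]
# Sketch: Xpn_n as the Taylor polynomial of e^x around 0.
xpn_inst <- function(n, prd) pnm_inst(1 / factorial(0:n), prd)

ex <- xpn_inst(10, prd = function(x, y) x * y)
abs(exp(1) - ex(1))   # ~ 2.7e-08: the n = 10 Taylor remainder at x = 1
\end{lstlisting}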
\begin{lemma}[R\textemdash, 2023]
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. It is then the case for all $n\in\N_0$ and $x\in\lb a,b \rb\subsetneq\R$, where $0\in\lb a,b\rb$, that:
where $\xi$ is between $0$ and $x$, as in the Lagrange form of the remainder. Note then, for all $n\in\N_0$, $x\in\lb a,b \rb\subsetneq\R$, and $\xi$ between $0$ and $x$, that by monotonicity of $e^x$ the second summand is bounded by:
Whence we have that for fixed $n\in\N_0$ and $b \in\lb0, \infty\rp$, the last summand is constant, and so it is the case that:
\begin{align}
\left| e^x -\real_{\rect}\lp\xpn_n^{q,\ve}\rp\lp x \rp\right| \in\mathcal{O}\lp\ve^{2q(n-1)}\rp
\end{align}
\end{proof}
\begin{definition}[The $\mathsf{Csn}_n^{q,\ve}$ Networks, and Neural Network Cosines]
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. Let $\pwr^{q,\ve}_n$ be the neural networks defined in Definition \ref{def:pwr}. We will define the neural networks $\csn_{n}^{q,\ve}$ as:
\end{definition}
\begin{lemma}\label{csn_properties}
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. It is then the case for all $n\in\N_0$ and $x\in\R$ that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\csn_n^{q,\ve}\rp\in C \lp\R, \R\rp$
\begin{align}
\left|\sum^n_{i=0}\lb\frac{\lp -1\rp^i x^{2i}}{\lp2i\rp!}\rb- \real_{\rect}\lp\csn_n^{q,\ve}\rp\lp x \rp\right|\in\mathcal{O}\lp\ve^{2q(2n-1)}\rp
\end{align}
This proves Item (iv).
\end{proof}
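Reusing \texttt{pwr\_inst} from the earlier sketch, the realization of $\csn_n^{q,\ve}$ may be sketched in \texttt{R} as follows (ours; only even powers appear, delegated to $\pwr_{2i}^{q,\ve}$, here with the exact product standing in for $\prd^{q,\ve}$):
\begin{lstlisting}[language=R]
# Sketch: R(Csn_n)(x) = sum_i (-1)^i x^(2i) / (2i)!.
csn_inst <- function(n, prd) {
  pows <- lapply(0:n, function(i) pwr_inst(2 * i, prd))
  function(x) sum(mapply(function(i, p) (-1)^i / factorial(2 * i) * p(x), 0:n, pows))
}

cs <- csn_inst(6, prd = function(x, y) x * y)
abs(cos(2) - cs(2))   # ~ 1.9e-07: the degree-12 Taylor remainder at x = 2
\end{lstlisting}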
\begin{lemma}[R\textemdash, 2023]
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}.$ It is then the case for all $n\in\N_0$ and $x\in[a,b]\subseteq\lb0,\infty\rp$ that:
Note that Taylor's theorem states that for all $x \in\lb a,b\rb\subsetneq\R$, where $0\in\lb a,b\rb$, it is the case that:
\begin{align}
\cos\lp x \rp= \sum^n_{i=0}\frac{\lp -1\rp^i}{\lp2i\rp!}x^{2i} + \frac{\cos^{\lp 2n+1\rp}\lp\xi\rp\cdot x^{2n+1}}{(2n+1)!}
\end{align}
Note further that for all $n \in\N_0$ and $x \in\R$, it is the case that $\left|\cos^{\lp n \rp}\lp x\rp\right|\les1$. Whence we may conclude that for all $n\in\N_0$, $x\in\lb a,b \rb\subseteq\R$, where $0\in\lb a,b\rb$, and $\xi$ between $0$ and $x$, we may bound the second summand by:
\begin{definition}[R\textemdash, 2023, The $\mathsf{Sne}_n^{q,\ve}$ Networks and Neural Network Sines]
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. Let $\csn^{q,\ve}_n$ be the neural networks defined above. We will define the neural networks $\sne_{n}^{q,\ve}$ as:
\begin{align}
\sne_n^{q,\ve} = \csn_n^{q,\ve}\bullet\aff_{1,-\frac{\pi}{2}}
\end{align}
\end{definition}
\begin{lemma}
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}$. It is then the case for all $n\in\N_0$ and $x\in\R$ that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\sne_n^{q,\ve}\rp\in C \lp\R, \R\rp$
\item it holds that:
\begin{align}
\left|\sum^n_{i=0}\frac{\lp -1\rp^i}{\lp2i\rp!}\lp x-\frac{\pi}{2}\rp^{2i} - \real_{\rect}\lp\sne_n^{q,\ve}\rp\lp x \rp\right| \in\mathcal{O}\lp\ve^{2q(2n-1)}\rp
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
This follows straightforwardly from Lemma \ref{csn_properties} together with the following facts: by Corollary \ref{affcor}, there is no change to the parameter count; by Lemma \ref{comp_cont}, there is no change in depth; by Lemma \ref{aff_prop} and Lemma \ref{csn_properties}, continuity is preserved; and $\aff_{1,-\frac{\pi}{2}}$ instantiates exactly to $\lp\cdot\rp-\frac{\pi}{2}$ and hence contributes nothing to the error. Whence $\sne^{q,\ve}_n$ has the same error bounds as $\csn_n^{q,\ve}$.
\end{proof}
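The shift is equally simple at the level of realizations; a sketch (ours, reusing \texttt{csn\_inst} from the previous sketch):
\begin{lstlisting}[language=R]
# Sketch: R(Sne_n)(x) = R(Csn_n)(x - pi/2), since sin(x) = cos(x - pi/2)
# and the affine shift Aff_{1, -pi/2} is exact under instantiation.
sne_inst <- function(n, prd) {
  cs <- csn_inst(n, prd)
  function(x) cs(x - pi / 2)
}

sn <- sne_inst(6, prd = function(x, y) x * y)
abs(sin(1) - sn(1))   # the Csn_6 error at the shifted point 1 - pi/2
\end{lstlisting}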
\begin{lemma}[R\textemdash, 2023]
Let $\delta,\ve\in\lp0,\infty\rp$, $q\in\lp2,\infty\rp$ and $\delta=\ve\lp2^{q-1}+1\rp^{-1}.$ It is then the case for all $n\in\N_0$ and $x\in[a,b]\subseteq\lb0,\infty\rp$ that:
\begin{align}
&\left| \sin\lp x\rp - \real_{\rect}\lp\sne_n^{q,\ve}\rp\lp x \rp\right|\nonumber\\
&\les\sum^n_{i=1}\left| \frac{\lp -1\rp^i}{\lp2i\rp!}\right|\lp\left| \lp x -\frac{\pi}{2}\rp\lp\lp x -\frac{\pi}{2}\rp^{2i-1} - \real_{\rect}\lp\pwr^{q,\ve}_{2i-1}\rp\lp x-\frac{\pi}{2}\rp\rp\right| + \ve + \left|x-\frac{\pi}{2}\right|^q + \mathfrak{p}_{2i-1}^q \rp\nonumber\\
&+\frac{\left|x-\frac{\pi}{2}\right|^{2n+1}}{(2n+1)!}\label{sin_diff}
\end{align}
\end{lemma}
\begin{proof}
Note that the fact that $\sin\lp x\rp=\cos\lp x-\frac{\pi}{2}\rp$, Lemma \ref{comp_prop}, and Lemma \ref{aff_prop} then render (\ref{sin_diff}) as:
\begin{align}
&\left|\cos\lp x-\frac{\pi}{2}\rp-\real_{\rect}\lp\csn^{q,\ve}_n\rp\lp x-\frac{\pi}{2}\rp\right|\nonumber\\
&\les\sum^n_{i=1}\left| \frac{\lp -1\rp^i}{\lp2i\rp!}\right|\lp\left| \lp x -\frac{\pi}{2}\rp\lp\lp x -\frac{\pi}{2}\rp^{2i-1} - \real_{\rect}\lp\pwr^{q,\ve}_{2i-1}\rp\lp x-\frac{\pi}{2}\rp\rp\right| + \ve + \left|x-\frac{\pi}{2}\right|^q + \mathfrak{p}_{2i-1}^q \rp\nonumber\\
&+\frac{\left|x-\frac{\pi}{2}\right|^{2n+1}}{(2n+1)!}\nonumber
\end{align}
\end{proof}
\begin{remark}
Note that under these neural network architectures, the famous Pythagorean identity $\sin^2\lp x\rp+\cos^2\lp x\rp=1$ may be rendered approximately, for fixed $n,q,\ve$, as $\lb\sqr^{q,\ve}\bullet\csn^{q,\ve}_n \rb\oplus\lb\sqr^{q,\ve}\bullet\sne^{q,\ve}_n\rb$. A full discussion of the associated parameter, depth, and accuracy bounds is beyond the scope of this dissertation and may be appropriate for future work.
\end{remark}
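As a closing illustration (ours; the exact product and square stand in for $\prd^{q,\ve}$ and $\sqr^{q,\ve}$, and could be swapped for the approximate sketches above), the rendered identity can be checked numerically:
\begin{lstlisting}[language=R]
# Sketch: [Sqr . Csn_n] (+) [Sqr . Sne_n] should realize approximately 1.
pyth <- function(x, n) {
  sq <- function(t) t^2                       # stand-in for R(Sqr)
  cs <- csn_inst(n, prd = function(a, b) a * b)
  sn <- sne_inst(n, prd = function(a, b) a * b)
  sq(cs(x)) + sq(sn(x))
}
pyth(0.7, n = 6)   # ~ 1, up to the compounded Taylor and product errors
\end{lstlisting}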