\item$\real_{\rect}\lp\id_d \rp\in C \lp\R^d, \R^d \rp$.
\item For all $x \in\R^d$ it is the case that:
\begin{align}
\lp\real_{\rect}\lp\id_d \rp\rp\lp x \rp = x \nonumber
\end{align}
\item For $d\in\N$ it is the case that $\dep\lp\id_d\rp=2$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that (\ref{7.2.1}) ensures that $\lay(\id_1)=\lp1,2,1\rp$. Furthermore, (\ref{7.2.2}) and Remark \ref{5.3.5} prove that $\lay(\id_d)=\lp d,2d,d \rp$, which in turn proves Item (i). Note next that (\ref{7.2.1}) ensures that for all $x \in\R$ it is the case that:
\begin{align}
\lp\real_{\rect}\lp\id_1 \rp\rp\lp x \rp = \rect(x) - \rect(-x) = \max\{x,0\} - \max\{-x,0\} = x
\end{align}
Lemma \ref{5.3.4} then shows us that for all $x =\lp x_1,x_2,...,x_d\rp\in\R^d$ it is the case that $\real_{\rect}\lp\id_d \rp\in C \lp\R^d, \R^d \rp$ and that:
\begin{align}
\lp\real_{\rect}\lp\id_d \rp\rp\lp x \rp&= \lp\real_{\rect}\lp\boxminus_{i=1}^d \lp\id_1\rp\rp\rp\lp x_1,x_2,...,x_d \rp\nonumber\\
&= \lp \lp\real_{\rect}\lp\id_1\rp\rp\lp x_1 \rp, \lp\real_{\rect}\lp\id_1\rp\rp\lp x_2 \rp,...,\lp\real_{\rect}\lp\id_1\rp\rp\lp x_d \rp \rp\nonumber\\
&= \lp x_1,x_2,...,x_d \rp = x \nonumber
\end{align}
This proves Items (ii)\textemdash(iii). Item (iv) follows straightforwardly from Item (i). This establishes the lemma.
\end{proof}
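As a quick numerical sanity check of Items (ii)\textemdash(iii), the following Python sketch (illustrative only; it assumes NumPy, uses ad-hoc helper names, and orders the hidden rows as $\lp x,-x\rp$ rather than in the interleaved stacked order, which leaves the realization unchanged) instantiates a $\lp d,2d,d\rp$ ReLU network with the same realization as $\id_d$ and verifies that it acts as the identity:
\begin{verbatim}
import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def id_network(d):
    # Weights/biases of a (d, 2d, d) ReLU identity network.
    W1 = np.vstack([np.eye(d), -np.eye(d)])   # hidden layer: (x, -x)
    b1 = np.zeros(2 * d)
    W2 = np.hstack([np.eye(d), -np.eye(d)])   # output: ReLU(x) - ReLU(-x) = x
    b2 = np.zeros(d)
    return (W1, b1), (W2, b2)

def realize(layers, x):
    # ReLU realization: ReLU on the hidden layer, affine on the output layer.
    (W1, b1), (W2, b2) = layers
    return W2 @ relu(W1 @ x + b1) + b2

d = 5
x = np.random.randn(d)
assert np.allclose(realize(id_network(d), x), x)
\end{verbatim}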
\begin{remark}
Note here the difference between Definition \ref{actnn} and Definition \ref{7.2.1}.
\end{remark}
\begin{lemma}[R\textemdash, 2023]\label{id_param}
Let $d \in\N$. It is then the case that $\param\lp\id_d\rp=4d^2+3d$.
\end{lemma}
\begin{proof}
By observation we have that $\param\lp\id_1\rp=4(1)^2+3(1)=7$. For the induction step, suppose that for some $n\in\N$ it is the case that $\param\lp\id_n\rp=4n^2+3n$. Note then that $\id_{n+1}=\id_n \boxminus\id_1$. For the weight matrices $W_1$ and $W_2$ of this new network, stacking adds a combined extra $8n+4$ parameters, and for the bias vectors $b_1$ and $b_2$ it adds a combined extra $3$ parameters. Thus, we have the following:
\begin{align}
4n^2+3n + 8n+4 + 3 &= 4(n+1)^2+3(n+1)
\end{align}
This completes the induction and hence proves the Lemma.
\end{proof}
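For concreteness, the following sketch (a hypothetical helper, assuming the usual dense parameter count $\sum_{i}\wid_i\lp\wid_{i-1}+1\rp$) counts the parameters of the $\lp d,2d,d\rp$ architecture and checks the closed form $4d^2+3d$:
\begin{verbatim}
def param_count(layout):
    # Dense count: each layer contributes width_out * (width_in + 1) parameters.
    return sum(w_out * (w_in + 1) for w_in, w_out in zip(layout[:-1], layout[1:]))

assert param_count((1, 2, 1)) == 7                        # base case id_1
for d in range(1, 20):
    assert param_count((d, 2 * d, d)) == 4 * d**2 + 3 * d
\end{verbatim}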
\begin{lemma}\label{7.2.3}
Let $\nu\in\neu$ with end-widths $d$. It is then the case that $\real_{\rect}\lp\id_d \bullet\nu\rp=\real_{\rect}\lp\nu\bullet\id_d \rp=\real_{\rect}\lp\nu\rp$, i.e. $\id_d$ acts as a compositional identity.
\end{lemma}
\begin{proof} From (\ref{5.2.1}) and Definition \ref{7.2.1} we have eight cases.
Case 1 where $d=1$ and subcases:
\begin{enumerate}[label = (1.\roman*)]
\item$\id_d \bullet\nu$ where $\dep(\nu)=1$
\item$\id_d \bullet\nu$ where $\dep(\nu) > 1$
\item$\nu\bullet\id_d$ where $\dep(\nu)=1$
\item$\nu\bullet\id_d$ where $\dep(\nu) > 1$
\end{enumerate}
Case 2 where $d>1$ and subcases:
\begin{enumerate}[label = (2.\roman*)]
\item$\id_d \bullet\nu$ where $\dep(\nu)=1$
\item$\id_d \bullet\nu$ where $\dep(\nu) > 1$
\item$\nu\bullet\id_d$ where $\dep(\nu)=1$
\item$\nu\bullet\id_d$ where $\dep(\nu) > 1$
\end{enumerate}
\textit{Case 1.i:} Let $\nu=\lp\lp W_1,b_1\rp\rp$. Deriving from Definitions \ref{7.2.1} and \ref{5.2.1} we have that:
\begin{align}
\id_1 \bullet\nu&=\lp\lp\begin{bmatrix}
1 \\
-1
\end{bmatrix} W_1, \begin{bmatrix}
1 \\
-1
\end{bmatrix}b_1 + \begin{bmatrix}
0 \\ 0
\end{bmatrix}\rp, \lp\begin{bmatrix}
1 \quad -1
\end{bmatrix}, \begin{bmatrix}
0
\end{bmatrix}\rp\rp\\
&= \lp\lp\begin{bmatrix}
W_1 \\-W_{1}
\end{bmatrix}, \begin{bmatrix}
b_1 \\ -b_1
\end{bmatrix}\rp,\lp\begin{bmatrix}
1 \quad -1
\end{bmatrix}, \begin{bmatrix}
0
\end{bmatrix}\rp\rp
\end{align}
Let $x \in\R$. Upon instantiation with $\rect$ and $d=1$ we have:
\begin{align}
\lp\real_{\rect}\lp\id_1\bullet\nu\rp\rp\lp x \rp&= \rect(W_1x+b_1)-\rect(-W_1x - b_1) \nonumber\\
&= \max\{W_1x+b_1,0\} - \max\{-W_1x-b_1,0\} \nonumber\\
&= W_1x+b_1 = \lp\real_{\rect}\lp\nu\rp\rp\lp x \rp\nonumber
\end{align}
\textit{Case 1.ii:} Let $\nu=\lp\lp W_1,b_1\rp, \lp W_2,b_2\rp, ..., \lp W_L, b_L \rp\rp$. Deriving from Definitions \ref{7.2.1} and \ref{5.2.1} we have that:
\textit{Case 1.iv:} Let $\nu=\lp\lp W_1,b_1\rp , \lp W_2,b_2\rp,...,\lp W_L, b_L \rp\rp$. Deriving from Definitions \ref{7.2.1} and \ref{5.2.1} we have that:
\textit{Case 2.i:} Let $d \in\{2,3,...\}$. Let $\nu\in\neu$ be $\nu=\lp\lp W_1,b_1\rp\rp$ with end-widths $d$. Deriving from Definitions \ref{5.2.1} and \ref{7.2.1} we have:
Let $x \in\R^d$. Upon instantiation with $\rect$ we have that:
\begin{align}
&\lp\real_{\rect}\lp\id_d \bullet\nu\rp\rp\lp x \rp\nonumber\\&= \rect([W_1]_{1,*}\cdot x + [b_1]_1)-\rect(-[W_1]_{1,*}\cdot x -[b_1]_1)+\cdots\nonumber\\& +\rect([W_1]_{d,*}\cdot x+[b_1]_d)-\rect (-[W_1]_{d,*}\cdot x-[b_1]_d) \nonumber\\
&= [W_1]_{1,*}\cdot x + [b_1]_1 + \cdots + [W_1]_{d,*}\cdot x + [b_1]_d \nonumber\\
&= W_1x + b_1 = \real_{\rect}\lp\nu\rp\nonumber
\end{align}
\textit{Case 2.ii:} Let $\nu=\lp\lp W_1,b_1\rp, \lp W_2,b_2\rp, ..., \lp W_L, b_L \rp\rp$. Deriving from Definitions \ref{7.2.1} and \ref{5.2.1} we have that:
\textit{Case 2.iv:} Let $\nu=\lp\lp W_1,b_1\rp, \lp W_2,b_2\rp ,...,\lp W_L,b_L \rp\rp$. Deriving from Definitions \ref{7.2.1} and \ref{5.2.1} we have:
This, along with Case 2.iii, implies that upon instantiation the composed layers act as $\lp W_L,b_L\rp$, whence $\real_{\rect}\lp\id_d\bullet\nu\rp=\real_{\rect}\lp\nu\rp$.
This completes the proof.
\end{proof}
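The following sketch (illustrative only, with ad-hoc names; the merge of the two networks mirrors Case 1.i above, and NumPy is assumed) checks numerically that pre-composing a one-layer affine network with $\id_1$ leaves the ReLU realization unchanged:
\begin{verbatim}
import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def realize(layers, x):
    # ReLU realization: ReLU between layers, affine output layer.
    for W, b in layers[:-1]:
        x = relu(W @ x + b)
    W, b = layers[-1]
    return W @ x + b

id1 = [(np.array([[1.0], [-1.0]]), np.zeros(2)),
       (np.array([[1.0, -1.0]]), np.zeros(1))]
nu = [(np.array([[2.5]]), np.array([-0.7]))]      # a one-layer network

# id_1 . nu as in Case 1.i: fold nu's affine layer into id_1's first layer.
(W1, b1), out_layer = id1
V, c = nu[0]
composed = [(W1 @ V, W1 @ c + b1), out_layer]

x = np.array([0.3])
assert np.allclose(realize(composed, x), realize(nu, x))
\end{verbatim}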
\begin{definition}[Monoid]
Given a set $X$ with binary operation $*$, we say that $X$ is a monoid under the operation $*$ if:
\begin{enumerate}[label = (\roman*)]
\item For all $x,y \in X$ it is the case that $x*y \in X$
\item For all $x,y,z \in X$ it is the case that $(x *y)*z = x*(y*z)$
\item There exists a unique element $e \in X$ such that $e*x=x*e = x$
\end{enumerate}
\end{definition}
\begin{theorem}
Let $d\in\N$. The set of all neural networks $\nu\in\neu$ with end-widths $d$, with instantiations in $\rect$, forms a monoid under the operation $\bullet$.
\end{theorem}
\begin{proof}
This is a consequence of Lemma \ref{7.2.3} and Lemma \ref{5.2.3}.
\end{proof}
\begin{remark}
By analogy with matrices, we may find it helpful to refer to neural networks of end-widths $d$ as ``square neural networks of size $d$''.
\end{remark}
%\section{Modulus of Continuity}
%\begin{definition}
% Let $A\subseteq \R$ and let $f:A \rightarrow \R$. We denote the modulus of continuity $\omega_f: \lb 0,\infty \rb \rightarrow \lb 0,\infty \rb$ as the function given for all $h \in \lb 0,\infty \rb$ as:
% Let $\alpha \in \lb -\infty, \infty \rb$, $b \in \lb a, \infty \rb$, and let $f: \lb a,b \rb \cap \R \rightarrow \R$ be a function. It is then the case that for all all $x,y \in \lb a,b\rb \cap \R$ that $\left| f(x) -f(y)\right| \les \omega_f \lp \left| x-y \right| \rp$.
%\end{lemma}
%\begin{proof}
% Note that (\ref{9.3.1}) implies the lemma.
%\end{proof}
%\begin{lemma}\label{lem:9.3.3}
% Let $A\subseteq \R$, $L \in \lb 0,\infty \rp$, and let $f:A \rightarrow \R$ satisfy for all $x,y \in A$ that $\left| f(x) - f(y)\right| \les L \left|x-y \right|$. It is then the case for all $h \in \lb 0,\infty \rp$ that $\omega_f(h) \les Lh$.
%\end{lemma}
%\begin{proof}
% Since it holds for all $x,y \in \R$ that $\left| f(x) - f(y)\right| \les L \left|x-y \right|$, it then, with (\ref{9.3.1}) imply for all $h \in \lb 0,\infty \rp$ that:
%\section{Linear Interpolation of Real-Valued Functions}
%Note that we need a framework for approximating generic 1-dimensional continuous functions to approximate more complex functions. We introduce the linear interpolation operator and later see how neural networks can approximate 1-dimensional continuous functions to arbitrary precision.
% Let $n \in \N$, $x_0,x_1,...,x_n, y_0,y_1,...,y_n \in \R$. Let it also be the case that $x_0 \leqslant x_1 \leqslant \cdots \leqslant x_n$. We denote by $\lin^{y_0,y_1,...,y_n}_{x_0,x_1,...,x_n}: \R \rightarrow \R$, the function that satisfies for $i \in \{1,2,...,n\}$, and for all $w \in \lp -\infty, x_0 \rp$, $x \in [ x_{i-1},x_i )$, $z \in [ x_n, \infty)$ that:
% \begin{enumerate}[label = (\roman*)]
% \item $\lin^{y_0,y_1,...,y_n}_{x_0,x_1,...,x_n}\lp w \rp = y_0$
% Note that (\ref{7.3.1}) is a direct consequence of Definition \ref{lio}. Item (i) then implies for all $i \in \{1,2,...,n\}$ $x \in [x_{i-1},x_i]$ that:
% Let $N\in \N$, $L,x_0,x_1,...,x_N \in \R$ satisfy $x_0 < x_1 < \cdots < x_N$, and set let $f:\lb x_0,x_N \rb \rightarrow \R$ satisfy for all $x,y \in \lb x_0,x_N\rb$ that $\left| f(x)-f(y)\right| \les L \left| x-y\right|$, it is then the case that:
% \begin{enumerate}[label = (\roman*)]
% \item for all $x,y \in \R$ that:
% \begin{align}
% \left| \lp \lin^{f(x_0),f(x_1),...,f(x_N)}_{x_0,x_1,...,x_N}\rp \lp x \rp - \lp \lin^{f(x_0),f(x_1),...,f(x_N)}_{x_0,x_1,...,x_N}\rp \lp y \rp \right| \les L \left| x-y \right|
% \end{align}, and
% \item that:
% \begin{align}
% \sup_{x \in \lb x_0,x_N \rb }\left| \lp \lin^{f(x_0),f(x_1),...,f(x_N)}_{x_1,x_2,...,x_N}\rp \lp x \rp -f\lp x \rp\right| \les L \lp \max_{i \in \{ 1,2,...N\}} \left| x_i-x_{i-1}\right|\rp
% \end{align}
% \end{enumerate}
%\end{lemma}
%\begin{proof}
% The assumption that for all $x,y \in \lb x_0, x_k \rb$ it is the case that $\left| f(x) - f(y) \right| \les L \left| x-y\right|$ and Lemma \ref{lem:9.3.3} prove Item (i) and Item (ii).
%\end{proof}
%\subsection{Neural Networks to Approximate the $\lin$ Operator}
%\begin{lemma}\label{7.3.3}
% Let $\alpha,\beta,h \in \R$. Denote by $\relu \in \neu$ the neural network given by $\relu = h \circledast \lp \mathsf{i}_1 \bullet \aff_{\alpha,\beta}\rp $. It is then the case that:
% \item for all $x \in \R$ that $\lp \real_{\rect} \lp \relu \rp \rp \lp x \rp = h\max \{\alpha x+\beta ,0\}$
% \end{enumerate}
%\end{lemma}
%\begin{proof}
% Note that by Definition \ref{5.3.1} we know that $\aff_{\alpha,\beta} = \lp \lp \alpha,\beta \rp \rp$, this with Definition \ref{actnn}, and Definition \ref{5.2.1} together tell us that $\mathfrak{i}_1\bullet \aff_{\alpha,\beta} = \lp \alpha,\beta \rp$. A further application of Definition \ref{5.2.1}, and an application of Definition \ref{slm} yields that $h \circledast \lp \mathfrak{i}_1 \bullet \aff_{\alpha,\beta} \rp = \lp \lp \alpha,\beta \rp, \lp h ,0 \rp \rp$. This proves Item (i).
%
% Note that $\lay(\aff_{\alpha,\beta})= (1,1)$, $\lay(\mathfrak{i}_1) = \lp 1,1,1 \rp $, and $\lay(h)=1$. Item (i) of Lemma \ref{6.0.3} therefore tells us that $\lay (\relu) = \lay \lp h \circledast \lp \mathfrak{i}_1 \bullet \aff_{\alpha,\beta}\rp \rp$. This proves Item (ii).
%
% Note that Lemmas \ref{7.1.2} and \ref{6.0.3} tell us that:
% Which proves Item (iv). For all $i \in \{0,1,2,...,N\}$, let $\phi_i$ be $\phi_i = h_i \circledast \lp \mathfrak{i} \bullet \aff_{1,-x_i} \rp $. Next note that \ref{7.3.6}, Lemma \ref{5.3.3}, and \cite[Lemma~3.28]{Grohs_2022} then tell us that:
%This proves (\ref{7.3.10}). In addition, note that (\ref{7.3.8}), (\ref{7.3.10}), and the fact that for all $i \in \{1,2,...,n\}$ it is the case that $x_{i-1} \les x_{i}$ tells us that for all $i \in \{1,2,...,n\}$ and $x \in [x_{i-1},x_i]$ it is the case that:
% For the induction step notice that (\ref{7.3.13}) implies that for all $i \in \{2,3,...,N\}$, $x \in [x_{i-1},x_i]$, with the instantiation that $\forall x \in [x_{i-2},x_{i-1}]: \lp \real_{\rect} \lp \Phi \rp \rp \lp x \rp = y_{i-2} + \lp \frac{y_{i-1}-y_{i-2}}{x_{i-1}-x_{i-2}} \rp \lp x-x_{i-2} \rp $, it is then the case that:
% Together with (\ref{7.3.10.2}), (\ref{7.3.14}), and Definition \ref{lio} establishes Item (iii) thus proving the lemma.
% \end{proof}
%\section{Neural Network Approximations of 1-dimensional Functions.}
%
%\begin{lemma}\label{lem:9.5.1}
% Let $N\in \N$, $L. a. x_0,x_1,...,x_N \in \R$, $b\in \lp a,\infty \rp$, satisfy for all $i \in \left\{0,1,...,N\right\}$ that $x_i = a+ \frac{i(b-a)}{N}$. Let $f:\lb a,b\rb \rightarrow \R$ satisfy for all $x,y \in \lb a,b\rb$ that $\left|f(x) - f(y) \right| \les L\left|x-y\right|$ and let $\mathsf{F} \in \neu$ satisfy:
% \item it holds that for all $x,y \in \R$ that $\left| \lp \real_{\rect} \lp \F \rp \rp \lp x \rp -\lp \real_{\rect} \lp \F \rp\rp\lp y \rp \right| \les L \left| x-y \right|$
% \item it holds that $\sup_{x \in \lb a,b \rb} \left| \lp \real_{\rect} \lp \F \rp \rp\lp x\rp -f(x)\right| \les \frac{L \lp b-a\rp}{N}$, and
% \item $\param\lp \F \rp = 3N+4$.
% \end{enumerate}
%\end{lemma}
%\begin{proof}
% Note that since it is the case that for all $i \in \left\{0,1,...,N \right\}: x_{\min \{i+1,N\}} - x_{\min \{i, N-1\}} = x_{\max\{i,1\}} - x_{\max \{i-1,0\}} = \frac{b-a}{N}$, we have that:
% Thus Items (i)-(iv) of Lemma \ref{9.3.4} prove Items (i)-(iii), and (vi) of this lemma. Item (iii) combined with the assumption that for all $x,y \in \lb a,b \rb: \left| f(x) - f(y) \right| \les \left| x-y \right|$ and Item (i) in Lemma \ref{lem:9.4.3} establish Item (iv). Furthermore, note that Item (iii), the assumption that for all $x,y \in \lb a,b \rb: \left| f(x) -f(y)\right| \les L\left| x-y\right|$, Item (ii) in Lemma \ref{lem:9.4.3} and the fact that for all $i \in \{1,2,..., N\}: x_i-x_{i-1} = \frac{b-a}{N}$ demonstrate for all $x \in \lb a,b \rb$ it holds that:
% \begin{align}
% \left| \lp \real_{\rect} \lp \F \rp\rp \lp x \rp -f\lp x \rp \right| \les L \lp \max_{i \in \{1,2,...,N\}} \left| x_i - x_{i-1}\right|\rp = \frac{L(b-a)}{N}
% \end{align}
%\end{proof}
%\begin{lemma}\label{lem:9.5.2}
% Let $L,a \in \R$, $b\in \lb a, \infty \rp$, $\xi \in \lb a,b \rb$, let $f: \lb a,b \rb \rightarrow \R$ satisfy for all $x,y \in \lb a,b \rb$ that $\left| f(x) - f(y) \right| \les L\left|x-y \right|$, and let $\F \in \neu$ satisfy $\F = \aff_{1,f(\xi)} \bullet \lp 0 \circledast \lp \mathsf{i}_1 \bullet \aff_{1,-\xi} \rp \rp $, it is then the case that:
% Note that Item (i) is a consequence of the fact that $\aff_{1,-\xi}$ is a neural network with a real number as weight and a real number as a bias and the fact that $\lay \lp \mathsf{i}_1 \rp = \lp 1,1,1 \rp$. Note also that Item (iii) of Lemma \ref{7.3.3} proves Item (iii).
%
% Note that from the construction of $\aff$ we have that:
% \begin{align}\label{(9.5.4)}
% \lp \real_{\rect} \lp \F \rp\rp \lp x \rp &= \lp \real_{\rect} \lp 0 \circledast \lp \mathsf{i}_1 \bullet \aff_{1,-\xi}\rp\rp \rp \lp x \rp + f \lp \xi \rp \nonumber \\
% &= 0 \lp \lp \real_{\rect} \lp \mathsf{i}_1 \bullet \aff_{1,-\xi} \rp\rp \lp x \rp \rp + f \lp \xi \rp = f \lp \xi \rp
% \end{align}
% Which establishes Item (iii). Note that (\ref{(9.5.4)}), the fact that $\xi \in \lb a,b\rb$ and the fact that for all $x,y \in \lb a,b \rb$ it is the case that $\left| f(x) - f(y) \right| \les \left| x-y \right|$ give us that for all $x \in \lb a,b \rb$ it holds that:
% \begin{align}
% \left| \lp \real_{\rect} \lp \F \rp\rp \lp x \rp - f\lp x \rp\right| = \left| f\lp \xi \rp - f \lp x \rp\right| \les L \left| x- \xi \right| \les L \max\left\{ \xi -a, b-\xi \right\}
% \end{align}
% This establishes Item (iv). Note a simple parameter count yields the following:
% \begin{align}
% \param \lp \F \rp = 1(1+1)+1(1+1) = 4
% \end{align}
% Establishing Item (v) and hence the lemma. This completes the proof.
%\end{proof}
%\begin{corollary}
% Let $\ve \in (0,\infty)$, $L,a \in \R$, $b \in \lp a,\infty \rp$, $N \in \N_0 \cap \lb \frac{L(b-a)}{\ve}, \frac{L(b-a)}{\ve}+1\rb$, $x_0, x_1,...,x_N \in \R$ satisfy for all $i \in \{ 0,1,...,N\}$ that $x_i = a + \frac{i(b-a)}{\max\{N,1\}}$, let $f: \lb a,b \rb \rightarrow \R$ satisfy for all $x,y \in \lb a,b \rb$ that $\left| f(x) - f(y) \rb \les L\left| x-y \right|$, and let $\F \in \neu$ satisfy:
% The fact that $N \in \N_0 \cap \lb \frac{L(b-a)}{\ve}, \frac{L(b-a)}{\ve}+1 \rb$ ensures that $\frac{L(b-a)}{\max\{ K,1\}} \les \ve$. This and Items (i),(ii),(iv), and (v) in Lemma \ref{lem:9.5.1} and Items (i)-(iii), and (iv) of Lemma $\ref{lem:9.5.2}$ establishes Items (i)-(iv). Furthermore, note that since $N\les 1 + \frac{L(b-a)}{\ve}$, Item (vi) in Lemma \ref{lem:9.5.1} and Item (v) in Lemma \ref{lem:9.5.2} tells us that:
Let $h\in\lp-\infty, \infty\rp$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\trp^h \rp\in C \lp\R^2, \R\rp$
\item for all $x =\lp x_1,x_2\rp\in\R^2$ that $\lp\real_{\rect}\lp\trp^h \rp\rp\lp x \rp=\frac{1}{2}h \lp x_1+x_2\rp$
\item$\dep\lp\trp^h \rp=1$
\item$\param\lp\trp^h \rp=3$
\item$\lay\lp\trp^h \rp=\lp2,1\rp$
\end{enumerate}
\end{lemma}
\begin{proof}
This is a straightforward consequence of Lemma \ref{5.3.1}.
\end{proof}
\begin{definition}[The $\etr$ neural network]
Let $n\in\N$ and $h \in\R_{\ges0}$. We define the neural network $\etr^{n,h}\in\neu$ as:
\begin{align}
\etr^{n,h}\coloneqq\aff_{\underbrace{\lb\frac{h}{2}\ \ h \ \ h\ \cdots \ h \ \ \frac{h}{2}\rb}_{n+1\text{-many}},0}
\end{align}
\end{definition}
\begin{lemma}\label{etr_prop}
Let $n\in\N$, $h\in\lp0,\infty\rp$, and $x_0\in\R$. Let $ x =\lb x_0\: x_1\:...\: x_n\rb\in\R^{n+1}$ satisfy for all $i \in\{0,1,...,n\}$ that $x_i = x_0+i\cdot h$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item$\real_{\rect}\lp\etr^{n,h}\rp\in C \lp\R^{n+1}, \R\rp$
\item$\lp\real_{\rect}\lp\etr^{n,h}\rp\rp\lp x \rp=\frac{h}{2}\cdot x_0+h\cdot x_1+\cdots+ h\cdot x_{n-1}+\frac{h}{2}\cdot x_n$
\item$\dep\lp\etr^{n,h}\rp=1$
\item$\param\lp\etr^{n,h}\rp= n+2$
\item$\lay\lp\etr^{n,h}\rp=\lp n+1,1\rp$
\end{enumerate}
\end{lemma}
\begin{proof}
This is a straightforward consequence of Lemma \ref{5.3.1}.
\end{proof}
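To make Item (ii) concrete, the sketch below (assumptions: NumPy, a uniform grid, hypothetical names) builds the weight row of $\etr^{n,h}$ and applies it to samples of $\sin$; the result is the composite trapezoidal rule, which is close to the true integral:
\begin{verbatim}
import numpy as np

def etr_weights(n, h):
    # Weight row [h/2, h, ..., h, h/2] of the affine network etr^{n,h} (n+1 entries).
    w = np.full(n + 1, h)
    w[0] = w[-1] = h / 2
    return w

n, h = 10, 0.1
xs = np.linspace(0.0, n * h, n + 1)
fx = np.sin(xs)

approx = etr_weights(n, h) @ fx          # realization of etr^{n,h} on the samples
exact = 1.0 - np.cos(1.0)                # integral of sin over [0, 1]
assert abs(approx - exact) < 1e-3        # trapezoidal error of order h^2
\end{verbatim}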
\begin{remark}
Let $h \in\lp0,\infty\rp$. Note then that $\trp^h$ is simply $\etr^{2,h}$.
\end{remark}
%\begin{lemma}
% Let $f \in C \lp \R, \R \rp$, $a\in \R, b \in \lb a,\infty\rp$, $N\in \N$, and let $h = \frac{b-a}{N}$. Assume also that $f$ has first and second derivatives almost everywhere. Let $ x = \lb x_0 \: x_1 \:...\: x_n\rb \in \R^{n+1}$ such that for all $i \in \{0,1,...,n\}$ it is the case that $x_i = x_0+i\cdot h$, as such let it also be the case that $f\lp \lb x \rb_{*,*}\rp = \lb f(x_0)\: f(x_1) \: \cdots f(x_n) \rb$. Let $a = x_0$ and $b = x_n$. It is then the case that:
\item$\real_{\rect}\lp\nrm^d_1\rp\in C \lp\R^d,\R\rp$
\item for all $x \in\R^d$ that $\lp\real_{\rect}\lp\nrm^d_1\rp\rp\lp x \rp=\left\| x \right\|_1$
\item it holds that $\hid\lp\nrm^d_1\rp=1$
\item it holds that $\param\lp\nrm_1^d \rp\les7d^2$
\item it holds that $\dep\lp\nrm^d_1\rp=2$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that by observation, it is the case that $\lay\lp\nrm^1_1\rp=\lp1,2,1\rp$. This and Remark \ref{5.3.2} tell us that for all $d \in\{2,3,...\}$ it is the case that $\lay\lp\boxminus_{i=1}^d \nrm^1_1\rp=\lp d,2d,d\rp$. This, Lemma \ref{comp_prop}, and Lemma \ref{5.3.2} ensure that for all $d \in\{2,3,4,...\}$ it is the case that $\lay\lp\nrm^d_1\rp=\lp d,2d,1\rp$, which in turn establishes Item (i).
Notice now that (\ref{(9.7.1)}) ensures that for all $x \in\R$ it is the case that:
\begin{align}
\lp\real_{\rect}\lp\nrm^1_1 \rp\rp\lp x \rp = \rect\lp x \rp + \rect\lp -x \rp = \max\{x,0 \} + \max\{ -x,0\} = \left| x \right| = \| x \|_1
\end{align}
This along with \cite[Proposition~2.19]{grohs2019spacetime} tells us that for all $d \in\{2,3,4,...\}$ and $x =\lp x_1,x_2,...,x_d\rp\in\R^d$ it is the case that:
Note next that by observation $\hid\lp\nrm^1_1\rp=1$. Remark \ref{5.3.2} then tells us that, since the number of layers remains unchanged under stacking, it is the case that $\hid\lp\nrm^1_1\rp=\hid\lp\boxminus_{i=1}^d \nrm_1^1\rp=1$. Note next that Lemma \ref{5.2.3} then tells us that $\hid\lp\sm_{d,1}\rp=0$, whence Lemma \ref{comp_prop} tells us that:
and as such $\param\lp\nrm^1_1\rp=7$. This, combined with Corollary \ref{cor:sameparal} and the fact that we are stacking identical neural networks, then tells us that:
Finally, note that by observation $\dep\lp\nrm^1_1\rp=2$ and that $\nrm^d_1$ is obtained by stacking copies of the same neural network. Since stacking has no effect on depth by Definition \ref{def:stacking}, and since Lemma \ref{comp_prop} gives $\dep\lp\sm_{d,1}\bullet\lb\boxminus^d_{i=1}\nrm_1^1\rb\rp=\dep\lp\boxminus^d_{i=1}\nrm^1_1\rp$, we may conclude that $\dep\lp\nrm^d_1\rp=\dep\lp\nrm_1^1\rp=2$.
This concludes the proof of the lemma.
\end{proof}
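A minimal numerical sketch of Items (i)\textemdash(ii) (ad-hoc names, NumPy assumed; the hidden rows are ordered as $\lp x,-x\rp$, which does not affect the summed output):
\begin{verbatim}
import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def nrm1(x):
    # Realization of nrm^d_1: sum over i of ReLU(x_i) + ReLU(-x_i) = sum |x_i|.
    d = len(x)
    W1 = np.vstack([np.eye(d), -np.eye(d)])   # hidden layer of width 2d
    W2 = np.ones((1, 2 * d))                  # summing output layer
    return (W2 @ relu(W1 @ x))[0]

x = np.random.randn(7)
assert np.isclose(nrm1(x), np.linalg.norm(x, 1))
\end{verbatim}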
\subsection{The $\mxm^d$ Neural Networks}
Given $x\in\R$, it is straightforward to find the maximum; $x$ itself is the maximum. For $x \in\R^2$ we may find the maximum via the network in (\ref{9.7.6}), i.e. $\mxm^2$. The strategy for $x \in\R^d$ is to compute pairwise maxima, halving the number of entries repeatedly until only one entry, the maximum, remains. For $x \in\R^d$ where $d$ is even we may stack $\frac{d}{2}$ copies of $\mxm^2$ to halve, and for $x \in\R^d$ where $d$ is odd and at least $3$ we may introduce ``padding'' via the $\id_1$ network and thus require $\frac{d-1}{2}$ copies of $\mxm^2$ to halve.
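The following sketch (Python, NumPy assumed, hypothetical names) mirrors this halving strategy: the two-input maximum is computed with the ReLU identity $\max\{a,b\}=\rect(a-b)+\rect(b)-\rect(-b)$ realized by $\mxm^2$ defined below, and an odd number of entries is handled by passing the last entry through unchanged, as $\id_1$ does:
\begin{verbatim}
import numpy as np

def relu(z):
    return np.maximum(z, 0.0)

def max2(a, b):
    # ReLU realization of mxm^2: max(a, b) = ReLU(a - b) + ReLU(b) - ReLU(-b).
    return relu(a - b) + relu(b) - relu(-b)

def mxm(x):
    # Pairwise-max reduction mirroring the recursive mxm^d construction.
    x = list(x)
    while len(x) > 1:
        if len(x) % 2 == 1:    # odd width: last entry passes through (id_1 padding)
            x = [max2(x[i], x[i + 1]) for i in range(0, len(x) - 1, 2)] + [x[-1]]
        else:
            x = [max2(x[i], x[i + 1]) for i in range(0, len(x), 2)]
    return x[0]

x = np.random.randn(11)
assert np.isclose(mxm(x), np.max(x))
\end{verbatim}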
\begin{definition}[Maxima ANN representations]
Let $\lp\mxm^d\rp_{d \in\N}\subseteq\neu$ represent the neural networks that satisfy:
\begin{enumerate}[label = (\roman*)]
\item for all $d \in\N$ that $\inn\lp\mxm^d \rp= d$
\item for all $d \in\N$ that $\out\lp\mxm^d \rp=1$
\item that $\mxm^1=\aff_{1,0}\in\R^{1\times1}\times\R^1$
\item that:
\begin{align}\label{9.7.6}
\mxm^2 = \lp\lp\begin{bmatrix}
1 & -1 \\ 0 & 1 \\ 0 & -1
\end{bmatrix}, \begin{bmatrix}
0 \\ 0 \\0
\end{bmatrix}\rp, \lp\begin{bmatrix}
1&1&-1
\end{bmatrix}, \begin{bmatrix}
0
\end{bmatrix}\rp\rp
\end{align}
\item it holds for all $d \in\{2,3,...\}$ that $\mxm^{2d}=\mxm^d \bullet\lb\boxminus_{i=1}^d \mxm^2\rb$, and
\item it holds for all $d \in\{2,3,...\}$ that $\mxm^{2d-1}=\mxm^d \bullet\lb\lp\boxminus^d_{i=1}\mxm^2\rp\boxminus\id_1\rb$.
\end{enumerate}
\end{definition}
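As a direct check of (\ref{9.7.6}), instantiating the displayed weight matrices with $\rect$ does compute the maximum of two inputs (a sketch only, assuming NumPy):
\begin{verbatim}
import numpy as np

relu = lambda z: np.maximum(z, 0.0)
W1 = np.array([[1.0, -1.0], [0.0, 1.0], [0.0, -1.0]])   # first layer of mxm^2
W2 = np.array([[1.0, 1.0, -1.0]])                       # output layer of mxm^2

x = np.array([-0.4, 1.3])
out = (W2 @ relu(W1 @ x))[0]   # ReLU(x1 - x2) + ReLU(x2) - ReLU(-x2) = max(x1, x2)
assert np.isclose(out, np.max(x))
\end{verbatim}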
\begin{remark}
Diagrammatically, this can be represented as:
\begin{figure}[h]
\begin{center}
\tikzset{every picture/.style={line width=0.75pt}}%set default line width to 0.75pt
Assume w.l.o.g. that $d > 1$. Note that (\ref{9.7.6}) ensures that $\hid\lp\mxm^2 \rp=1$. This and (\ref{5.2.5}) then tell us that for all $d \in\{2,3,4,...\}$ it is the case that:
Moreover, for $d \in\{4,6,8,...\}$ with $\hid\lp\mxm^{\left\lceil\frac{d}{2}\right\rceil}\rp=\left\lceil\log_2\lp\frac{d}{2}\rp\right\rceil$ it holds that:
\begin{align}\label{9.7.8}
\hid\lp\mxm^d \rp = \left\lceil\log_2 \lp\frac{d}{2}\rp\right\rceil + 1 = \left\lceil\log_2 \lp d \rp -1 \right\rceil +1 = \left\lceil\log_2 \lp d \rp\right\rceil
\end{align}
Moreover (\ref{9.7.7}) and the fact that for all $d \in\{3,5,7,...\}$ it holds that $\left\lceil\log_2\lp d+1\rp\right\rceil=\left\lceil\log_2\lp d \rp\right\rceil$ ensures that for all $d \in\{3,5,7,...\}$ with $\hid\lp\mxm^{\left\lceil\frac{d}{2}\right\rceil}\rp=\left\lceil\log_2\lp\left\lceil\frac{d}{2}\right\rceil\rp\right\rceil$ it holds that:
This and (\ref{9.7.8}) demonstrate that for all $d \in\{3,4,5,...\}$ with $\forall k \in\{2,3,...,d-1\}: \hid\lp\mxm^k\rp=\left\lceil\log_2\lp k \rp\right\rceil$ it holds that $\hid\lp\mxm^d \rp=\left\lceil\log_2\lp d \rp\right\rceil$. The fact that $\hid\lp\mxm^2\rp=1$ and induction establish Item (i).
We next note that $\lay\lp\mxm^2\rp=\lp2,3,1\rp$. This then indicates for all $i\in\N$ that:
Additionally note that (\ref{9.7.11}) demonstrates that for all $d \in\{4,6,8,...\}$, $i \in\{2,3,...\}$ with $\wid_{i-1}\lp\mxm^{\frac{d}{2}}\rp\les3\left\lceil\lp\frac{d}{2}\rp\frac{1}{2^{i-1}}\right\rceil$ it holds that:
Furthermore note also the fact that for all $d \in\{3,5,7,...\}$, $i \in\N$ it holds that $\left\lceil\frac{d+1}{2^i}\right\rceil=\left\lceil\frac{d}{2^i}\right\rceil$ and (\ref{9.7.12}) assure that for all $d \in\{3,5,7,...\}$, $i\in\{2,3,...\}$ with $\wid_{i-1}\lp\mxm^{\left\lceil\frac{d}{2}\right\rceil}\rp\les3\left\lceil\left\lceil\frac{d}{2}\right\rceil\frac{1}{2^{i-1}}\right\rceil$ it holds that:
This and (\ref{9.7.16}) tell us that for all $d \in\{3,4,...\}$, $i \in\{2,3,...\}$ with $\forall k \in\{2,3,...,d-1\}$, $j \in\{1,2,...,i-1\}: \wid_j \lp\mxm^k \rp\les3\left\lceil\frac{k}{2^j}\right\rceil$ it holds that:
Note next that Lemma \ref{idprop}, Lemma \ref{comp_prop}, and \cite[Proposition~2.19]{grohs2019spacetime} then imply for all $d \in\{2,3,4,...\}$ and $x =\lp x_1,x_2,...,x_d\rp\in\R^d$ that $\real_{\rect}\lp\mxm^d \rp\in C \lp\R^d,\R\rp$ and $\lp\real_{\rect}\lp\mxm^d \rp\rp\lp x \rp=\max\{ x_1,x_2,...,x_d \}$. This establishes Items (iii)-(iv).
Consider now the fact that Item (ii) implies that the layer widths form a geometric series, whence the number of bias parameters is bounded by:
\begin{align}
\frac{\frac{3d}{2}\lp 1 - \lp\frac{1}{2}\rp^{\left\lceil\log_2 \lp d\rp\right\rceil +1}\rp}{\frac{1}{2}}&= 3d \lp 1 - \lp\frac{1}{2}\rp^{\left\lceil\log_2 \lp d \rp\right\rceil +1}\rp\nonumber\\
&\les\left\lceil 3d \lp 1 - \lp\frac{1}{2}\rp^{\left\lceil\log_2 \lp d \rp\right\rceil +1}\rp\right\rceil
\end{align}
For the weight parameters, note that the widths follow a geometric series with ratio $\frac{1}{2}$; this, together with the upper bound on the number of hidden layers and the fact that $\wid_0\lp\mxm^d\rp= d$, tells us that the number of weight parameters is bounded by:
Item (vi) is a straightforward consequence of Item (i). This completes the proof of the lemma.
\end{proof}
\subsection{The $\mathsf{MC}$ Neural Network and Approximations via Maximum Convolutions }
Let $f: [a,b]\rightarrow\R$ be a continuous bounded function with Lipschitz constant $L$. Let $x_0\les x_1\les\cdots\les x_N$ be sample points within $[a,b]$, possibly such that for all $i \in\{0,1,\hdots, N\}$ it is the case that $x_i \sim\unif([a,b])$. For all $i \in\{0,1,\hdots, N\}$, define the functions $f_0,f_1,\hdots, f_N: [a,b]\rightarrow\R$ by:
\begin{align}
f_i\lp x \rp = f(x_i) - L \cdot\left| x-x_i\right|
\end{align}
We call the approximant $\max_{i \in\{0,1,\hdots, N\}}\{ f_i\}$ the \textit{maximum convolution approximation}. This converges to $f$ as the sample points fill $[a,b]$, as the results that follow make precise.
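Before stating the approximation results, here is a minimal numerical sketch of the construction (Python with NumPy, hypothetical names; the bound asserted at the end is the $2L$ times distance-to-samples bound established later in this section):
\begin{verbatim}
import numpy as np

def max_conv_approx(f, xs, L):
    # x -> max_i ( f(x_i) - L*|x - x_i| ): the maximum-convolution approximant.
    ys = f(xs)
    return lambda x: np.max(ys - L * np.abs(x - xs))

f = lambda x: np.abs(np.sin(x))            # Lipschitz with constant L = 1
xs = np.random.uniform(0.0, np.pi, 20)     # random sample points in [0, pi]
F = max_conv_approx(f, xs, L=1.0)

grid = np.linspace(0.0, np.pi, 1001)
err = max(abs(F(t) - f(t)) for t in grid)
gap = np.max(np.diff(np.sort(np.concatenate(([0.0], xs, [np.pi])))))
assert err <= 2.0 * gap + 1e-9             # consistent with |F - f| <= 2L dist(x, C)
\end{verbatim}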
\begin{lemma}\label{(9.7.5)}\label{lem:mc_prop}
Let $d,N\in\N$, $L\in\lb0,\infty\rp$, $x_1,x_2,\hdots, x_N \in\R^d$, $y =\lp y_1,y_2,\hdots,y_N \rp\in\R^N$ and $\mathsf{MC}\in\neu$ satisfy that:
\item$\hid\lp\mathsf{MC}^{N,d}_{x,y}\rp=\left\lceil\log_2\lp N \rp\right\rceil+1$
\item$\wid_1\lp\mathsf{MC}^{N,d}_{x,y}\rp=2dN$
\item for all $i \in\{2,3,...\}$ we have $\wid_i \lp\mathsf{MC}^{N,d}_{x,y}\rp\les3\left\lceil\frac{N}{2^{i-1}}\right\rceil$
\item it holds for all $x \in\R^d$ that $\lp\real_{\rect}\lp\mathsf{MC}^{N,d}_{x,y}\rp\rp\lp x \rp=\max_{i \in\{1,2,...,N\}}\lp y_i - L \left\| x-x_i \right\|_1\rp$
\item it holds that $\param\lp\mathsf{MC}^{N,d}_{x,y}\rp\les\left\lceil\lp\frac{2}{3}d^2+3d\rp\lp1+\frac{1}{2}^{2\lp\left\lceil\log_2\lp d\rp\right\rceil+1\rp}\rp+1\right\rceil+7N^2d^2+3\left\lceil\frac{N}{2}\right\rceil\cdot2dN$
Throughout this proof let $\mathsf{S}_i \in\neu$ satisfy for all $i \in\{1,2,...,N\}$ that $\mathsf{S}_i =\nrm_1^d \bullet\aff_{\mathbb{I}_d,-x_i}$ and let $\mathsf{X}\in\neu$ satisfy:
Note that (\ref{9.7.20}) and Lemma \ref{comp_prop} tell us that $\out\lp\mathsf{MC}^{N,d}_{x,y}\rp=\out\lp\mxm^N \rp=1$ and $\inn\lp\mathsf{MC}^{N,d}_{x,y}\rp=\inn\lp\cpy_{N,d}\rp=d $. This proves Items (i)--(ii). Next observe that since it is the case that $\hid\lp\cpy_{N,d}\rp=0$ and $\hid\lp\nrm^d_1\rp=1$, Lemma \ref{comp_prop} then tells us that:
Next observe that the fact that $\hid\lp\mathsf{X}\rp=1$, Lemma \ref{comp_prop}, and Lemma \ref{9.7.4} tell us that for all $i \in\{2,3,...\}$ it is the case that:
This and \cite[Proposition~2.20]{grohs2019spacetime} combined establish that for all $x \in\R^d$ it holds that:
\begin{align}
\lp\real_{\rect}\lp\lb\boxminus_{i=1}^N \mathsf{S}_i \rb\bullet\cpy_{N,d}\rp\rp\lp x \rp = \lp\| x-x_1 \|_1, \|x-x_2\|_1,...,\|x-x_N\|_1\rp\nonumber\\
\end{align}
This and Lemma \ref{5.3.3} establishes that for all $x \in\R^d$ it holds that:
\begin{align}
\lp\real_{\rect}\lp\mathsf{X}\rp\rp\lp x \rp&= \lp\real_{\rect}\lp\aff_{-L\mathbb{I}_N,y}\rp\rp\circ\lp\real_{\rect}\lp\lb\boxminus_{i=1}^N \mathsf{S}_i\rb\bullet\cpy_{N,d}\rp\rp\lp x \rp\nonumber\\
\lp\real_{\rect}\lp\mathsf{MC}^{N,d}_{x,y}\rp\rp\lp x \rp&= \lp\real_{\rect}\lp\mxm^N \rp\circ\lp\real_{\rect}\lp\mathsf{X}\rp\rp\rp\lp x \rp\nonumber\\
&=\max_{i\in\{1,2,...,N\}}\lp y_i - L \|x-x_i\|_1\rp
\end{align}
This establishes Item (vi).
For Item (vii) note that Lemma \ref{lem:nrm_prop}, Remark \ref{rem:stk_remark}, and Corollary \ref{affcor} tell us that for all $d\in\N$ and $x \in\R^d$ it is the case that:
This, along with Corollary \ref{cor:sameparal} and the fact that we are stacking identical neural networks, then tells us that for all $N \in\N$ it is the case that:
Now, let $d,N \in\N$, $L \in[0,\infty)$, let $x_1,x_2,\hdots, x_N \in\R^d$, and let $y =\lp y_1,y_2,\hdots, y_N \rp\in\R^N$. Observe that again Corollary \ref{affcor} and (\ref{8.3.38}) tell us that:
\caption{Neural network diagram for the $\mxm$ network}
\end{figure}
\subsection{Lipschitz Function Approximations}\label{(9.7.6)}
\begin{lemma}%TODO: Should we stipulate compact sets?
Let $\lp E,d \rp$ be a metric space. Let $L \in\lb0,\infty\rp$, $D \subseteq E$, $\emptyset\neq C \subseteq D$. Let $f:D \rightarrow\R$ satisfy for all $x\in D$, $y \in C$ that $\left| f(x)-f(y)\right| \les L d \lp x,y \rp$, and let $F:E \rightarrow\R\cup\{\infty\}$ satisfy for all $x\in E$ that:
\begin{align}\label{9.7.30}
F\lp x \rp = \sup_{y\in C}\lb f\lp y \rp - Ld\lp x,y \rp\rb
\end{align}
It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item for all $x \in C$ that $F(x)= f(x)$
\item it holds for all $x \in D$, that $F(x)\les f(x)$
\item it holds for all $x\in E$ that $F\lp x \rp < \infty$
\item it holds for all $x,y \in E$ that $\left| F(x)-F(y)\right| \les Ld\lp x,y \rp$ and,
\item it holds for all $x \in D$ that:
\begin{align}\label{9.7.31}
\left| F\lp x \rp - f \lp x \rp\right| \les 2L \lb\inf_{y\in C} d \lp x,y \rp\rb
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
The assumption that $\forall x \in D, y \in C: \left| f(x)- f(y)\right| \les Ld\lp x,y \rp$ ensures that:
This establishes Item (ii). Note that (\ref{9.7.30}) then tells us that for all $x\in C$ it holds that:
\begin{align}
F\lp x \rp\ges f(x) - Ld\lp x,x \rp = f\lp x \rp
\end{align}
This with (\ref{9.7.33}) then yields Item (i).
Note next that (\ref{9.7.32}) (with $x \curvearrowleft y$ and $y \curvearrowleft z$) and the triangle inequality ensure that for all $x \in E$, $y,z \in C$ it holds that:
This establishes Items (iii)\textemdash(iv). Finally, note that Items (i) and (iv), the triangle inequality, and the assumption that $\forall x \in D, y\in C: \left| f(x)- f(y)\right| \les Ld\lp x,y \rp$ ensure that for all $x\in D$ it holds that:
This establishes Item (v) and hence establishes the Lemma.
\end{proof}
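A small numerical illustration of Items (i) and (iv) (Python with NumPy, hypothetical names; $C$ is taken finite so the supremum is a maximum):
\begin{verbatim}
import numpy as np

def lipschitz_extension(f_vals, C, L, metric=lambda x, y: abs(x - y)):
    # F(x) = max_{y in C} [ f(y) - L * d(x, y) ] for a finite set C.
    return lambda x: max(fy - L * metric(x, y) for y, fy in zip(C, f_vals))

f = lambda x: np.cos(3.0 * x)                    # Lipschitz constant L = 3 on R
C = np.linspace(0.0, 2.0, 9)
F = lipschitz_extension(f(C), C, L=3.0)

assert all(np.isclose(F(y), f(y)) for y in C)    # Item (i): F agrees with f on C
grid = np.linspace(-1.0, 3.0, 200)
lip = max(abs(F(a) - F(b)) / abs(a - b)
          for a in grid for b in grid if a != b)
assert lip <= 3.0 + 1e-9                         # Item (iv): F is L-Lipschitz
\end{verbatim}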
\begin{corollary}\label{9.7.6.1}
Let $\lp E,d \rp$ be a metric space, let $L \in\lb0,\infty\rp$, $\emptyset\neq C \subseteq E$, let $f: E \rightarrow\R$ satisfy for all $x\in E$, $y \in C$ that $\left| f(x)- f(y)\right| \les Ld \lp x,y \rp$, and let $F:E \rightarrow\R\cup\{\infty\}$ satisfy for all $x\in E$ that:
\begin{align}
F \lp x \rp = \sup_{y\in C}\lb f(y) - Ld \lp x,y \rp\rb
\end{align}
It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item for all $x\in C$ that $F(x)= f(x)$
\item for all $x\in E$ that $F(x)\les f(x)$
\item for all $x,y \in E$ that $\left| F(x)- F(y)\right| \les L d \lp x,y \rp$ and
\item for all $x\in E$ that: \begin{align}
\left| F\lp x \rp - f\lp x \rp\right| \les 2L \lb\inf_{y\in C} d \lp x,y \rp\rb
\end{align}
\end{enumerate}
\end{corollary}
\begin{proof}
Note that Lemma \ref{(9.7.6)} establishes Items (i)\textemdash(iv).
Let $d,N \in\N$, $L \in\lb0,\infty\rp$. Let $E \subseteq\R^d$. Let $x_1,x_2,...,x_N \in E$, let $f:E \rightarrow\R$ satisfy for all $x_1,y_1\in E$ that $\left| f(x_1)-f(y_1)\right| \les L \left\| x_1-y_1\right\|_1$ and let $\mathsf{MC}\in\neu$ and $y =\lp f\lp x_1\rp, f \lp x_2\rp,...,f\lp x_N \rp\rp$ satisfy:
Throughout this proof let $F: \R^d \rightarrow\R$ satisfy that:
\begin{align}\label{9.7.43}
F\lp x \rp = \max_{i \in\{1,2,...,N\}}\lp f\lp x_i \rp- L \left\| x-x_i \right\|_1 \rp
\end{align}
Note then that Corollary \ref{9.7.6.1}, (\ref{9.7.43}), and the assumption that for all $x,y \in E$ it holds that $\left| f(x)- f(y)\right| \les L \left\|x-y \right\|_1$ assure that:
Then Lemma \ref{(9.7.5)} tells us that for all $x\in E$ it holds that $F(x)=\lp\real_{\rect}\lp\mathsf{MC}\rp\rp\lp x \rp$. This combined with (\ref{(9.7.44)}) establishes (\ref{(9.7.42)}).
Let $d,N \in\N$, $L \in\lb0,\infty\rp$. Let $\lb a,b\rb\subsetneq\R$. Let $x_1,x_2,...,x_N \in\lb a,b\rb$, let $f:\lb a,b\rb\rightarrow\R$ satisfy for all $x_1,x_2\in\lb a,b\rb$ that $\left| f(x_1)-f(x_2)\right| \les L \left| x_1-x_2\right|$ and let $\mathsf{MC}^{N,1}_{x,y}\in\neu$ and $y = f\lp\lb x \rb_*\rp$ satisfy:
It is then the case for the approximant $\mathsf{MC}^{N,1}_{x,y}$ that:
\begin{enumerate}[label = (\roman*)]
\item$\inn\lp\mathsf{MC}^{N,1}_{x,y}\rp=1$
\item$\out\lp\mathsf{MC}^{N,1}_{x,y}\rp=1$
\item$\hid\lp\mathsf{MC}^{N,1}_{x,y}\rp=\left\lceil\log_2\lp N \rp\right\rceil+1$
\item$\wid_1\lp\mathsf{MC}^{N,1}_{x,y}\rp=2N$
\item for all $i \in\{2,3,...\}$ we have $\wid_i\lp\mathsf{MC}^{N,1}_{x,y}\rp\les3\left\lceil\frac{N}{2^{i-1}}\right\rceil$
\item it holds for all $x \in\R$ that $\lp\real_{\rect}\lp\mathsf{MC}^{N,1}_{x,y}\rp\rp\lp x \rp=\max_{i \in\{1,2,...,N\}}\lp y_i - L \left| x-x_i \right|\rp$
\item it holds that $\param\lp\mathsf{MC}^{N,1}_{x,y}\rp\les6+7N^2+3\left\lceil\frac{N}{2}\right\rceil\cdot2N$
Items (i)\textemdash(vii) are assertions of Lemma \ref{lem:mc_prop}. Item (viii) is an assertion of Lemma \ref{lem:maxconv_accuracy} with $d \curvearrowleft1$.