\chapter{Introduction and Basic Notions About Neural Networks}
We seek here to introduce a unified framework for artificial neural networks. This framework borrows from the work presented in \cite{grohsetal} and work done by Joshua Padgett, Benno Kuckuk, and Arnulf Jentzen (unpublished). With this framework in place, we study ANNs from the perspective of the number of parameters required for a neural network to approximate solutions of certain PDEs. The \textit{curse of dimensionality} here refers to the growth (exponential or otherwise) of the number of parameters needed as the dimension $d$ increases.
\section{The Basic Definition of ANNs and Instantiations of ANNs}
\begin{definition}[Rectifier Function]
Let $d \in\N$ and $x \in\R^d$. We denote by $\rect: \R\rightarrow\R$ the function given by:
An artificial neural network is a tuple $\lp\nu, \param, \dep, \inn, \out, \hid, \lay, \wid\rp$, where $\nu\in\neu$ is equipped with the following functions (referred to as auxiliary functions), satisfying for all $L\in\N$, $l_0,l_1,...,l_L\in\N$, and \\$\nu\in\lp\bigtimes^L_{k=1}\lb\R^{l_k \times l_{k-1}}\times\R^{l_k}\rb\rp$:
Note that this implies that $\nu=((W_1,b_1),(W_2,b_2),...,(W_L,b_L))\in\lp\bigtimes^L_{k=1}\lb\R^{l_k \times l_{k-1}}\times\R^{l_k}\rb\rp$. Note that we denote by $\we_{(\cdot), \nu}= (\we_{n,\nu})_{n\in\{1,2,...,L\}}: \{1,2,...,L\}\rightarrow\lp\bigcup_{m,k \in\N}\R^{m \times k}\rp$ and $\bi_{(\cdot),\nu}= \lp\bi_{n,\nu}\rp_{n\in\{1,2,...,L\}}: \{1,2,...,L\}\rightarrow\lp\bigcup_{m \in\N}\R^m \rp$ the functions that satisfy for all $n \in\{1,2,...,L\}$ that $\we_{n,\nu}= W_n$, i.e. the weight matrix of the neural network $\nu$ at layer $n$, and $\bi_{n,\nu}= b_n$, i.e. the bias vector of the neural network $\nu$ at layer $n$.
We will call $l_0$ the \textit{starting width} and $l_L$ the \textit{finishing width}. Together, they will be referred to as \textit{end-widths}.
\begin{remark}
Notice that our definition varies somewhat from the conventional ones found in \cite{petersen_optimal_2018} and \cite{grohs2019spacetime}: whereas those works define the auxiliary functions directly on the set $\neu$, we treat them as structure that elements of $\neu$ are endowed with. In other words, elements of $\neu$ may exist whose depths or parameter counts, for instance, are left undefined.
Note that we develop this definition to align closely with popular deep-learning frameworks such as \texttt{PyTorch}, \texttt{TensorFlow}, and \texttt{Flux}, where, in principle, it is always possible to query the parameter count, depth, number of layers, and other auxiliary information.
We will often write ``let $\nu\in\neu$'' when it is the tuple $\nu$ together with its auxiliary functions that is being referred to. This is analogous to saying that $X$ is a topological space when we mean the pair $\lp X,\tau\rp$, i.e. $X$ endowed with the topology $\tau$, or saying that $X$ is a measure space when we mean the triple $\lp X,\Omega, \mu\rp$, i.e. $X$ endowed with the $\sigma$-algebra $\Omega$ and the measure $\mu$.
Let $\act\in C \lp\R, \R\rp$. We denote by $\real_{\act}: \neu\rightarrow\lp\bigcup_{k,l \in\N} C \lp\R^k, \R^l \rp\rp$ the function satisfying for all $L \in\N$, $l_0,l_1,...,l_L \in\N$, $\nu=\lp\lp W_1, b_1\rp , \lp W_2, b_2\rp ,...,\lp W_L, b_L \rp\rp\in\lp\bigtimes^L_{k=1}\lb\R^{l_k \times l_{k-1}}\times\R^{l_k}\rb\rp$, and $x_0\in\R^{l_0}, x_1\in\R^{l_1},...,x_{L-1}\in\R^{l_{L-1}}$ with $\forall k \in\N\cap(0,L):x_k =\act\lp\lb W_kx_{k-1}+b_k \rb_{*,*}\rp$ that:
\begin{align}\label{5.1.11}
\real_{\act}\lp\nu\rp\in C \lp\R^{l_0}, \R^{l_L}\rp&\text{ and }&\lp\real_{\act}\lp\nu\rp\rp\lp x_0 \rp = W_Lx_{L-1}+b_L
This also ensures that $\lay(\nu)=\lp l_0,l_1,...,l_L \rp\in\N^{L+1}=\N^{\dep(\nu)+1}$ and further that $\inn(\nu)= l_0$, $\out(\nu)= l_L$, and that $\dep(\nu)= L$. Together with ($\ref{5.1.11}$), this proves the lemma.
\end{proof}
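Computationally, the instantiation above is straightforward. The following is a minimal \texttt{R} sketch, assuming a network $\nu$ is stored as a list of layers, each layer being a list with a weight matrix \texttt{W} and a bias vector \texttt{b}; the name \texttt{inst\_nn} is purely illustrative and is not one of the listings referenced elsewhere in this text.
\begin{lstlisting}[language=R]
# Sketch: instantiate a network stored as a list of (W, b) layers.
# The activation is applied componentwise on all but the last layer.
inst_nn <- function(nu, x, act = function(z) pmax(z, 0)) {
  L <- length(nu)
  for (k in seq_len(L - 1)) {
    x <- act(nu[[k]]$W %*% x + nu[[k]]$b)  # x_k = act(W_k x_{k-1} + b_k)
  }
  as.vector(nu[[L]]$W %*% x + nu[[L]]$b)   # final affine layer, no activation
}
\end{lstlisting}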
\section{Compositions of ANNs}
The first operation we want to be able to perform is composition of neural networks. Note that composition is not simply a concatenation of layers: the final affine transformation of the network applied first is merged with the first affine transformation of the network applied second.
We denote by $\lp\cdot\rp\bullet\lp\cdot\rp: \{\lp\nu_1,\nu_2\rp\in\neu\times\neu: \inn(\nu_1)=\out(\nu_2)\}\rightarrow\neu$ the function satisfying for all $L,M \in\N, l_0,l_1,...,l_L, m_0, m_1,...,m_M \in\N$, $\nu_1=\lp\lp W_1, b_1\rp, \lp W_2, b_2\rp,...,\lp W_L,b_L \rp\rp\in\lp\bigtimes^L_{k=1}\lb\R^{l_k \times l_{k-1}}\times\R^{l_k}\rb\rp$, and $\nu_2=\\\lp\lp W'_1, b'_1\rp, \lp W'_2, b'_2\rp,... \lp W'_M, b'_M \rp\rp\in\lp\bigtimes^M_{k=1}\lb\R^{m_k \times m_{k-1}}\times\R^{m_k}\rb\rp$ with $l_0=\inn(\nu_1)=\out(\nu_2)= m_M$ that:
This is a consequence of \cite[Lemma~2.8]{grohs2019spacetime} with $\Phi_1\curvearrowleft\nu_1$, $\Phi_2\curvearrowleft\nu_2$, and $\Phi_3\curvearrowleft\nu_3$, and the functions $\mathcal{I}\curvearrowleft\inn$, $\mathcal{L}\curvearrowleft\dep$ and $\mathcal{O}\curvearrowleft\out$.
\end{proof}
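In code, composition amounts to merging the last affine map of the inner network with the first affine map of the outer network; all remaining layers are copied unchanged. A minimal \texttt{R} sketch under the same list-of-layers representation as above (the name \texttt{comp\_nn} is illustrative):
\begin{lstlisting}[language=R]
# Sketch of nu1 bullet nu2: nu2 is applied first, nu1 second. The last
# affine map (W'_M, b'_M) of nu2 is merged with the first map (W_1, b_1)
# of nu1 into the single affine map (W_1 W'_M, W_1 b'_M + b_1).
comp_nn <- function(nu1, nu2) {
  L <- length(nu1); M <- length(nu2)
  merged <- list(W = nu1[[1]]$W %*% nu2[[M]]$W,
                 b = as.vector(nu1[[1]]$W %*% nu2[[M]]$b + nu1[[1]]$b))
  c(nu2[seq_len(M - 1)], list(merged), nu1[-1])
}
\end{lstlisting}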
The following lemma will be important later on, will be referenced numerous times, and may be found in \cite[Proposition~2.6]{grohs2019spacetime}. For completeness, we include a simplified version of the proof here.
\begin{lemma}\label{comp_prop}
Let $\nu_1, \nu_2\in\neu$. Let it also be that $\out\lp\nu_1\rp=\inn\lp\nu_2\rp$. It is then the case that:
\item for all $\act\in C \lp\R, \R\rp$ that $\real_{\act}\lp\nu_1\bullet\nu_2\rp\in C \lp\R^{\inn\lp\nu_2\rp},\R^{\out\lp\nu_1\rp}\rp$ and further:
Note that Items (i)--(iii) are a simple consequence of Definition \ref{5.2.1}. Specifically, given neural networks $\nu_1,\nu_2\in\neu$ with $\dep\lp\nu_1\rp= n$ and $\dep\lp\nu_2\rp= m$, note that in all four cases the depth of the composed neural network $\nu_1\bullet\nu_2$ is given by $(n-1)+(m-1)+1=n+m-1$, proving Item (i). Note that the outer neural network loses its last layer, yielding Item (ii) in all four cases. Finally, since for all $\nu\in\neu$ it is the case that $\hid\lp\nu\rp=\dep\lp\nu\rp-1$, Item (i) yields Item (iii).
Now, suppose it is the case that $\nu_3=\nu_1\bullet\nu_2$ and that:
This can be proved via induction on $k\in\N\cap\lb L_2, L_1+L_2\rp$. Consider that our base case of $k=L_2$ in (\ref{(5.2.13)}) is fulfilled by (\ref{(5.2.12)}). Now note that for all $k \in\N\cap\lb L_2,\infty\rp\cap\lp0,L_1+L_2-1\rp$ with:
We will introduce here the important concept of stacking ANNs. Given an input vector $x\in\R^d$, it is sometimes very helpful to imagine two neural networks acting on it simultaneously; this is what stacking captures. Because vectors are ordered tuples, stacking $\nu_1$ and $\nu_2$ is not necessarily the same as stacking $\nu_2$ and $\nu_1$.
\begin{definition}[Stacking of ANNs of same depth]\label{5.2.5}\label{def:stacking}
Let $L,n\in\N$, and let $\nu_1,\nu_2,\hdots, \nu_n \in\neu$ be such that $\dep\lp\nu_1\rp=\dep\lp\nu_2\rp=\cdots=\dep\lp\nu_n\rp= L$. For all $i \in\{1,\hdots,n\}$, let it also be the case that $\nu_i=\lp\lp W_1^i,b^i_1\rp, \lp W^i_2,b^i_2\rp,\hdots, \lp W_L^i,b_L^i\rp\rp$. We then denote by $\boxminus^n_{i=1}\nu_i$ the neural network whose layer architecture is given by:
For an \texttt{R} implementation, see Listing \ref{par}.
\end{remark}
\begin{lemma}\label{inst_of_stk}
Let $\nu_1,\nu_2\in\neu$, with $\dep\lp\nu_1\rp=\dep\lp\nu_2\rp$, $x_1\in\R^{m_1}$, $x_2\in\R^{m_2}$, and $\mathfrak{x}= x_1\frown x_2\in\R^{m_1+m_2}$. Let $\inst_{\rect}\lp\nu_1\rp: \R^{m_1}\rightarrow\R^{n_1}$ and $\inst_{\rect}\lp\nu_2\rp:\R^{m_2}\rightarrow\R^{n_2}$. It is then the case that $\real_{\rect}\lp\nu_1\boxminus\nu_2\rp\lp\mathfrak{x}\rp=\inst_{\rect}\lp\nu_1\rp\lp x_1\rp\frown\inst_{\rect}\lp\nu_2\rp\lp x_2\rp$.
Let $\nu_1=\lp\lp W_1,b_1\rp,\lp W_2,b_2\rp,\hdots, \lp W_L,b_L\rp\rp$ and \\$\nu_2=\lp\lp\fW_1, \fb_1\rp, \lp\fW_2,\fb_2\rp,\hdots,\lp\fW_L,\fb_L\rp\rp$; it is then the case, according to Definition \ref{def:stacking}, that:
Note that for all $\act\in C\lp\R,\R\rp$, $j \in\{1,2,\hdots,L-1\}$, and for all $x \in\R^{\columns(W_j)+\columns(\fW_j)}$, $x_1\in\R^{\columns\lp W_j\rp}$, $x_2\in\R^{\columns\lp\fW_j \rp}$, $y \in\R^{\rows\lp W_j\rp+\rows\lp\fW_j\rp}$, $y_1\in\R^{\rows\lp W_j\rp}$, $y_2\in\R^{\rows\lp\fW_j\rp}$, where $y_1=\act\lp\lb W_j \cdot x_1+ b_j\rb_{*,*}\rp$, $y_2=\act\lp\lb\fW_j\cdot x_2+\fb_j\rb_{*,*}\rp$, and $y=\act\lp\lb\diag\lp W_j, \fW_j\rp\cdot x +\lp b_j \frown\fb_j \rp\rb_{*,*}\rp$, Corollary \ref{concat_fun_fun_concat} tells us that:
Note that this argument repeats from one layer to the next, yielding that $\real_{\rect}\lp\nu_1\boxminus\nu_2\rp\lp\mathfrak{x}\rp=\inst_{\rect}\lp\nu_1\rp\lp x_1\rp\frown\inst_{\rect}\lp\nu_2\rp\lp x_2\rp$.
\end{proof}
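The following \texttt{R} sketch illustrates Lemma \ref{inst_of_stk} numerically. It reuses the \texttt{inst\_nn} sketch from above; \texttt{stack\_nn} is an illustrative block-diagonal construction and is not the implementation referenced in Listing \ref{par}.
\begin{lstlisting}[language=R]
# Sketch of nu1 boxminus nu2: block-diagonal weights, concatenated biases.
stack_nn <- function(nu1, nu2) {
  stopifnot(length(nu1) == length(nu2))
  Map(function(l1, l2) {
    W <- rbind(cbind(l1$W, matrix(0, nrow(l1$W), ncol(l2$W))),
               cbind(matrix(0, nrow(l2$W), ncol(l1$W)), l2$W))
    list(W = W, b = c(l1$b, l2$b))
  }, nu1, nu2)
}

# Numerical check on random two-layer networks:
set.seed(1)
nu1 <- list(list(W = matrix(rnorm(6), 3, 2), b = rnorm(3)),
            list(W = matrix(rnorm(3), 1, 3), b = rnorm(1)))
nu2 <- list(list(W = matrix(rnorm(8), 4, 2), b = rnorm(4)),
            list(W = matrix(rnorm(8), 2, 4), b = rnorm(2)))
x1 <- rnorm(2); x2 <- rnorm(2)
all.equal(inst_nn(stack_nn(nu1, nu2), c(x1, x2)),
          c(inst_nn(nu1, x1), inst_nn(nu2, x2)))  # TRUE
\end{lstlisting}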
\begin{remark}\label{5.3.2}\label{rem:stk_remark}
Given $n,L \in\N$, $\nu_1,\nu_2,...,\nu_n \in\neu$ such that $L =\dep(\nu_1)=\dep(\nu_2)=...=\dep(\nu_n)$ it is then the case, as seen from (\ref{5.4.2}) that:
Let $n,L \in\N$, $\nu_1,\nu_2,\hdots, \nu_n \in\neu$ satisfy that $L =\dep\lp\nu_1\rp=\dep\lp\nu_2\rp=\cdots=\dep\lp\nu_n \rp$. It is then the case that:
Since it is the case for all $j \in\{1,2,...,n \}$ that $\lay\lp\nu_j\rp=\lp l_0,l_1,...,l_L \rp$, where $l_0,l_1,...,l_L,L \in\N$, we may say that:
Let $\nu_1,\nu_2\in\neu$, such that $\dep\lp\nu_1\rp=\dep\lp\nu_2\rp= L$. It is then the case that $\param\lp\nu_1\rp+\param\lp\nu_2\rp\les\param\lp\nu_1\boxminus\nu_2\rp$.
Let $\nu_1,\nu_2,\nu_3\in\neu$. Let $\param\lp\nu_2\rp\les\param\lp\nu_3\rp$. It is then the case that $\param\lp\nu_1\boxminus\nu_2\rp\les\param\lp\nu_1\boxminus\nu_3\rp$.
\end{corollary}
\begin{proof}
Lemma \ref{lem:paramparal_geq_param_sum} tells us that:
Let $m_1,m_2,n_1,n_2\in\N$. Let $\nu_1,\nu_2\in\neu$, such that $\real_{\rect}\lp\nu_1\rp\in C \lp\R^{m_1},\R^{n_1}\rp$ and $\real_{\rect}\lp\nu_2\rp\in C\lp\R^{m_2},\R^{n_2}\rp$. It is then the case that $\lp\real_{\act}(\nu_1\boxminus\nu_2)\rp\lp\begin{bmatrix}
\end{bmatrix}\rp$ for $x \in\R^{m_1}$, $x' \in\R^{m_2}$, up to transposition.
\end{lemma}
\begin{proof}
Note that this is a consequence of the commutativity of summation in the exponents of (\ref{(5.3.3)}), and the fact that switching $\nu_1$ and $\nu_2$ with a transposition results in a transposed output for transposed input.
\end{proof}
\begin{lemma}\label{5.3.4}
Let $\act\in C \lp\R, \R\rp$, $n \in\N$, and $\nu_1,\nu_2,...,\nu_n\in\neu$ satisfy $\dep(\nu_1)=\dep(\nu_2)=\cdots=\dep(\nu_n)$, and let $\nu=\boxminus_{i=1}^n \nu_i$. It is then the case that $\real_{\act}\lp\nu\rp\in C \lp\R^{\sum_{i=1}^n \inn(\nu_i)}, \R^{\sum^n_{i=1}\out(\nu_i)}\rp$.
Let $L =\dep(\nu_1)$, and let $l_{i,0},l_{i,1}...l_{i,L}\in\N$ satisfy for all $i \in\{1,2,...,n\}$ that $\lay(\nu_i)=\lp l_{i,0}, l_{i,1},...,l_{i,L}\rp$. Furthermore let $\lp\lp W_{i,1},b_{i,1}\rp, \lp W_{i,2},b_{i,2}\rp , ..., \lp W_{i,L},b_{i,L}\rp\rp\in\\\lp\bigtimes^L_{j=1}\lb\R^{l_{i,j}\times l_{i,j-1}}\times\R^{l_{i,j}}\rb\rp$ satisfy for all $i \in\{1,2,...,n\}$ that:
Let $\alpha_j \in\N$ with $j \in\{0,1,...,L\}$ satisfy that $\alpha_j =\sum^n_{i=1} l_{i,j}$ and let \\$\lp\lp A_1,b_1\rp, \lp A_2,b_2\rp,...,\lp A_L,b_L \rp\rp\in\lp\bigtimes^L_{j=1}\lb\R^{\alpha_{j}\times\alpha_{j-1}}\times\R^{\alpha_{j}}\rb\rp$ satisfy that:
See Remark \ref{5.3.2}. Let $\lp x_{i,0},x_{i,1},...,x_{i,L-1}\rp\in\lp\R^{l_{i,0}}\times\R^{l_{i,1}}\times\cdots\times\R^{l_{i,L-1}}\rp$ satisfy for all $i \in\{1,2,...,n\}$ and $k \in\N\cap\lp0,L \rp$ that:
Note that (\ref{5.3.5}) demonstrates that $\inn\lp\boxminus_{i=1}^n\nu_i \rp=\alpha_0$ and $\out\lp\boxminus^n_{i=1}\nu_i \rp=\alpha_L$. This, Item (ii) of Lemma \ref{5.1.8}, and the fact that for all $i \in\{1,2,...,n\}$ it is the case that $\inn(\nu_i)= l_{i,0}$ and $\out(\nu_i)= l_{i,L}$ ensure that:
\begin{align}
\real_{\act}\lp\boxminus^n_{i=1}\nu_i\rp\in C \lp\R^{\alpha_0}, \R^{\alpha_L}\rp&= C\lp\R^{\sum^n_{i=1}l_{i,0}}, \R^{\sum_{i=1}^n l_{i,L}}\rp\nonumber\\
&= C \lp\R^{\sum^n_{i=1}\inn(\nu_i)}, \R^{\sum_{i=1}^n \out(\nu_i)}\rp\nonumber
We will often encounter neural networks that we want to stack but that have unequal depths. Definition \ref{5.2.5} only deals with neural networks of the same depth. We handle this situation by padding the shallower neural network so that both networks have the same depth before stacking. This padding is done with the tunneling neural network defined below.
We define the tunneling neural network, denoted $\tun^d_n$ for $n\in\N$ and $d\in\N$, by:
\begin{align}
\tun^d_n = \begin{cases}
\aff_{\mathbb{I}_d,0}&:n= 1 \\
\id_d &: n=2 \\
\bullet^{n-2}\id_d &: n \in\N\cap [3,\infty)
\end{cases}
\end{align}
We will often drop the superscript $d$ when $d=1$; that is, $\tun_n$ by itself will be used to denote $\tun_n^1$.
\end{definition}
\begin{remark}
We will discuss some properties of the $\tun^d_n$ network in Section \ref{sec_tun}.
\end{remark}
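As a concrete illustration, the following \texttt{R} sketch builds $\tun^d_n$ for the ReLU activation by repeatedly composing an identity network realizing $x\mapsto\max\{x,0\}-\max\{-x,0\}$, assuming $\id_d$ is that standard two-layer ReLU identity network. It reuses the \texttt{comp\_nn} and \texttt{inst\_nn} sketches from above, and all names are illustrative.
\begin{lstlisting}[language=R]
# Sketch of the ReLU identity network id_d and the tunnel tun^d_n.
id_nn <- function(d) {
  list(list(W = rbind(diag(d), -diag(d)), b = rep(0, 2 * d)),  # x -> (x, -x)
       list(W = cbind(diag(d), -diag(d)), b = rep(0, d)))      # (u, v) -> u - v
}
tun_nn <- function(n, d = 1) {
  if (n == 1) return(list(list(W = diag(d), b = rep(0, d))))   # Aff_{I_d, 0}
  nu <- id_nn(d)
  while (length(nu) < n) nu <- comp_nn(id_nn(d), nu)           # pad to depth n
  nu
}
inst_nn(tun_nn(4), -3)  # returns -3: the tunnel realizes the identity
\end{lstlisting}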
\begin{definition}
Let $n \in\N$, and $\nu_1,\nu_2,...,\nu_n \in\neu$. We define the stacking of neural networks of unequal depth, denoted $\DDiamond^n_{i=1}\nu_i$, as the neural network given by:
Affine neural networks form an important class of neural networks. By virtue of being only one layer deep, they may be instantiated with any activation function whatsoever and still realize affine transformations; see Definition \ref{def:inst}. In addition, under composition they are absorbed into the network they are composed with, i.e. they do not change the depth of a neural network once composed into it; see Lemma \ref{comp_prop}.
Let $m,n \in\N$, $W \in\R^{m \times n}$, and $b \in\R^m$. We denote by $\aff_{W,b}\in\lp\R^{m\times n}\times\R^m \rp\subsetneq\neu$ the neural network given by $\aff_{W,b}=((W,b))$.
Note that (i) is a consequence of Definitions \ref{5.1.2} and \ref{5.3.1}. Note next that $\aff_{W,b}=((W,b))\in(\R^{m\times n}\times\R^m)\subsetneq\neu$. Then (\ref{5.1.11}) tells us that $\lp\real_{\act}(\aff_{W,b})\rp\lp x\rp= Wx+b$ for all $x\in\R^n$, which in turn proves (ii) and (iii).
Given $W\in\R^{m\times n}$ and $b \in\R^{m}$, it is the case, according to Definition \ref{paramdef}, that $\param(\aff_{W,b})= mn + m$.
\end{remark}
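In the list-of-layers representation used in the sketches above, the parameter count is simply the total number of weight and bias entries; an illustrative \texttt{R} helper (the name \texttt{param\_nn} is hypothetical):
\begin{lstlisting}[language=R]
# Sketch: total number of weight and bias entries of a network.
param_nn <- function(nu) {
  sum(vapply(nu, function(l) length(l$W) + length(l$b), numeric(1)))
}
param_nn(list(list(W = matrix(0, 3, 2), b = rep(0, 3))))  # 3*2 + 3 = 9
\end{lstlisting}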
\begin{remark}
For an \texttt{R} implementation, see Listing \ref{affn}.
\item For all $\act\in C(\R,\R)$, $m\in\N$, $W \in\R^{m \times\out(\nu)}$, $B \in\R^m$, we have that $\real_{\act}(\aff_{W,B}\bullet\nu)\in C\lp\R^{\inn(\nu)},\R^m\rp$.
\item For all $\act\in C(\R,\R)$, $m\in\N$, $W \in\R^{m \times\out(\nu)}$, $B \in\R^m$, $x \in\R^{\inn(\nu)}$ that:
\begin{align}
\lp\real_{\act}\lp\aff_{W,B}\bullet\nu\rp\rp\lp x \rp= W \lp\real_{\act}\lp\nu\rp\rp\lp x \rp +B
\end{align}
\item For all $n\in\N$, $W \in\R^{\inn(\nu)\times n}$, $b \in\R^{\inn(\nu)}$ that:
\begin{align}
\lay(\nu\bullet\aff_{W,b}) = \lp n, \wid_1(\nu), \wid_2(\nu),...,\wid_{\dep(\nu)}(\nu) \rp\in\N^{\dep(\nu)+1}
\end{align}
\item For all $\act\in C(\R,\R)$, $n\in\N$, $W \in\R^{\inn(\nu)\times n}$, $b \in\R^{\inn(\nu)}$ that $\real_{\act}\lp\nu\bullet\aff_{W,b}\rp\in C \lp\R^n, \R^{\out(\nu)}\rp$ and,
\item For all $\act\in C(\R,\R)$, $n\in\N$, $W \in\R^{\inn(\nu)\times n}$, $b \in\R^{\inn(\nu)}$, $x \in\R^n$ that:
\begin{align}
\lp\real_{\act}\lp\nu\bullet\aff_{W,b}\rp\rp\lp x \rp = \lp\real_{\act}\lp\nu\rp\rp\lp Wx+b \rp
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
From Lemma \ref{5.3.2} we see that $\real_{\act}(\aff_{W,b})\in C(\R^n,\R^m)$ is given for all $x\in\R^n$ by $\lp\real_{\act}(\aff_{W,b})\rp\lp x\rp= Wx + b$. This and Lemma \ref{comp_prop} prove (i)--(vi).
\end{proof}
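A hypothetical numerical check of the pre- and post-composition behaviour described above, using the \texttt{comp\_nn} and \texttt{inst\_nn} sketches from earlier (the helper \texttt{aff} simply wraps a single affine layer):
\begin{lstlisting}[language=R]
# Sketch: affine networks compose without increasing depth.
aff <- function(W, b) list(list(W = W, b = b))
set.seed(2)
nu <- list(list(W = matrix(rnorm(6), 3, 2), b = rnorm(3)),
           list(W = matrix(rnorm(3), 1, 3), b = rnorm(1)))
W <- matrix(rnorm(2), 2, 1); b <- rnorm(2)   # W has out(nu) = 1 columns
x <- rnorm(2)
all.equal(inst_nn(comp_nn(aff(W, b), nu), x),
          as.vector(W %*% inst_nn(nu, x) + b))   # Aff_{W,b} bullet nu
V <- matrix(rnorm(6), 2, 3); c0 <- rnorm(2)  # V has inn(nu) = 2 rows
z <- rnorm(3)
all.equal(inst_nn(comp_nn(nu, aff(V, c0)), z),
          inst_nn(nu, as.vector(V %*% z + c0)))  # nu bullet Aff_{V,c0}
\end{lstlisting}
Note that both composed networks in this sketch still have depth $2=\dep(\nu)$, in line with Lemma \ref{comp_prop}.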
\begin{corollary}\label{affcor}
Let $m,n \in\N$, and $W \in\R^{m \times n}$ and $b \in\R^m$. Let $\nu\in\neu$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item for all $\aff_{W,b}\in\neu$ with $\inn\lp\aff_{W,b}\rp=\out\lp\nu\rp$ that:
Let it be the case that $\lay\lp\nu\rp=\lp l_0,l_1,...,l_L\rp$ for $l_0,l_1,...,l_L,L \in\N$. Lemma \ref{5.3.3}, Item (i), and Lemma \ref{comp_prop} then tell us that:
Let $\mathfrak{a}_1,\mathfrak{a}_2$ be two affine neural networks as defined in Definition \ref{def:aff}. It is then the case that $\mathfrak{a}_1\boxminus\mathfrak{a}_2$ is also an affine neural network.
\end{lemma}
\begin{proof}
This follows straightforwardly from Definition \ref{def:stacking}, where, given that $\mathfrak{a}_1=\lp\lp W_1,b_1\rp\rp$ and $\mathfrak{a}_2=\lp\lp W_2,b_2\rp\rp$, their stacking is the neural network $\lp\lp\diag\lp W_1,W_2\rp,b_1\frown b_2\rp\rp$, which is clearly an affine neural network.
Let $n,k \in\N$ and let $\sm_{n,k}\in\neu$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item$\dep\lp\sm_{n,k}\rp=1$
\item$\param\lp\sm_{n,k}\rp= nk^2+k$
\end{enumerate}
\end{lemma}
\begin{proof}
(i) is a consequence of Definition $\ref{5.3.1}$ and (ii) follows from the structure of $\sm_{n,k}$.
\end{proof}
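As an illustration, $\sm_{n,k}$ can be realized as a single affine layer whose weight matrix consists of $n$ copies of the $k\times k$ identity placed side by side; a hypothetical \texttt{R} sketch, consistent with Items (i) and (ii) above and reusing the \texttt{param\_nn} and \texttt{inst\_nn} helpers sketched earlier:
\begin{lstlisting}[language=R]
# Sketch of the summation network sm_{n,k}: one affine layer whose weight
# matrix is n copies of the k x k identity placed side by side.
sum_nn <- function(n, k) {
  W <- do.call(cbind, replicate(n, diag(k), simplify = FALSE))
  list(list(W = W, b = rep(0, k)))
}
param_nn(sum_nn(3, 2))                      # 3 * 2^2 + 2 = 14
inst_nn(sum_nn(3, 2), c(1, 2, 3, 4, 5, 6))  # (1+3+5, 2+4+6) = (9, 12)
\end{lstlisting}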
\begin{definition}[Sum of ANNs of the same depth and same end widths]\label{def:nn_sum}
Let $u,v \in\Z$ with $u \leqslant v$. Let $\nu_u,\nu_{u+1},...,\nu_v \in\neu$ satisfy for all $i \in\N\cap[u,v]$ that $\dep(\nu_i)=\dep(\nu_u)$, $\inn(\nu_i)=\inn(\nu_u)$, and $\out(\nu_i)=\out(\nu_u)$. We then denote by $\oplus^v_{i=u}\nu_i$ or alternatively $\nu_u \oplus\nu_{u+1}\oplus\hdots\oplus\nu_v$ the neural network given by:
Let $\nu_1, \nu_2\in\neu$ satisfy that $\dep(\nu_1)=\dep(\nu_2)= L$, $\inn(\nu_1)=\inn(\nu_2)$, and $\out(\nu_1)=\out(\nu_2)$, and let $\lay(\nu_1)=\lp l_{1,0},l_{1,1},...,l_{1,L}\rp$ and $\lay\lp\nu_2\rp=\lp l_{2,0}, l_{2,1},...,l_{2,L}\rp$; it is then the case that:
Let $\lay\lp\nu_1\rp=\lp l_0,l_1,...,l_L\rp$ where for all $i \in\{0,1,...,L \}$ it is the case that $l_i,L \in\N$. Corollary \ref{cor:sameparal} then tells us that:
Let $\nu_1, \nu_2\in\neu$ satisfy that $\dep(\nu_1)=\dep(\nu_2)= L$, $\inn(\nu_1)=\inn(\nu_2)$, and $\out(\nu_1)=\out(\nu_2)$, and let $\lay(\nu_1)=\lp l_{1,0},l_{1,1},...,l_{1,L}\rp$ and $\lay\lp\nu_2\rp=\lp l_{2,0}, l_{2,1},...,l_{2,L}\rp$; it is then the case that:
\begin{align}
\dep\lp\nu_1 \oplus\nu_2 \rp =L
\end{align}
\end{lemma}
\begin{proof}
Note that $\dep\lp\cpy_{n,k}\rp=1=\dep\lp\sm_{n,k}\rp$ for all $n,k \in\N$. Note also that $\dep\lp\nu_1\boxminus\nu_2\rp=\dep\lp\nu_1\rp=\dep\lp\nu_2\rp$ and that for $\nu,\mu\in\neu$ it is the case that $\dep\lp\nu\bullet\mu\rp=\dep\lp\nu\rp+\dep\lp\mu\rp-1$. Thus:
Let $\nu_1,\nu_2\in\neu$, such that $\dep(\nu_1)=\dep(\nu_2)= L$, $\inn(\nu_1)=\inn(\nu_2)= l_0$, and $\out(\nu_1)=\out(\nu_2)= l_L$. It is then the case that $\real(\nu_1\oplus\nu_2)=\real(\nu_2\oplus\nu_1)$, i.e., the instantiated sum of ANNs of the same depth and same end widths is commutative.
\end{lemma}
\begin{proof}
Let $\nu_1=\lp(W_1,b_1),(W_2,b_2),...,(W_L,b_L)\rp$ and let $\nu_2=\lp(W'_1,b'_1),(W'_2,b'_2),...,(W_L', b_L')\rp$. Note that Definition $\ref{5.2.5}$ then tells us that:
\begin{align}
\nu_1 \boxminus\nu_2 = \lp\lp
\begin{bmatrix}
W_1 & 0 \\
0 & W_1'
\end{bmatrix}, \begin{bmatrix}
b_1 \\
b_1'
\end{bmatrix}
\rp,\lp\begin{bmatrix}
W_2 & 0 \\
0 & W_2'
\end{bmatrix}, \begin{bmatrix}
b_2\\
b_2'
\end{bmatrix}\rp,..., \right. \nonumber\\
\left.
\lp\begin{bmatrix}
W_L & 0 \\
0 & W_L'
\end{bmatrix}, \begin{bmatrix}
b_L \\
b_L'
\end{bmatrix}\rp\rp\nonumber
\end{align}
Note also that by Claims $\ref{5.4.4}$ and $\ref{5.4.5}$ and Definition \ref{5.3.1} we know that:
Let $ l_0,l_1,...,l_L \in\N$. Let $\nu\in\neu$ with $\lay(\nu)=\lp l_0,l_1,...,l_L \rp$. There then exists a neural network $\zero_{l_0,l_1,...,l_L}\in\neu$ such that $\real(\nu\oplus\zero_{l_0,l_1,...,l_L})=\real(\zero_{l_0,l_1,...,l_L}\oplus\nu)=\real\lp\nu\rp$.
\end{lemma}
\begin{proof}
Let $\nu=\lp\lp W_1, b_1\rp, \lp W_2, b_2\rp,..., \lp W_L,b_L \rp\rp$, where $W_1\in\R^{l_1\times l_0}$, $b_1\in\R^{l_1}$, $W_2\in\R^{l_2\times l_1}$, $b_2\in\R^{l_2},...,W_L \in\R^{l_L \times l_{L-1}}$, $b_L \in\R^{l_L}$. Denote by $\zero_{l_0,l_1,...,l_L}$ the neural network which for all $l_0,l_1,...,l_L \in\N$ is given by:
Given neural networks $\nu_1,\nu_2,\nu_3\in\neu$ with the same depth $L$, the same starting width $l_0$, and the same finishing width $l_L$, it is then the case that $\real\lp\lp\nu_1\oplus\nu_2\rp\oplus\nu_3\rp=\real\lp\nu_1\oplus\lp\nu_2\oplus\nu_3\rp\rp$, i.e. the instantiation with a continuous activation function of $\oplus$ is associative.
\end{lemma}
\begin{proof}
Let $\nu_1=\lp\lp W^1_1,b^1_1\rp, \lp W^1_2,b^1_2\rp, ..., \lp W^1_L,b^1_L \rp\rp$, $\nu_2=\lp\lp W^2_1,b^2_1\rp, \lp W^2_2,b^2_2\rp,..., \lp W^2_L, b^2_L \rp\rp$, and $\nu_3=\lp\lp W^3_1,b^3_1\rp ,\lp W^3_2,b^3_2\rp,..., \lp W^3_L,b^3_L \rp\rp$. Then (\ref{5.4.12}) tells us that:
Let $\nu, \mu\in\neu$ have the same depth and end-widths. It is then the case that $\real_{\act}\lp\nu\oplus\mu\rp=\real_{\act}\lp\nu\rp+\real_{\act}\lp\mu\rp$.
\end{lemma}
\begin{proof}
Let $\nu=\lp\lp W_1,b_1\rp, \lp W_2,b_2\rp,...,\lp W_L,b_L \rp\rp$ and $\mu=\lp\lp W'_1,b'_1\rp, \lp W_2',b_2'\rp,...,\lp W_L',b_L' \rp\rp$. Note now that by (\ref{5.4.12}) we have that:
\subsection{Sum of ANNs of Unequal Depth But Same End-widths}
\begin{definition}[Sum of ANNs of different depths but same end widths]
Let $n\in\N$, and let $\nu_1,\nu_2,...,\nu_n \in\neu$ have the same end-widths. We define the sum of neural networks of unequal depth, denoted $\dplus_{i=1}^n\nu_i \in\neu$, as:
Note that Lemma \ref{6.2.2} tells us that for all $n\in\N$ and $x\in\R$ it is the case that $\real_{\rect}\lp\tun_n\rp\lp x\rp= x$. This, combined with Lemma \ref{comp_prop}, then tells us that for all $n\in\N$ and $\nu\in\neu$ it is the case that:
\section{Linear Combinations of ANNs and Their Properties}
\begin{definition}[Scalar left-multiplication with an ANN]\label{slm}
We will denote by $(\cdot)\triangleright(\cdot): \R\times\neu\rightarrow\neu$ the function satisfying for all $\lambda\in\R$ and $\nu\in\neu$ that $\lambda\triangleright\nu=\aff_{\lambda\mathbb{I}_{\out(\nu)},0}\bullet\nu$.
\end{definition}
\begin{definition}[Scalar right-multiplication with an ANN]
We will denote by $(\cdot)\triangleleft(\cdot): \neu\times\R\rightarrow\neu$ the function satisfying for all $\nu\in\neu$ and $\lambda\in\R$ that $\nu\triangleleft\lambda=\nu\bullet\aff_{\lambda\mathbb{I}_{\inn(\nu)},0}$.
\end{definition}
\begin{remark}
Note that although $\lambda\in\R$ is a scalar, the neural network in question, properly speaking, must always be referred to as $\lambda\triangleright$ or $\triangleleft\lambda$, and we shall do so whenever this comes up in any neural network diagrams. This is by analogy with, for example, $\log_\lambda$ or $\sqrt[\lambda]{}$ for $\lambda\neq0$, where the parameter $\lambda$ is generally written explicitly, except for $\lambda=10$ for the logarithm or $\lambda=2$ for the root.
\end{remark}
\begin{remark}
For an \texttt{R} implementation, see Listing \ref{scalar_mult}.
\end{remark}
\begin{lemma}\label{5.6.3}
Let $\lambda\in\R$ and $\nu\in\neu$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item$\lay(\lambda\triangleright\nu)=\lay(\nu)$
\item For all $\act\in C(\R, \R)$ that $\real_{\act}(\lambda\triangleright\nu)\in C \lp\R^{\inn(\nu)}, \R^{\out(\nu)}\rp$
\item For all $\act\in C(\R,\R)$, and $x \in\R^{\inn(\nu)}$ that:
Let $\nu\in\neu$ be such that $\lay(\nu)=\lp l_0,l_1,...,l_L \rp$ and $\dep(\nu)= L$, where $l_0,l_1,...,l_L,L \in\N$. Then Item (i) of Lemma \ref{5.3.2} tells us that:
This proves Item (i). Items (ii) and (iii) of Lemma \ref{5.3.2} then prove that for all $\act\in C(\R,\R)$ and $x \in\R^{\inn(\nu)}$, $\real_{\act}\lp\lambda\triangleright\nu\rp\in C \lp\R^{\inn(\nu)},\R^{\out(\nu)}\rp$ and is given by:
\lp\real_{\act}\lp\lambda\triangleright\nu\rp\rp\lp x \rp&= \lp\real_{\act}\lp\aff_{\lambda\mathbb{I}_{\out(\nu)},0}\bullet\nu\rp\rp\lp x \rp\nonumber\\
&= \lambda\mathbb{I}_{\out(\nu)}\lp\lp\real_{\act}\lp\nu\rp\rp\lp x \rp\rp = \lambda\lp\lp\real_{\act}\lp\nu\rp\rp\lp x \rp\rp
\end{align}
This establishes Items (ii)\textemdash(iii), completing the proof.
\end{proof}
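A hypothetical check of Items (ii) and (iii) with the sketches from earlier (here $\aff_{\lambda\mathbb{I}_{\out(\nu)},0}$ is built with the illustrative \texttt{aff}, \texttt{comp\_nn}, and \texttt{inst\_nn} helpers sketched above):
\begin{lstlisting}[language=R]
# Sketch: scalar left-multiplication scales the output of nu by lambda.
lam <- 2.5
nu <- list(list(W = matrix(rnorm(6), 3, 2), b = rnorm(3)),
           list(W = matrix(rnorm(3), 1, 3), b = rnorm(1)))
x <- c(0.3, -1.2)
left_mult <- comp_nn(aff(lam * diag(1), 0), nu)         # Aff_{lam I, 0} bullet nu
all.equal(inst_nn(left_mult, x), lam * inst_nn(nu, x))  # TRUE
\end{lstlisting}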
\begin{lemma}\label{5.6.4}
Let $\lambda\in\R$ and $\nu\in\neu$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item$\lay(\nu\triangleleft\lambda)=\lay(\nu)$
\item For all $\act\in C \lp\R, \R\rp$ that $\real_{\act}(\nu\triangleleft\lambda)\in C \lp\R^{\inn(\nu)}, \R^{\out(\nu)}\rp$
\item For all $\act\in C \lp\R, \R\rp$, and $x \in\R^{\inn(\nu)}$ that:
\begin{align}
\lp\real_{\act}\lp\nu\triangleleft\lambda\rp\rp\lp x \rp = \lp\real_{\act}(\nu)\rp\lp\lambda x \rp
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
Let $\nu\in\neu$ be such that $\lay(\nu)=\lp l_0,l_1,...,l_L \rp$ and $\dep(\nu)= L$, where $l_0,l_1,...,l_L, L \in\N$. Then Item (i) of Lemma \ref{5.3.2} tells us that:
This proves Item (i). Items (v) and (vi) of Lemma \ref{5.3.3} then prove that for all $\act\in C(\R,\R)$ and $x \in\R^{\inn(\nu)}$, $\real_{\act}\lp\nu\triangleleft\lambda\rp\in C\lp\R^{\inn(\nu)},\R^{\out(\nu)}\rp$ and is given by:
\lp\real_{\act}\lp\nu\triangleleft\lambda\rp\rp\lp x \rp&= \lp\real_{\act}\lp\nu\bullet\aff_{\lambda\mathbb{I}_{\inn(\nu)},0}\rp\rp\lp x \rp\nonumber\\
&= \lp\real_{\act}\lp\nu\rp\rp\lp\lp\real_{\act}\lp\aff_{\lambda\mathbb{I}_{\inn(\nu)},0}\rp\rp\lp x \rp\rp\nonumber\\
Let $\nu,\mu\in\neu$ have the same depth and end-widths, and let $\lambda\in\R$. It is then the case for all $\act\in C \lp\R, \R\rp$ and $x \in\R^{\inn(\nu)}$ that:
\begin{align}
\real_{\act}\lp\lp\nu\oplus\mu\rp\triangleleft\lambda\rp\lp x \rp&= \real_{\act}\lp\lp\nu\triangleleft\lambda\rp\oplus\lp\mu\triangleleft\lambda\rp\rp\lp x \rp\nonumber\\
&= \lp\real_{\act}\lp\nu\rp\rp\lp\lambda x \rp + \lp\real_{\act}\lp\mu\rp\rp\lp\lambda x \rp\nonumber
\end{align}
\end{lemma}
\begin{proof}
Let $\nu=\lp\lp W_1,b_1\rp, \lp W_2,b_2\rp,...,\lp W_L,b_L \rp\rp$ and $\mu=\lp\lp W'_1,b'_1\rp, \lp W'_2,b'_2\rp,...,\lp W'_L,b'_L \rp\rp$. Then from Lemma \ref{5.6.4} and (\ref{5.4.12}) we have that:
\begin{align}
&\lp\real_{\act}\lp\lp\nu\oplus\mu\rp\triangleleft\lambda\rp\rp\lp x \rp\nonumber\\&= \lp\real_{\act}\lp\nu\oplus\mu\rp\rp\lp\lambda x \rp\nonumber\\
Let $\nu,\mu\in\neu$ have the same depth and end-widths, and let $\lambda\in\R$. It is then the case for all $\act\in C \lp\R, \R\rp$ and $x \in\R^{\inn(\nu)}$ that:
\begin{align}
\real_{\act}\lp\lambda\triangleright\lp\nu\oplus\mu\rp\rp\lp x \rp&= \real_{\act}\lp\lp\lambda\triangleright\nu\rp\oplus\lp\lambda\triangleright\mu\rp\rp\lp x \rp\nonumber\\
&= \lambda\cdot\lp\real_{\act}\lp\nu\rp\rp\lp x \rp + \lambda\cdot\lp\real_{\act}\lp\mu\rp\rp\lp x \rp\nonumber
\end{align}
\end{lemma}
\begin{proof}
Let $\nu=\lp\lp W_1,b_1\rp, \lp W_2,b_2\rp,...,\lp W_L,b_L \rp\rp$ and $\mu=\lp\lp W'_1,b'_1\rp, \lp W'_2,b'_2\rp,...,\lp W'_L,b'_L \rp\rp$. Then from Lemma \ref{5.6.4} and (\ref{5.4.12}) we have that:
\begin{align}
&\lp\real_{\act}\lp\lambda\triangleright\lp\nu\oplus\mu\rp\rp\rp\lp x \rp\nonumber\\&= \lambda\cdot\lp\real_{\act}\lp\nu\oplus\mu\rp\rp\lp x \rp\nonumber\\
This, together with Lemma \ref{5.5.11}, completes the proof.
\end{proof}
\begin{lemma}\label{5.6.5}
Let $u,v \in\Z$ with $u \leqslant v$ and $n = v-u+1$. Let $c_u,c_{u+1},..., c_v \in\R$. Let $\nu_u, \nu_{u+1},...,\nu_v, \mu\in\neu$ and $B_{u}, B_{u+1},...,B_v \in\R^{\inn(\mu)}$ satisfy that $\lay(\nu_u)=\lay(\nu_{u+1})= ...=\lay(\nu_v)$ and further that:
\item for all $\act\in C \lp\R ,\R\rp$ that $\real_{\act}(\mu)\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$, and
\item for all $\act\in C \lp\R, \R\rp$ and $x \in\R^{\inn(\nu_u)}$ that:
\begin{align}
\lp\real_{\act}\lp\mu\rp\rp\lp x \rp = \sum^v_{i=u} c_i \lp\real_{\act}\lp\nu_i \rp\rp\lp x + B_i \rp
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
Assume the hypothesis that $\lay(\nu_u)=\lay(\nu_{u+1})= ... =\lay(\nu_v)$. Note that Item (i) of Lemma \ref{5.3.2} gives us for all $i \in\{u,u+1,...,v\}$ that:
This establishes Item (i). Items (v) and (vi) from Lemma \ref{5.3.3} tell us that for all $i \in\{ u,u+1,...,v\}$, $\act\in C(\R,\R)$, $x \in\R^{\inn(\nu_u)}$, it is the case that $\real_{\act}\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)}, B_i}\rp\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and further that:
\begin{align}
\lp\real_{\act}\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},B_i}\rp\rp\lp x \rp = \lp\real_{\act}\lp\nu_i \rp\rp\lp x + B_i \rp
\end{align}
This along with \cite[Lemma~3.14]{Grohs_2022} ensures that for all $i \in\{u,u+1,...,v\}$, $\act\in C \lp\R, \R\rp$, $x \in\R^{\inn(\nu_u)}$, it is the case that:
\begin{align}
\real_{\act}\lp c_i \triangleright\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},B_i}\rp\rp\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp
\end{align}
and:
\begin{align}
\lp\real_{\act}\lp c_i \triangleright\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},B_i}\rp\rp\rp\lp x \rp = c_i \lp\real_{\act}\lp\nu_i \rp\rp\lp x + B_i \rp
\end{align}
Now observe that \cite[Lemma~3.28]{Grohs_2022} and (\ref{5.6.13}) ensure that for all $\act\in C \lp\R, \R\rp$, $x \in\R^{\inn(\nu_u)}$, it is the case that $\real_{\act}\lp\mu\rp\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and that:
\begin{align}
\lp\real_{\act}\lp\mu\rp\rp\lp x \rp&= \lp\real_{\act}\lp\oplus^v_{i=u}\lp c_i \triangleright\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},B_i}\rp\rp\rp\rp\lp x \rp\nonumber\\
&= \sum^v_{i=u}\lp\real_{\act}\lp c_i \triangleright\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},B_i}\rp\rp\rp\lp x \rp\nonumber\\
This establishes items (ii)--(iii); thus, the proof is complete.
\end{proof}
\begin{lemma}\label{5.6.6}
Let $u,v \in\Z$ with $u \leqslant v$. Let $c_u,c_{u+1},..., c_v \in\R$. Let $\nu_u, \nu_{u+1},...,\nu_v, \mu\in\neu$ and $b_{u}, b_{u+1},...,b_v \in\R^{\inn(\mu)}$ satisfy that $\lay(\nu_u)=\lay(\nu_{u+1})= ...=\lay(\nu_v)$ and further that:
\item for all $\act\in C \lp\R ,\R\rp$ that $\real_{\act}(\mu)\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$, and
\item for all $\act\in C \lp\R, \R\rp$ and $x \in\R^{\inn(\nu_u)}$ that:
\begin{align}
\lp\real_{\act}\lp\mu\rp\rp\lp x \rp = \sum^v_{i=u}\lp\real_{\act}\lp\nu_i \rp\rp\lp c_ix + b_i \rp
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
Assume the hypothesis that $\lay(\nu_u)=\lay(\nu_{u+1})= ... =\lay(\nu_v)$. Note that Item (i) of Lemma \ref{5.3.2} gives us for all $i \in\{u,u+1,...,v\}$ that:
This establishes Item (i). Items (i) and (ii) from Lemma \ref{5.3.3} tell us that for all $i \in\{ u,u+1,...,v\}$, $\act\in C(\R,\R)$, $x \in\R^{\inn(\nu_u)}$, it is the case that $\real_{\act}\lp\aff_{\mathbb{I}_{\inn(\nu_i)}, b_i}\bullet\nu_i\rp\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and further that:
\begin{align}
\lp\real_{\act}\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i\rp\rp\lp x \rp = \lp\real_{\act}\lp\nu_i \rp\rp\lp x \rp + b_i
\end{align}
This along with Lemma \ref{5.6.4} ensures that for all $i \in\{u,u+1,...,v\}$, $\act\in C \lp\R, \R\rp$, $x \in\R^{\inn(\nu_u)}$, it is the case that:
\begin{align}
\real_{\act}\lp\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i\rp\triangleleft c_i\rp\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp
\end{align}
and:
\begin{align}
\lp\real_{\act}\lp\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i \rp\triangleleft c_i \rp\rp\lp x \rp = \lp\real_{\act}\lp\nu_i \rp\rp\lp c_i x + b_i \rp
\end{align}
Now observe that \cite[Lemma~3.28]{Grohs_2022} and (\ref{5.5.14}) ensure that for all $\act\in C \lp\R, \R\rp$, $x \in\R^{\inn(\nu_u)}$, it is the case that $\real_{\act}\lp\mu\rp\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and that:
\begin{align}
\lp\real_{\act}\lp\mu\rp\rp\lp x \rp&= \lp\real_{\act}\lp\oplus^v_{i=u}\lp\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i \rp\triangleleft c_i \rp\rp\rp\lp x \rp\\
&= \sum^v_{i=u}\lp\real_{\act}\lp\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i\rp\triangleleft c_i \rp\rp\lp x \rp\\
Note that Item (i) from Lemma \ref{5.6.5} establishes Item (i) and (\ref{5.5.20}); in addition, Items (v) and (vi) from Lemma \ref{5.3.3} tell us that for all $i \in\N\cap[u,v]$, $x \in\R^{\inn(\nu_u)}$, it holds that \\
$\real_{\act}\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)}, B_i}\rp\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and further that:
\lp\real_{\act}\lp\nu_i\bullet\aff_{\mathbb{I}_{\inn(\nu_i)},B_i}\rp\rp\lp x \rp = \lp\real_{\act}\lp\nu_i \rp\rp\lp x + B_i \rp
\end{align}
This, Lemma \ref{5.6.3} and \cite[Lemma~2.14, Item~(ii)]{grohs2019spacetime} show that for all $i \in\N\cap[u,v]$, $x \in\R^{\inn(\nu_u)}$, it holds that:
This combined with \cite[Lemma~3.28]{Grohs_2022} and (\ref{5.6.13}) demonstrate that for all $x \in\R^{\inn(\nu_u)}$ it holds that $\real_{\act}\lp\mu\rp\in C\lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and that:
\begin{align}
\lp\real_{\act}\lp\mu\rp\rp\lp x \rp&= \lp\real_{\act}\lp\boxplus^v_{i = u, \mathfrak{I}}\lp c_i \triangleright\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\rp\rp\rp\rp\lp x \rp\nonumber\\
&= \lp\real_{\act}\lp\oplus^v_{i=u}\ex_{L,\mathfrak{I}}\lp c_i \triangleright\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\rp\rp\rp\rp\lp x \rp\nonumber\\
\item it holds that $\real_{\act}(\mu)\in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$, and that,
\item it holds for all $ x \in\R^{\inn(\nu_u)}$ that:
\begin{align}
\lp\real_{\act}\lp\mu\rp\rp\lp x \rp = \sum^v_{i=u}\lp\real_{\act}\lp\nu_i \rp\rp\lp c_ix + b_i\rp
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
Note that Item (i) from Lemma \ref{5.6.6} establishes Item (i) and (\ref{5.5.20}); in addition, Items (ii) and (iii) from Lemma \ref{5.3.3} tell us that for all $i \in\N\cap[u,v]$, $x \in\R^{\inn(\nu_u)}$, it holds that $\real_{\act}\lp\aff_{\mathbb{I}_{\inn(\nu_i)}, b_i}\bullet\nu_i\rp \in C \lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and further that:
\begin{align}
\lp\real_{\act}\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i \rp\rp\lp x \rp = \lp\real_{\act}\lp\nu_i \rp\rp\lp x \rp + b_i
\end{align}
This, Lemma \ref{5.6.4} and \cite[Lemma~2.14, Item~(ii)]{grohs2019spacetime} show that for all $i \in\N\cap[u,v]$, $x \in\R^{\inn(\nu_u)}$, it holds that:
\lp\real_{\act}\lp\ex_{L,\mathfrak{I}}\lp\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i\rp\triangleleft c_i\rp\rp\rp\lp x \rp&= \lp\real_{\act}\lp c_i \triangleright\lp\nu_i \bullet\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\rp\rp\rp\lp x \rp\nonumber\\
&= \lp\real_{\act}\lp\nu_i \rp\rp\lp c_ix+b_i \rp
\end{align}
This and \cite[Lemma~3.28]{Grohs_2022} and (\ref{5.6.23}) demonstrate that for all $x \in\R^{\inn(\nu_u)}$ it holds that $\real_{\act}\lp\mu\rp\in C\lp\R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp$ and that:
\begin{align}
\lp\real_{\act}\lp\mu\rp\rp\lp x \rp&= \lp\real_{\act}\lp\boxplus^v_{i = u, \mathfrak{I}}\lp\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i\rp\triangleleft c_i\rp\rp\rp\lp x \rp\nonumber\\
&= \lp\real_{\act}\lp\oplus^v_{i=u}\ex_{L,\mathfrak{I}}\lp\lp\aff_{\mathbb{I}_{\inn(\nu_i)},b_i}\bullet\nu_i\rp\triangleleft c_i\rp\rp\rp\lp x \rp\nonumber\\
Conceptually, it will be helpful to construct what are called ``neural network diagrams''. They take inspiration from diagrams typically seen in the literature, for instance, \cite{vaswani_attention_2017}, \cite{arik_tabnet_2021}, and \cite{8099678}. They are constructed as follows.
Lines with arrows indicate the flow of data:
\begin{center}
\begin{tikzcd}
{}\arrow[rr, "x"]&&{}\\
{}&&{}\arrow[ll, "x"]
\end{tikzcd}
\end{center}
Named neural networks are always enclosed in boxes and set in \textsf{sans-serif} font:
\begin{center}
\begin{tikzpicture}
% Create a rectangular node with text inside
\node[draw, rectangle] at (0, 0) {$\aff_{a,b}$};
\end{tikzpicture}
\end{center}
Where possible, we seek to label the arrows going in and going out of a boxed neural network with the appropriate operations that take place:
\begin{center}
\begin{tikzpicture}
% Create a rectangular node with text inside
\node[draw, rectangle] (box) at (0, 0) {$\aff_{a,b}$};
% Draw an arrow from left to right going into the box