\chapter{Introduction and Basic Notions About Neural Networks}
We seek here to introduce a unified framework for artificial neural networks. This framework borrows from the work presented in \cite{grohsetal}, which was in turn inspired by work done in \cite{petersen_optimal_2018}. The most recent exposition of this framework can be found in \cite{bigbook}, and it is this exposition that our work is based on and extends. With this framework in place, we wish to study ANNs from the perspective of the number of parameters required to define neural networks that approximate solutions of certain PDEs. The \textit{curse of dimensionality} here refers to the growth in the number of parameters and in the depth of the neural networks necessary to approximate functions to a certain accuracy. Specifically, a scheme is said to have beaten the curse of dimensionality if the number of parameters and the depth necessary to approximate the underlying function to an accuracy $\ve$ (specifically, an upper bound of $\ve$ on the $1$-norm difference between the approximant and the function over the entire domain) grows only polynomially, or at least sub-exponentially, in $\frac{1}{\ve}$.
\section{The Basic Definition of ANNs and Instantiations of ANNs}
\begin{definition}[Rectifier Function]
Let $x \in \R$. We denote by $\rect: \R \rightarrow \R$ the function given by:
\begin{align}
\rect(x) = \max \left\{ 0,x\right\}
\end{align}
\end{definition}
\begin{remark}
By analogy, the multidimensional rectifier function, defined for $x = \lb x_1 \: x_2 \: \cdots \: x_n\rb^\intercal \in \R^n$, is:
\begin{align}
\rect ([x]_*) = \left[ \max\{ 0,x_1\} \: \max \{ 0,x_2\}\: \cdots \: \max\{ 0,x_n\}\right]^\intercal
\end{align}
\end{remark}
\begin{definition}[Artificial Neural Networks]\label{5.1.2}\label{def:nn_def}
Denote by $\neu$ the set given by:
\begin{align}
\neu = \bigcup_{L\in \N} \bigcup_{l_0,l_1,...,l_L \in \N} \lp \bigtimes^L_{k=1} \lb \R^{l_k \times l_{k-1}} \times \R^{l_k}\rb \rp
\end{align}
An artificial neural network is a tuple $\lp \nu, \param, \dep, \inn, \out, \hid, \lay, \wid \rp $ where $\nu \in \neu$ is equipped with the following functions (referred to as auxiliary functions), satisfying for all $\nu \in \lp \bigtimes^L_{k=1} \lb \R^{l_k \times l_{k-1}} \times \R^{l_k}\rb \rp$ that:
\begin{enumerate}[label = (\roman*)]
\item $\param: \neu \rightarrow \N$ denoting the number of parameters of $\nu$, given by:
\begin{align}\label{paramdef}
\param(\nu) = \sum^L_{k=1}l_k \lp l_{k-1}+1 \rp
\end{align}
\item $\dep: \neu \rightarrow \N$ denoting the number of layers of $\nu$ other than the input layer, given by:
\begin{align}
\dep(\nu) = L
\end{align}
\item $\inn:\neu \rightarrow \N$ denoting the width of the input layer, given by:
\begin{align}
\inn(\nu) = l_0
\end{align}
\item $\out: \neu \rightarrow \N$ denoting the width of the output layer, given by:
\begin{align}
\out(\nu) = l_L
\end{align}
\item $\hid: \neu \rightarrow \N_0$ denoting the number of hidden layers (i.e., layers other than the input and output), given by:
\begin{align}
\hid(\nu) = L-1
\end{align}
\item $\lay: \neu \rightarrow \bigcup_{L \in \N} \N^L$ denoting the width of layers as an $(L+1)$-tuple, given by:
\begin{align}
\lay(\nu) = \lp l_0,l_1,l_2,...,l_L \rp
\end{align}
We sometimes refer to this as the layer configuration or layer architecture of $\nu$.
\item $\wid_i: \neu \rightarrow \N_0$ denoting the width of layer $i$, given by:
\begin{align} \label{widthdef}
\wid_i(\nu) = \begin{cases}
l_i & i \leqslant L \\
0 & i > L
\end{cases}
\end{align}
\end{enumerate}
\end{definition}
Note that this implies that $\nu = ((W_1,b_1),(W_2,b_2),...,(W_L,b_L)) \in \lp \bigtimes^L_{k=1} \lb \R^{l_k \times l_{k-1}} \times \R^{l_k}\rb \rp$. Note that we denote by $\we_{(\cdot ), \nu} = (\we_{n,\nu})_{n\in \{1,2,...,L\}}: \{1,2,...,L\} \rightarrow \lp \bigcup_{m,k \in \N}\R^{m \times k} \rp $ and $\bi_{(\cdot),\nu} = \lp \bi_{n,\nu} \rp_{n \in \{1,2,...,L\}}: \{1,2,...,L\} \rightarrow \lp \bigcup_{m \in \N}\R^m \rp$ the functions that satisfy for all $n \in \{1,2,...,L\}$ that $\we_{n,\nu} = W_n$, i.e. the weight matrix of the neural network $\nu$ at layer $n$, and $\bi_{n,\nu} = b_n$, i.e. the bias vector of the neural network $\nu$ at layer $n$. We will call $l_0$ the \textit{starting width} and $l_L$ the \textit{finishing width}. Together, they will be referred to as \textit{end-widths}.
\begin{remark}
Notice that our definition varies somewhat from the conventional ones found in \cite{petersen_optimal_2018} and \cite{grohs2019spacetime} in that, whereas the former treat the auxiliary functions as defined on the set $\neu$ itself, we treat these auxiliary functions as something that elements of $\neu$ are endowed with. In other words, elements of $\neu$ may exist whose depths and parameter counts, for instance, are left undefined. Note that we develop this definition to align closely with popular deep-learning frameworks such as \texttt{PyTorch}, \texttt{TensorFlow}, and \texttt{Flux}, where, in principle, it is always possible to query the parameter count, depth, number of layers, and other auxiliary information. We will often say ``let $\nu\in \neu$'', with the understanding that the tuple $\nu$ together with its auxiliary functions is what is being referred to. This is analogous to saying that $X$ is a topological space when we mean the pair $\lp X,\tau\rp$, i.e. $X$ endowed with the topology $\tau$, or that $Y$ is a measure space when we mean the triple $\lp Y,\Omega, \mu\rp$, i.e. $Y$ endowed with the $\sigma$-algebra $\Omega$ and the measure $\mu$.
\end{remark}
\begin{definition}[Instantiations of Artificial Neural Networks with Activation Functions]\label{def:rlz}\label{def:inst}
Let $\act \in C \lp \R, \R \rp$. We denote by $\real_{\act}: \neu \rightarrow \lp \bigcup_{k,l \in \N} C \lp \R^k, \R^l \rp \rp$ the function satisfying for all $L \in \N$, $l_0,l_1,...,l_L \in \N$, $\nu = \lp \lp W_1, b_1 \rp , \lp W_2, b_2\rp ,...,\lp W_L, b_L \rp \rp \in \lp \bigtimes^L_{k=1} \lb \R^{l_k \times l_{k-1}} \times \R^{l_k}\rb \rp$, and $x_0 \in \R^{l_0}, x_1 \in \R^{l_1},...,x_{L-1} \in \R^{l_{L-1}}$ with $\forall k \in \N \cap (0,L):x_k = \act \lp \lb W_kx_{k-1}+b_k \rb_{*,*} \rp$ such that:
\begin{align}\label{5.1.11}
\real_{\act}\lp \nu \rp \in C \lp \R^{l_0}, \R^{l_L} \rp & \text{ and } & \lp \real_{\act}\lp \nu\rp \rp \lp x_0 \rp = W_Lx_{L-1}+b_L
\end{align}
\end{definition}
\begin{figure}
\begin{center}
\includegraphics[scale=0.5]{nn-example.png}
\end{center}
\caption{A neural network $\nu$ with $\lay \lp \nu \rp = \lp 6,8,6,3\rp$}
\end{figure}
\begin{remark}
For an \texttt{R} implementation see Listings \ref{nn_creator}, \ref{aux_fun}, \ref{activations}, and \ref{instantiation}.
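Independently of any particular implementation, the following small worked instance of Definition \ref{def:inst} may be helpful; the numerical values are chosen purely for illustration. Let $\nu \in \neu$ satisfy $\lay\lp \nu\rp = \lp 1,1,1\rp$ with $W_1 = \lb 2 \rb$, $b_1 = \lb -1\rb$, $W_2 = \lb 3 \rb$, and $b_2 = \lb 0 \rb$. Taking $\act = \rect$ and $x_0 = 1$ gives $x_1 = \rect\lp 2\cdot 1 - 1\rp = 1$, and hence $\lp \real_{\rect}\lp \nu\rp\rp\lp 1 \rp = 3\cdot 1 + 0 = 3$, whereas, for instance, $\lp \real_{\rect}\lp \nu\rp\rp\lp 0 \rp = 3\cdot\rect\lp -1\rp + 0 = 0$.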
\end{remark}
\begin{lemma}\label{5.1.8}
Let $\nu \in \neu$, it is then the case that:
\begin{enumerate}[label = (\roman*)]
\item $\lay(\nu) \in \N^{\dep(\nu)+1}$, and
\item for all $\act \in C \lp \R, \R \rp$, $\real_{\act}\lp \nu \rp \in C \lp \R^{\inn(\nu)},\R^{\out(\nu)}\rp $
\end{enumerate}
\end{lemma}
\begin{proof}
By assumption:
\begin{align}
\nu \in \neu = \bigcup_{L\in \N} \bigcup_{\lp l_0,l_1,...,l_L \rp \in \N^{L+1}} \lp \bigtimes^L_{k=1} \lb \R^{l_k \times l_{k-1}} \times \R^{l_k}\rb \rp
\end{align}
This ensures that there exist $l_0,l_1,...,l_L,L \in \N$ such that:
\begin{align}
\nu \in \lp \bigtimes^L_{j=1} \lb \R^{l_j \times l_{j-1}} \times \R^{l_j} \rb \rp
\end{align}
This also ensures that $\lay(\nu) = \lp l_0,l_1,...,l_L \rp \in \N^{L+1} = \N^{\dep(\nu)+1}$ and further that $\inn(\nu) = l_0$, $\out(\nu) = l_L$, and that $\dep(\nu) = L$. Together with ($\ref{5.1.11}$), this proves the lemma.
\end{proof}
\section{Compositions of ANNs}
The first operation we want to be able to do is to compose neural networks. Note that the composition is not done in the obvious way; in $\nu_1 \bullet \nu_2$, for instance, the first layer of $\nu_1$ and the last layer of $\nu_2$ are merged into a single layer.
\begin{definition}[Compositions of ANNs]\label{5.2.1}\label{def:comp}
We denote by $\lp \cdot \rp \bullet \lp \cdot \rp: \{ \lp \nu_1,\nu_2 \rp \in \neu \times \neu: \inn(\nu_1) = \out (\nu_2) \} \rightarrow \neu$ the function satisfying for all $L,M \in \N, l_0,l_1,...,l_L, m_0, m_1,...,m_M \in \N$, $\nu_1 = \lp \lp W_1, b_1 \rp, \lp W_2, b_2 \rp,...,\lp W_L,b_L \rp \rp \in \lp \bigtimes^L_{k=1} \lb \R^{l_k \times l_{k-1}} \times \R^{l_k}\rb \rp$, and $\nu_2 = \lp \lp W'_1, b'_1 \rp, \lp W'_2, b'_2 \rp,... \lp W'_M, b'_M \rp \rp \in \lp \bigtimes^M_{k=1} \lb \R^{m_k \times m_{k-1}} \times \R^{m_k}\rb \rp$ with $l_0 = \inn(\nu_1)= \out(\nu_2) = m_M$ that:
\begin{align}\label{5.2.1}
&\nu_1 \bullet \nu_2 =\\ &\begin{cases}
(( W'_1,b'_1 ), ( W'_2,b'_2 ), ...,( W'_{M-1}, b'_{M-1}), ( W_1W'_M, W_1b'_{M} + b_1), (W_2, b_2 ),\\..., ( W_L,b_L )) & :( L> 1 ) \land ( M > 1 ) \\
((W_1W'_1,W_1b'_1+b_1),(W_2,b_2), (W_3,b_3),...,(W_L,b_L)) & :(L>1) \land (M=1) \\
((W'_1, b'_1),(W'_2,b'_2), ..., (W'_{M-1}, b'_{M-1}),(W_1W'_M, W_1b'_M + b_1)) &:(L=1) \land (M>1) \\
((W_1W'_1, W_1b'_1+b_1)) &:(L=1) \land (M=1)
\end{cases}
\end{align}
\end{definition}
\begin{remark}
For an \texttt{R} implementation see Listing \ref{comp}.
\end{remark}
\begin{lemma}\label{depthofcomposition}
Let $\nu, \mu \in \neu$ be such that $\out(\mu) = \inn(\nu)$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item $\dep( \nu \bullet \mu) = \dep(\nu) + \dep(\mu) - 1$
\item for all $i \in \{1,2,...,\dep(\nu \bullet\mu)\}$ it is the case that:
\begin{align}
&\lp \we_{i,(\nu \bullet \mu)}, \bi_{i,(\nu \bullet \mu)} \rp \nonumber\\ &= \begin{cases}
\lp \we_{i,\mu}, \bi_{i, \mu} \rp & : i< \dep(\mu)\\
\lp \we_{1,\nu}\we_{\dep(\mu),\mu}, \we_{1,\nu}\bi_{\dep(\mu),\mu} + \bi_{1,\nu}\rp & : i = \dep (\mu)\\
\lp \we_{i-\dep(\mu)+1,\nu}, \bi_{i-\dep(\mu)+1,\nu}\rp & :i> \dep(\mu)
\end{cases} \nonumber
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
This is a direct consequence of (\ref{5.2.1}): in each of the four cases, the composed network retains the first $\dep(\mu)-1$ layers of $\mu$, merges the last layer of $\mu$ with the first layer of $\nu$, and retains the remaining $\dep(\nu)-1$ layers of $\nu$. This implies Items (i) and (ii).
\end{proof}
\begin{lemma} \label{5.2.3}
Let $\nu_1,\nu_2,\nu_3 \in \neu$ satisfy that $\inn(\nu_1) = \out(\nu_2)$ and $\inn(\nu_2) = \out(\nu_3)$. It is then the case that:
\begin{align}
\lp \nu_1 \bullet \nu_2 \rp \bullet \nu_3 = \nu_1 \bullet \lp \nu_2 \bullet \nu_3 \rp
\end{align}
\end{lemma}
\begin{proof}
This is a consequence of \cite[Lemma~2.8]{grohs2019spacetime} with $\Phi_1 \curvearrowleft \nu_1$, $\Phi_2 \curvearrowleft \nu_2$, and $\Phi_3 \curvearrowleft \nu_3$, and the functions $\mathcal{I} \curvearrowleft \inn$, $\mathcal{L} \curvearrowleft \dep$ and $\mathcal{O} \curvearrowleft \out$.
\end{proof}
The following lemma will be important later on and referenced numerous times; it may be found in \cite[Proposition~2.6]{grohs2019spacetime}. For completeness, we include a simplified version of the proof here.
\begin{lemma}\label{comp_prop}
Let $\nu_1, \nu_2 \in \neu$. Let it also be that $\inn\lp \nu_1\rp = \out \lp \nu_2\rp$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item $\dep \lp \nu_1 \bullet \nu_2 \rp = \dep\lp \nu_1\rp + \dep \lp \nu_2\rp -1$
\item $\lay\lp \nu_1 \bullet \nu_2\rp = \lp \inn\lp \nu_2\rp, \wid_1\lp \nu_2\rp, \wid_2 \lp \nu_2\rp,\hdots, \wid_{\hid\lp \nu_2\rp}\lp \nu_2\rp,\wid_1\lp \nu_1\rp, \wid_2\lp \nu_1\rp,\hdots, \wid_{\dep\lp \nu_1\rp}\lp \nu_1\rp\rp$
\item $\hid \lp \nu_1 \bullet \nu_2\rp = \hid \lp \nu_1\rp + \hid\lp \nu_2\rp$
\item $\param \lp \nu_1 \bullet \nu_2\rp \les \param\lp \nu_1\rp + \param \lp \nu_2\rp + \wid_1 \lp \nu_1\rp\cdot \wid_{\hid\lp \nu_2\rp}\lp \nu_2\rp$
\item for all $\act \in C \lp \R, \R\rp$ that $\real_{\act}\lp \nu_1 \bullet \nu_2\rp \in C \lp \R^{\inn \lp \nu_2\rp},\R^{\out\lp \nu_1\rp}\rp$ and further:
\begin{align}
\real_{\act} \lp \nu_1 \bullet \nu_2\rp = \lb \real_{\act}\lp \nu_1\rp\rb \circ \lb \real_{\act}\lp \nu_2 \rp\rb
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
Note that Items (i)--(iii) are a simple consequence of Definition \ref{5.2.1}. Specifically, given neural networks $\nu_1,\nu_2 \in \neu$ with $\dep\lp \nu_1\rp = n$ and $\dep \lp \nu_2\rp = m$, note that in all four cases the depth of the composed neural network $\nu_1 \bullet \nu_2$ is given by $n-1+m-1+1=n+m-1$, proving Item (i). Note also that the inner network $\nu_2$ loses its output layer, which is merged with the first layer of $\nu_1$, yielding Item (ii) in all four cases. Finally, since for all $\nu \in \neu$ it is the case that $\hid \lp \nu\rp =\dep \lp \nu\rp-1$, Item (i) yields Item (iii).
Now, suppose it is the case that $\nu_3 = \nu_1\bullet \nu_2$ and that:
\begin{align}
\nu_1 &= \lp \lp W_{1,1},b_{1,1}\rp, \lp W_{1,2},b_{1,2}\rp,\hdots, \lp W_{1,L_1},b_{1,L_1}\rp\rp \nonumber \\
\nu_2 &= \lp \lp W_{2,1},b_{2,1}\rp, \lp W_{2,2},b_{2,2}\rp,\hdots, \lp W_{2,L_2},b_{2,L_2}\rp\rp \nonumber \\
\nu_3 &= \lp \lp W_{3,1},b_{3,1}\rp, \lp W_{3,2},b_{3,2}\rp,\hdots, \lp W_{3,L_3},b_{3,L_3}\rp\rp \nonumber
\end{align}
And that:
\begin{align}
&\lay \lp \nu_1\rp = \lp l_{1,0},l_{1,1},l_{1,2},\hdots, l_{1,L_1}\rp \nonumber\\
&\lay \lp \nu_2\rp = \lp l_{2,0},l_{2,1},l_{2,2},\hdots, l_{2,L_2}\rp \nonumber \\
&\lay \lp \nu_1 \bullet \nu_2\rp = \lp l_{3,0},l_{3,1},l_{3,2}, \hdots, l_{3,L_3}\rp
\end{align}
and further let $x_0 \in \R^{l_{2,0}},x_1 \in \R^{l_{2,1}},\hdots,x_{L_2-1}\in \R^{l_{2,L_2-1}}$ satisfy the condition that:
\begin{align}\label{comp_x}
\forall k \in \N \cap \lp 0,L_2\rp: x_k = \act \lp \lb W_{2,k}x_{k-1} + b_{2,k}\rb_{*,*}\rp
\end{align}
also let $y_0 \in \R^{l_{1,0}}$, $y_1 \in \R^{l_{1,1}},\hdots, y_{L_1-1} \in \R^{l_{1,L_1-1}}$ satisfy:
\begin{align}\label{comp_y}
\forall k\in \N \cap \lp 0,L_1\rp:y_k = \act\lp \lb W_{1,k}y_{k-1}+b_{1,k}\rb_{*,*}\rp
\end{align}
and finally let $z_0 \in \R^{l_{3,0}}, z_1 \in \R^{l_{3,1}},\hdots, z_{L_3-1} \in \R^{l_{3,L_3-1}}$ satisfy:
\begin{align}\label{comp_z}
\forall k \in \N \cap \lp 0,L_3\rp: z_k = \act\lp \lb W_{3,k}z_{k-1} + b_{3,k}\rb_{*,*}\rp
\end{align}
Note then that by Item (i) of Definition \ref{5.1.2} we have that:
\begin{align}
\param \lp \nu_1 \bullet \nu_2\rp &= \sum^{L_3}_{k=1} l_{3,k}\lp l_{3,k-1} +1\rp \nonumber \\
&=\lb \sum^{L_2-1}_{k=1} l_{3,k} \lp l_{3,k-1} +1\rp\rb + l_{3,L_2}\lp l_{3,L_2-1}+1\rp+\lb \sum^{L_3}_{k=L_2+1} l_{3,k}\lp l_{3,k-1} +1\rp\rb \nonumber \\
&= \lb \sum^{L_2-1}_{k=1}l_{2,k}\lp l_{2,k-1}+1\rp\rb + l_{1,1}\lp l_{2,L_2-1}+1\rp + \lb \sum^{L_3}_{k=L_2+1} l_{1,k-L_2+1}\lp l_{1,k-L_2}+1\rp\rb\nonumber \\
&= \lb \sum^{L_2-1}_{k=1} l_{2,k} \lp l_{2,k-1}+1\rp\rb + \lb \sum^{L_1}_{k=2}l_{1,k} \lp l_{1,k-1} +1\rp\rb + l_{1,1}\lp l_{2,L_2-1} + 1\rp \nonumber \\
&= \lb \sum^{L_2}_{k=1}l_{2,k}\lp l_{2,k-1}+1\rp \rb + \lb \sum_{k=1}^{L_1} l_{1,k}\lp l_{1,k-1}+1\rp\rb + l_{1,1}\lp l_{2,L_2-1} +1\rp \nonumber\\
&\quad - l_{2,L_2} \lp l_{2,L_2-1} +1\rp -l_{1,1}\lp l_{1,0}+1\rp \nonumber \\
&\les \param\lp \nu_1\rp + \param \lp \nu_2\rp + l_{1,1}\cdot l_{2,L_2-1}
\end{align}
Here the final inequality follows upon dropping the non-positive term $-l_{2,L_2} \lp l_{2,L_2-1} +1\rp$ and noting that $l_{1,1}\lp l_{2,L_2-1}+1\rp - l_{1,1}\lp l_{1,0}+1\rp \les l_{1,1}\cdot l_{2,L_2-1}$ since $l_{1,0} \ges 1$. This establishes Item (iv).
Note that by Definition \ref{5.2.1} and the fact that $\act \in C \lp \R, \R \rp$, it is the case that:
\begin{align}\label{comp_cont}
\real_{\act}\lp \nu_1 \bullet \nu_2\rp \in C \lp \R^{\inn \lp \nu_2\rp},\R^{\out\lp \nu_1\rp} \rp
\end{align}
Next note that by definition, it is the case that:
\begin{align}
\lay \lp \nu_1 \bullet \nu_2\rp = \lp l_{2,0},l_{2,1},\hdots, l_{2,L_2-1},l_{1,1},l_{1,2},\hdots,l_{1,L_1}\rp
\end{align}
And further that:
\begin{align}
\forall k \in \N \cap \lp 0,L_2\rp : \lp W_{3,k},b_{3,k}\rp &= \lp W_{2,k},b_{2,k}\rp \nonumber \\
\lp W_{3,L_2},b_{3,L_2} \rp &= \lp W_{1,1} \cdot W_{2,L_2}, W_{1,1}b_{2,L_2} + b_{1,1}\rp \nonumber\\
\text{ and } \forall k \in \N \cap \lp L_2,L_1+L_2\rp: \lp W_{3,k},b_{3,k}\rp &= \lp W_{1,k+1-L_2},b_{1,k+1-L_2}\rp
\end{align}
Taking $z_0 = x_0$, the first of these correspondences ensures that $z_k = x_k$ for all $k\in \N \cap \lp 0,L_2\rp$. This, together with the fact that $y_0 = W_{2,L_2}x_{L_2-1} + b_{2,L_2}$, ensures that:
\begin{align}\label{(5.2.12)}
W_{3,L_2} z_{L_2-1} + b_{3,L_2} &= W_{3,L_2}x_{L_2-1} + b_{3,L_2} \nonumber \\
&=W_{1,1}W_{2,L_2}x_{L_2-1} + W_{1,1}b_{2,L_2} + b_{1,1} \nonumber \\
&=W_{1,1} \lp W_{2,L_2}x_{L_2-1} + b_{2,L_2}\rp + b_{1,1} = W_{1,1}y_0 + b_{1,1}
\end{align}
We next claim that for all $k\in \N \cap \lb L_2, L_1+L_2\rp$ it is the case that:
\begin{align}\label{(5.2.13)}
W_{3,k}z_{k-1} + b_{3,k} = W_{1,k+1-L_2}y_{k-L_2} + b_{1,k+1-L_2}
\end{align}
This can be proved via induction on $k\in \N \cap \lb L_2, L_1+L_2\rp$. Consider that our base case of $k=L_2$ in (\ref{(5.2.13)}) is fulfilled by (\ref{(5.2.12)}). Now note that for all $k \in \N \cap \lb L_2,\infty\rp \cap \lp 0,L_1+L_2-1\rp$ with:
\begin{align}
W_{3,k}z_{k-1} +b_{3,k} = W_{1,k+1-L_2}y_{k-L_2} + b_{1,k+1-L_2}
\end{align}
it holds that:
\begin{align}
W_{3,k+1}z_k + b_{3,k+1} &= W_{3,k+1}\act\lp \lb W_{3,k}z_{k-1} + b_{3,k}\rb_{*,*}\rp + b_{3,k+1} \nonumber \\
&= W_{1,k+2-L_2}\act\lp \lb W_{1,k+1-L_2}y_{k-L_2} + b_{1,k+1-L_2}\rb_{*,*}\rp + b_{1,k+2-L_2} \nonumber \\
&= W_{1,k+2-L_2}y_{k+1-L_2} + b_{1,k+2-L_2}
\end{align}
Induction thus proves (\ref{(5.2.13)}). This, along with the fact that $L_3 = L_1+L_2-1$, then indicates that:
\begin{align}
W_{3,L_3}z_{L_3-1} + b_{3,L_3} = W_{3,L_1+L_2-1}z_{L_1+L_2-2} + b_{3,L_1+L_2-1} = W_{1,L_1}y_{L_1-1} + b_{1,L_1}
\end{align}
Finally, the fact that $\nu_3 = \nu_1 \bullet \nu_2$, in addition to (\ref{comp_x}), (\ref{comp_y}), and (\ref{comp_z}), then tells us that:
\begin{align}
\lb \real_{\act}\lp \nu_1 \bullet \nu_2\rp\rb \lp x_0\rp &= \lb \real_{\act}\lp \nu_3\rp\rb \lp x_0\rp = \lb \real_{\act}\lp \nu_3\rp\rb \lp z_0\rp = W_{3,L_3}z_{L_3-1} + b_{3,L_3} \nonumber \\
&= W_{1,L_1}y_{L_1-1} + b_{1,L_1} = \lb \real_{\act}\lp \nu_1\rp\rb\lp y_0\rp \nonumber \\
&=\lb \real_{\act}\lp \nu_1\rp\rb \lp \lb W_{2,L_2}x_{L_2-1} + b_{2,L_2}\rb_{*,*}\rp \nonumber \\
&=\lb \real_{\act}\lp \nu_1\rp\rb \lp \lb \real_{\act}\lp \nu_2\rp\rb\lp x_0\rp\rp = \lb \real_{\act}\lp \nu_1\rp\rb \circ \lb \real_{\act}\lp \nu_2 \rp\rb \lp x_0\rp
\end{align}
This and (\ref{comp_cont}) then prove Item (v), hence proving the lemma.
\end{proof}
\section{Stacking of ANNs}
We will introduce here the important concept of stacking of ANNs. Given an input vector $x\in \R^d$, it is sometimes very helpful to imagine two neural networks working on it simultaneously, whence we have stacking. Because vectors are ordered tuples, stacking $\nu_1$ and $\nu_2$ is not necessarily the same as stacking $\nu_2$ and $\nu_1$. We will thus forgo the phrase ``parallelization'' used in e.g.
\cite{grohs2019spacetime} and \cite{bigbook}, and opt to use the term ``stacking''. This is because parallelization suggests commutativity, but it is clearly not the case that $\nu_1 \boxminus \nu_2$ is the same as $\nu_2 \boxminus \nu_1$.
\subsection{Stacking of ANNs of Equal Depth}
\begin{definition}[Stacking of ANNs of same depth]\label{5.2.5}\label{def:stacking}
Let $L,n\in \N$, and let $\nu_1,\nu_2,\hdots, \nu_n \in \neu$ be such that $\dep\lp \nu_1\rp= \dep \lp \nu_2\rp= \cdots = \dep\lp \nu_n\rp = L$. For all $i \in \{1,\hdots,n\}$, let it also be the case that $\nu_i = \lp \lp W_1^i,b^i_1\rp, \lp W^i_2,b^i_2\rp,\hdots, \lp W_L^i,b_L^i\rp \rp$. We then denote by $\boxminus^n_{i=1}\nu_i$ the neural network given by:
\begin{align*}
\boxminus^n_{i=1}\nu_i = \lp \lp \diag\lp W_1^1,W_1^2,\hdots,W_1^n\rp , b_1^1 \frown b_1^2 \frown \cdots \frown b_1^n\rp,\right.\\ \left.\lp \diag\lp W_2^1,W_2^2,\hdots,W_2^n\rp , b_2^1 \frown b_2^2 \frown \cdots \frown b_2^n\rp, \right.\\ \left. \vdots \hspace{4cm}\right.\\ \left. \lp \diag\lp W_L^1,W_L^2,\hdots,W_L^n\rp , b_L^1 \frown b_L^2 \frown \cdots \frown b_L^n\rp\rp
\end{align*}
\end{definition}
\begin{remark}
For an \texttt{R} implementation see Listing \ref{stk}.
\end{remark}
\begin{lemma}\label{inst_of_stk}
Let $\nu_1,\nu_2\in \neu$ with $\dep\lp \nu_1\rp = \dep\lp \nu_2\rp$, let $x_1 \in \R^{m_1}$, $x_2 \in \R^{m_2}$, and $\mathfrak{x} = x_1 \frown x_2 \in \R^{m_1+m_2}$, and let $\real_{\rect}\lp \nu_1\rp: \R^{m_1} \rightarrow \R^{n_1}$ and $\real_{\rect}\lp \nu_2\rp:\R^{m_2} \rightarrow \R^{n_2}$. It is then the case that $\real_{\rect}\lp \nu_1\boxminus\nu_2\rp\lp \mathfrak{x}\rp = \real_{\rect}\lp \nu_1\rp\lp x_1\rp \frown \real_{\rect}\lp \nu_2\rp\lp x_2\rp$.
\end{lemma}
\begin{proof}
Let $\nu_1 = \lp \lp W_1,b_1 \rp,\lp W_2,b_2\rp,\hdots, \lp W_L,b_L\rp\rp$ and $\nu_2 = \lp \lp \fW_1, \fb_1\rp, \lp \fW_2,\fb_2\rp,\hdots,\lp \fW_L,\fb_L\rp\rp$, and as such it is the case according to Definition \ref{def:stacking} that:
\begin{align*}
\nu_1 \boxminus\nu_2 = \lp \lp \diag\lp W_1,\fW_1\rp , b_1 \frown \fb_1\rp,\right.\\ \left.\lp \diag\lp W_2,\fW_2\rp , b_2 \frown \fb_2\rp, \right.\\ \left. \vdots \hspace{2.5cm}\right.\\ \left. \lp \diag\lp W_L,\fW_L\rp , b_L \frown \fb_L\rp\rp
\end{align*}
Note that for all $\act \in C\lp \R,\R\rp$, $j \in \{1,2,\hdots,L-1\}$, $x_1 \in \R^{\columns\lp W_j\rp}$, $x_2 \in \R^{\columns\lp \fW_j \rp}$, and $x = x_1 \frown x_2 \in \R^{\columns(W_j)+\columns(\fW_j)}$, with $y_1 = \act\lp \lb W_j \cdot x_1 + b_j\rb_{*,*}\rp \in \R^{\rows\lp W_j\rp}$, $y_2 = \act \lp \lb \fW_j\cdot x_2+\fb_j\rb_{*,*}\rp \in \R^{\rows \lp \fW_j\rp}$, and $y=\act\lp\lb \diag\lp W_j, \fW_j\rp \cdot x + \lp b_j \frown \fb_j \rp\rb_{*,*}\rp \in \R^{\rows\lp W_j\rp + \rows \lp \fW_j\rp}$, Corollary \ref{concat_fun_fun_concat} tells us that:
\begin{align}
y=\act\lp\lb \diag\lp W_j, \fW_j\rp \cdot x + \lp b_j \frown \fb_j \rp\rb_{*,*}\rp &= \act \lp \lb \lp W_j \cdot x_1+ b_j \rp \frown \lp \fW_j \cdot x_2+\fb_j\rp \rb_{*,*}\rp \nonumber \\ &= \act\lp \lb W_j\cdot x_1+b_j\rb_{*,*}\rp\frown \act \lp \lb \fW_j\cdot x_2+\fb_j\rb_{*,*}\rp \nonumber\\ &= y_1 \frown y_2
\end{align}
Repeating this argument from one layer to the next, and treating the final affine layer analogously, yields that $\real_{\rect}\lp \nu_1\boxminus\nu_2\rp\lp \mathfrak{x}\rp = \real_{\rect}\lp \nu_1\rp\lp x_1\rp \frown \real_{\rect}\lp \nu_2\rp\lp x_2\rp$.
\end{proof}
\begin{remark}\label{5.3.2}\label{rem:stk_remark}
Given $n,L \in \N$, $\nu_1,\nu_2,...,\nu_n \in \neu$ such that $L = \dep (\nu_1) = \dep(\nu_2) =...= \dep(\nu_n)$, it is then the case, as can be seen from Definition \ref{def:stacking}, that:
\begin{align}\label{(5.3.3)}
\boxminus_{i=1}^n \nu_i \in \lp \bigtimes^L_{k=1} \lb \R^{\lp \sum^n_{j=1} \wid_k(\nu_j) \rp \times \lp \sum_{j=1}^n \wid_{k-1} \lp \nu_j \rp \rp} \times \R ^{\lp \sum^n_{j=1} \wid_k \lp \nu_j \rp \rp }\rb \rp
\end{align}
\end{remark}
\begin{lemma}\label{paramofparallel}
Let $n,L \in \N$, $\nu_1,\nu_2,\hdots, \nu_n \in \neu$ satisfy that $L = \dep \lp \nu_1 \rp = \dep \lp \nu_2\rp = \cdots = \dep \lp\nu_n \rp$. It is then the case that:
\begin{align}
\param \lp \lb \boxminus_{i=1}^n \nu_i \rb \rp \les \frac{1}{2}\lb \sum^n_{i=1} \param \lp \nu_i\rp\rb^2
\end{align}
\end{lemma}
\begin{proof}
For all $i \in \{1,2,\hdots,n\}$ write $\lay\lp \nu_i\rp = \lp l_{i,0},l_{i,1},\hdots,l_{i,L}\rp$. Note that by Remark \ref{rem:stk_remark} we have that:
\begin{align}
\param \lp \lb \boxminus_{i=1}^n \nu_i\rb\rp &= \sum^L_{k=1} \lb \sum_{i=1}^n l_{i,k}\rb \lb \lp \sum^n_{j=1} l_{j,k-1} \rp +1 \rb \nonumber \\
&\les \sum^L_{k=1} \lb \sum^n_{i=1}l_{i,k}\rb \lb \sum^n_{j=1} \lp l_{j,k-1}+1\rp\rb \nonumber\\
&= \sum^n_{i=1} \sum^n_{j=1} \sum^L_{k=1} l_{i,k} \lp l_{j,k-1}+1 \rp \nonumber \\
&\les \sum^n_{i=1} \sum^n_{j=1} \sum^L_{k=1}\sum^L_{\ell=1} l_{i,k} \lp l_{j,\ell-1} +1\rp \nonumber \\
&=\sum^n_{i=1} \sum^n_{j=1} \lb \sum^L_{k=1}l_{i,k}\rb \lb \sum^L_{\ell=1} \lp l_{j,\ell-1} +1\rp\rb \nonumber \\
&\les \sum^n_{i=1} \sum^n_{j=1}\lb \sum^L_{k=1} \frac{1}{2} l_{i,k} \lp l_{i,k-1}+1\rp\rb \lb \sum^L_{\ell=1} l_{j,\ell}\lp l_{j,\ell-1}+1 \rp \rb \nonumber \\
&= \sum^n_{i=1} \sum^n_{j=1} \frac{1}{2} \param \lp \nu_i \rp \param \lp \nu_j\rp = \frac{1}{2} \lb \sum^n_{i=1}\param \lp \nu_i\rp\rb^2
\end{align}
Here the penultimate inequality uses the facts that $l_{i,k-1}+1 \ges 2$ and $l_{j,\ell} \ges 1$. This completes the proof of the lemma.
\end{proof}
\begin{corollary}\label{cor:sameparal}
Let $n\in \N$. Let $\nu_1,\nu_2,...,\nu_n \in \neu$ satisfy that $\lay \lp \nu_1\rp = \lay \lp \nu_2\rp= \cdots =\lay \lp \nu_n\rp$. It is then the case that:
\begin{align}
\param \lp \boxminus_{i=1}^n \nu_i\rp \les n^2\param \lp \nu_1\rp
\end{align}
\end{corollary}
\begin{proof}
Since it is the case for all $j \in \{1,2,...,n \}$ that $\lay\lp \nu_j\rp=\lp l_0,l_1,...,l_L \rp$ for some $l_0,l_1,...,l_L,L \in \N$, Remark \ref{rem:stk_remark} allows us to say that:
\begin{align}
\param \lp \boxminus_{j=1}^n \nu_j \rp &= \sum^L_{k=1}\lp nl_k\rp \lb \lp nl_{k-1} \rp +1\rb \les \sum^L_{k=1}\lp nl_k \rp\lb \lp nl_{k-1}\rp + n \rb \nonumber \\ &=n^2 \lb \sum^L_{k=1}l_k \lp l_{k-1}+1\rp\rb = n^2\param\lp \nu_1\rp
\end{align}
\end{proof}
\begin{lemma}\label{lem:paramparal_geq_param_sum}
Let $\nu_1,\nu_2 \in \neu$ be such that $\dep \lp \nu_1\rp = \dep \lp \nu_2\rp = L$. It is then the case that $\param\lp \nu_1\rp + \param \lp \nu_2\rp \les \param \lp \nu_1 \boxminus \nu_2\rp$.
\end{lemma}
\begin{proof}
Remark \ref{rem:stk_remark} tells us that:
\begin{align}
\nu_1 \boxminus \nu_2 \in \lp \bigtimes^L_{k=1} \lb \R^{\lp \wid_k(\nu_1) + \wid_k\lp \nu_2\rp \rp \times \lp \wid_{k-1} \lp \nu_1\rp +\wid_{k-1}\lp\nu_2\rp \rp} \times \R^{ \wid_k \lp \nu_1 \rp + \wid_k\lp \nu_2\rp }\rb \rp
\end{align}
The definition of $\param$ from Definition \ref{def:nn_def}, and the fact that $\wid_i\lp \nu_j\rp \ges 1$ for all $i \in \{1,2,\hdots, L\}$ and $j \in \{1,2\}$, then tell us that:
\begin{align}
\param\lp \nu_1 \boxminus \nu_2\rp &= \sum_{k=1}^L \lb \lp \wid_k \lp \nu_1\rp + \wid_k\lp \nu_2\rp\rp \times \lp \wid_{k-1}\lp \nu_1\rp+ \wid_{k-1}\lp \nu_2\rp +1\rp \rb \nonumber \\
&= \sum^L_{k=1} \lb \wid_k \lp \nu_1\rp \wid_{k-1}\lp \nu_1\rp+ \wid_k \lp \nu_1\rp\wid_{k-1}\lp \nu_2\rp \right. \nonumber\\ &\left. \quad + \wid_k \lp \nu_1\rp + \wid_k\lp \nu_2\rp\wid_{k-1}\lp \nu_1\rp + \wid_k\lp \nu_2\rp\wid_{k-1}\lp \nu_2\rp + \wid_k\lp \nu_2\rp\rb \nonumber \\
&\ges \sum_{k=1}^L \lb \wid_k \lp \nu_1\rp\wid_{k-1}\lp \nu_1\rp + \wid_k \lp \nu_1\rp + \wid_k\lp \nu_2\rp\wid_{k-1}\lp \nu_2\rp+ \wid_k\lp \nu_2 \rp\rb \nonumber \\
&=\sum_{k=1}^L\lb \wid_k \lp \nu_1 \rp\lp \wid_{k-1}\lp \nu_1\rp+1\rp\rb + \sum_{k=1}^L\lb \wid_k\lp \nu_2 \rp\lp \wid_{k-1}\lp \nu_2\rp+1\rp\rb \nonumber \\
&= \param \lp \nu_1\rp + \param \lp \nu_2\rp
\end{align}
\end{proof}
\begin{corollary}\label{cor:bigger_is_better}
Let $\nu_1,\nu_2,\nu_3 \in \neu$ be such that $\dep\lp \nu_1\rp = \dep\lp \nu_2\rp = \dep\lp \nu_3\rp$ and $\param \lp \nu_2 \rp \les \param\lp \nu_3\rp$. It is then the case that $\param\lp \nu_1 \boxminus \nu_2\rp \les \param\lp \nu_1 \boxminus \nu_3\rp$.
\end{corollary}
\begin{proof}
Lemma \ref{lem:paramparal_geq_param_sum} tells us that:
\begin{align}
0 &\les \param \lp \nu_1\rp + \param\lp \nu_3\rp \les \param \lp \nu_1 \boxminus \nu_3\rp \label{lin1} \\
0 &\les \param \lp \nu_1\rp + \param\lp \nu_2\rp \les \param \lp \nu_1 \boxminus \nu_2\rp \label{lin2}
\end{align}
Subtracting (\ref{lin1}) from (\ref{lin2}) gives us that:
\begin{align}
0 \les \param\lp \nu_3\rp - \param\lp \nu_2\rp &\les \param\lp \nu_1 \boxminus\nu_3\rp - \param\lp \nu_1\boxminus\nu_2\rp \nonumber\\ \param\lp \nu_1 \boxminus\nu_2\rp &\les \param\lp \nu_1 \boxminus \nu_3\rp \nonumber
\end{align}
This completes the proof of the corollary.
\end{proof}
\begin{lemma}\label{5.4.3}
Let $m_1,m_2,n_1,n_2\in\N$ and let $\nu_1,\nu_2 \in \neu$ be such that $\real_{\rect} \lp \nu_1\rp \in C \lp \R^{m_1},\R^{n_1}\rp$ and $\real_{\rect} \lp \nu_2\rp \in C\lp \R^{m_2},\R^{n_2}\rp$. It is then the case, for all $x \in \R^{m_1}$ and $x' \in \R^{m_2}$, that $\lp \real_{\rect}(\nu_1 \boxminus \nu_2)\rp \lp \begin{bmatrix} x \\ x' \end{bmatrix}\rp = \lp \real_{\rect}(\nu_2 \boxminus \nu _1) \rp \lp \begin{bmatrix} x' \\ x \end{bmatrix}\rp$, up to transposition of the two output blocks.
\end{lemma}
\begin{proof}
Note that this is a consequence of the commutativity of the sums in the exponents of (\ref{(5.3.3)}), together with the fact that switching $\nu_1$ and $\nu_2$ while transposing the input blocks results in a correspondingly transposed output.
\end{proof}
\begin{lemma}\label{5.3.4}
Let $\act \in C \lp \R, \R \rp$, $n \in \N$, $\nu_1,\nu_2,\hdots,\nu_n \in \neu$, and $\nu = \boxminus_{i=1}^n \nu_i$ satisfy the condition that $\dep(\nu_1) = \dep(\nu_2) =...=\dep(\nu_n)$. It is then the case that $\real_{\act} \lp \nu \rp \in C \lp \R^{\sum_{i=1}^n \inn(\nu_i)}, \R^{\sum^n_{i=1}\out(\nu_i)} \rp $.
\end{lemma}
\begin{proof}
Let $L = \dep(\nu_1)$, and let $l_{i,0},l_{i,1},\hdots,l_{i,L} \in \N$ satisfy for all $i \in \{ 1,2,...,n\}$ that $\lay(\nu_i) = \lp l_{i,0}, l_{i,1},...,l_{i,L} \rp $.
Furthermore let $\lp \lp W_{i,1},b_{i,1}\rp, \lp W_{i,2},b_{i,2} \rp , ..., \lp W_{i,L},b_{i,L} \rp \rp \in \lp \bigtimes^L_{j=1} \lb \R^{l_{i,j} \times l_{i,j-1}} \times \R^{l_{i,j}} \rb \rp $ satisfy for all $i \in \{ 1,2,...,n\}$ that:
\begin{align}
\nu_i = \lp \lp W_{i,1},b_{i,1} \rp , \lp W_{i,2}, b_{i,2}\rp ,...,\lp W_{i,L},b_{i,L} \rp \rp
\end{align}
Let $\alpha_j \in \N$ with $j \in \{0,1,...,L\}$ satisfy that $\alpha_j = \sum^n_{i=1} l_{i,j}$ and let $\lp \lp A_1,b_1 \rp, \lp A_2,b_2 \rp,...,\lp A_L,b_L \rp \rp \in \lp \bigtimes^L_{j=1} \lb \R^{\alpha_{j} \times \alpha_{j-1}} \times \R^{\alpha_{j}} \rb \rp $ satisfy that:
\begin{align}\label{5.3.5}
\boxminus_{i=1}^n \nu_i = \lp \lp A_1,b_1 \rp, \lp A_2,b_2 \rp,...,\lp A_L,b_L \rp \rp
\end{align}
See Remark \ref{rem:stk_remark}. Let $x_{i,0} \in \R^{l_{i,0}}, x_{i,1} \in \R^{l_{i,1}},\hdots,x_{i,L-1} \in \R^{l_{i,L-1}}$ satisfy for all $i \in \{1,2,...,n\}$ and $k \in \N \cap \lp 0,L \rp $ that:
\begin{align}
x_{i,k} = \mult^{l_{i,k}}_{\act} \lp W_{i,k}x_{i,k-1} + b_{i,k} \rp
\end{align}
Note that (\ref{5.3.5}) demonstrates that $\inn \lp \boxminus_{i=1}^n\nu_i \rp =\alpha_0$ and $\out \lp \boxminus^n_{i=1} \nu_i \rp = \alpha_L$. This, Item (ii) of Lemma \ref{5.1.8}, and the fact that for all $i \in \{1,2,...,n\}$ it is the case that $\inn(\nu_i) = l_{i,0}$ and $\out(\nu_i) = l_{i,L}$, ensure that:
\begin{align}
\real_{\act} \lp \boxminus^n_{i=1} \nu_i \rp \in C \lp \R^{\alpha_0}, \R^{\alpha_L} \rp &= C\lp \R^{\sum^n_{i=1}l_{i,0}}, \R^{\sum_{i=1}^n l_{i,L}} \rp \nonumber\\ &= C \lp \R^{\sum^n_{i=1} \inn(\nu_i)}, \R^{\sum_{i=1}^n \out(\nu_i)} \rp \nonumber
\end{align}
This proves the lemma.
\end{proof}
\subsection{Stacking of ANNs of Unequal Depth}
We will often encounter neural networks that we want to stack but that have unequal depths. Definition \ref{5.2.5} only deals with neural networks of the same depth. We will handle this situation by introducing a form of padding for the shorter neural network, so that both networks have the same depth before being stacked. This padding will be done via the tunneling neural network introduced below.
\begin{definition}[Identity Neural Network]\label{7.2.1}
For $d\in \N$ we will denote by $\id_d \in \neu$ the neural network satisfying:
\begin{enumerate}[label = (\roman*)]
\item
\begin{align}
\id_1 = \lp \lp \begin{bmatrix} 1 \\ -1 \end{bmatrix}, \begin{bmatrix} 0 \\ 0 \end{bmatrix}\rp, \lp \begin{bmatrix} 1 \quad -1 \end{bmatrix},\begin{bmatrix} 0\end{bmatrix}\rp \rp \in \lp \lp \R^{2 \times 1} \times \R^2 \rp \times \lp \R^{1\times 2} \times \R^1 \rp \rp
\end{align}
\item
\begin{align}\label{7.2.2}
\id_d = \boxminus^d_{i=1} \id_1
\end{align}
for $d \in \N \cap \lb 2,\infty\rp$.
\end{enumerate}
\end{definition}
\begin{remark}
We will discuss some properties of $\id_d$ in Section \ref{sec_tun}.
\end{remark}
\begin{definition}[The Tunneling Neural Network]
For $n\in \N$ and $d\in \N$ we define the tunneling neural network, denoted $\tun^d_n$, by:
\begin{align}
\tun^d_n = \begin{cases}
\aff_{\mathbb{I}_d,0} &:n= 1 \\
\id_d &: n=2 \\
\bullet^{n-2} \id_d & :n \in \N \cap [3,\infty)
\end{cases}
\end{align}
We will often drop the superscript $d$, and $\tun_n$ by itself will be used to denote $\tun_n^1$.
\end{definition}
\begin{remark}
We will discuss some properties of the $\tun^d_n$ network in Section \ref{sec_tun}. We will also discuss properties of the wider tunneling neural networks in Lemma \ref{tun_mult}.
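As a quick illustration of why these networks merit their names, note that under ReLU instantiation it is the case for all $x \in \R$ that:
\begin{align}
\lp \real_{\rect}\lp \id_1\rp \rp \lp x \rp = \begin{bmatrix} 1 \quad -1 \end{bmatrix} \begin{bmatrix} \max\{ 0,x\} \\ \max\{ 0,-x\} \end{bmatrix} + 0 = \max\{ 0,x\} - \max\{ 0,-x\} = x
\end{align}
That is to say, $\id_1$, and with it $\tun_n$, instantiates to the identity on $\R$ under ReLU activation.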
\end{remark} \begin{definition} Let $n \in \N$, and $\nu_1,\nu_2,...,\nu_n \in \neu$. We will define the stacking of unequal length neural networks, denoted $\DDiamond^n_{i=1}\nu_i$ as the neural network given by: \begin{align} \DDiamond^n_{i=1}\nu_i = \boxminus^n_{i=1} \lb \tun_{\max_i \left\{\dep \lp \nu_i \rp\right\} +1 - \dep \lp \nu_i\rp}^{\out \lp \nu_i\rp} \bullet \nu_i \rb \end{align} \end{definition} Diagrammatically, this can be thought of as shown below. \begin{figure} \begin{center} \tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt \begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1] %uncomment if require: \path (0,475); %set diagram left start at 0, and has height of 475 %Shape: Rectangle [id:dp6580978944544137] \draw (199.5,139) -- (490,139) -- (490,179) -- (199.5,179) -- cycle ; %Shape: Rectangle [id:dp353023162160477] \draw (420,205) -- (490,205) -- (490,245) -- (420,245) -- cycle ; %Shape: Rectangle [id:dp37062952177240926] \draw (200.5,205) -- (403,205) -- (403,245) -- (200.5,245) -- cycle ; %Straight Lines [id:da022591094656464583] \draw (419,224) -- (404.5,224) ; \draw [shift={(402.5,224)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da41864611287252906] \draw (198.5,160) -- (101.5,160) ; \draw [shift={(99.5,160)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da6288954656732593] \draw (198.5,222) -- (101.5,222) ; \draw [shift={(99.5,222)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da8590579460958981] \draw (526,158) -- (493.5,158) ; \draw [shift={(491.5,158)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da8647539484496881] \draw (527,221) -- (494.5,221) ; \draw [shift={(492.5,221)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; % Text Node \draw (337,155) node [anchor=north west][inner sep=0.75pt] {$\nu _{1}$}; % Text Node \draw (445,220) node [anchor=north west][inner sep=0.75pt] {$\nu _{2}$}; % Text Node \draw (296,220) node [anchor=north west][inner sep=0.75pt] {$\mathsf{Tun}$}; \end{tikzpicture} \end{center} \caption{Diagrammmatic representation of the stacking of unequal depth neural networks} \end{figure} \begin{lemma} Let $\nu_1,\nu_2 \in \neu$. It is then the case that: \begin{align} \param \lp \nu_1\DDiamond\nu_2 \rp \les 2\cdot \lp \max \left\{ \param\lp \nu_1\rp, \param\lp \nu_2\rp\right\}\rp^2 \end{align} \end{lemma} \begin{proof} This is a straightforward consequence of Lemma \ref{paramofparallel}. \end{proof} \section{Affine Linear Transformations as ANNs and Their Properties.} Affine neural networks present an important class of neural networks. 
By virtue of being only one layer deep, the activation function is never applied upon instantiation, and so they may be instantiated with any activation function whatsoever while still acting as affine transformations; see Definition \ref{def:inst}. In addition, under composition they are absorbed into the network they are composed with, i.e. they do not change the depth of a neural network once composed into it; see Lemma \ref{comp_prop}.
\begin{definition}\label{5.3.1}\label{def:aff}
Let $m,n \in \N$, $W \in \R^{m \times n}$, and $b \in \R^m$. We denote by $\aff_{W,b} \in \lp \R^{m\times n} \times \R^m \rp \subsetneq \neu$ the neural network given by $\aff_{W,b} = ((W,b))$.
\end{definition}
\begin{lemma}\label{5.3.2}\label{aff_prop}
Let $m,n \in \N$, $W \in \R^{m\times n}$, $b \in \R^m$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item $\lay (\aff_{W,b}) = (n,m) \in \N^2$,
\item for all $\act \in C ( \R,\R)$ it is the case that $\real_{\act} (\aff_{W,b}) \in C (\R^n, \R^m)$, and
\item for all $\act \in C(\R,\R)$, $x \in \R^n$ we have $(\real_{\act}(\aff_{W,b}))(x) = Wx+b$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that $(i)$ is a consequence of Definitions \ref{5.1.2} and \ref{5.3.1}. Note next that $\aff_{W,b} = ((W,b)) \in (\R^{m\times n} \times \R^m) \subsetneq \neu$. Note then that ($\ref{5.1.11}$) tells us that, for all $x \in \R^n$, $\lp \real_{\act} (\aff_{W,b})\rp \lp x \rp = Wx+b$, which in turn proves $(ii)$ and $(iii)$.
\end{proof}
\begin{remark}\label{remark:5.4.3}\label{param_of_aff}
Given $W\in \R^{m\times n}$ and $b \in \R^{m}$, it is the case according to (\ref{paramdef}) that $\param(\aff_{W,b})= m \cdot n + m = m\lp n+1\rp$.
\end{remark}
\begin{remark}
For an \texttt{R} implementation see Listing \ref{affn}.
\end{remark}
\begin{lemma}\label{5.3.3}\label{aff_effect_on_layer_architecture}
Let $\nu \in \neu$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item For all $m\in \N$, $W \in \R^{m\times \out(\nu)}$, and $B \in \R^m$ that:
\begin{align}
\lay(\aff_{W,B} \bullet\nu) = \lp \wid_0(\nu), \wid_1(\nu),...,\wid_{\dep(\nu)-1}(\nu),m \rp \in \N^{\dep(\nu)+1}
\end{align}
\item For all $\act \in C(\R,\R)$, $m\in \N$, $W \in \R^{m \times \out(\nu)}$, $B \in \R^m$, we have that $\real_{\act} (\aff_{W,B} \bullet\nu) \in C\lp \R^{\inn(\nu)},\R^m\rp$.
\item For all $\act \in C(\R,\R)$, $m\in \N$, $W \in \R^{m \times \out(\nu)}$, $B \in \R^m$, $x \in \R^{\inn(\nu)}$ that:
\begin{align}
\lp \real_{\act} \lp \aff_{W,B} \bullet \nu \rp \rp \lp x \rp= W \lp \real_{\act}\lp \nu \rp \rp \lp x \rp +B
\end{align}
\item For all $n\in \N$, $W \in \R^{\inn(\nu) \times n}$, $b \in \R^{\inn(\nu)}$ that:
\begin{align}
\lay(\nu \bullet \aff_{W,b}) = \lp n, \wid_1(\nu), \wid_2(\nu),...,\wid_{\dep(\nu)}(\nu) \rp \in \N^{\dep(\nu)+1}
\end{align}
\item For all $\act \in C(\R,\R)$, $n\in \N$, $W \in \R^{\inn(\nu) \times n}$, $b \in \R^{\inn(\nu)}$ that $\real_{\act} \lp \nu \bullet \aff_{W,b} \rp \in C \lp \R^n, \R^{\out(\nu)} \rp $ and,
\item For all $\act \in C(\R,\R)$, $n\in \N$, $W \in \R^{\inn(\nu) \times n}$, $b \in \R^{\inn (\nu)}$, $x \in \R^n$ that:
\begin{align}
\lp \real_{\act} \lp \nu \bullet \aff_{W,b} \rp \rp \lp x \rp = \lp \real_{\act} \lp \nu \rp \rp \lp Wx+b \rp
\end{align}
\end{enumerate}
\end{lemma}
\begin{proof}
From Lemma \ref{aff_prop} we see that $\real_{\act}(\aff_{W,b}) \in C(\R^n,\R^m)$ is given by $\lp \real_{\act}(\aff_{W,b})\rp \lp x \rp = Wx + b$. This and Lemma \ref{comp_prop} prove $(i)$--$(vi)$.
\end{proof}
\begin{corollary}\label{affcor}
Let $m,n \in \N$, $W \in \R^{m \times n}$, and $b \in \R^m$. Let $\nu\in \neu$.
It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item for all $\aff_{W,b} \in \neu$ with $\inn \lp \aff_{W,b} \rp = \out \lp \nu \rp$ that:
\begin{align}
\param \lp \aff_{W,b} \bullet \nu \rp \les \lb \max\left\{ 1, \frac{\out \lp \aff_{W,b}\rp}{\out\lp \nu\rp}\right\}\rb \param \lp \nu\rp
\end{align}
\item for all $\aff_{W,b} \in \neu$ with $\out\lp \aff_{W,b}\rp = \inn\lp \nu\rp$ that:
\begin{align}
\param \lp \nu \bullet \aff_{W,b}\rp \les \lb \max\left\{ 1, \frac{\inn \lp \aff_{W,b}\rp+1}{\inn\lp \nu\rp+1}\right\}\rb \param \lp \nu\rp
\end{align}
\end{enumerate}
\end{corollary}
\begin{proof}
Let it be the case that $\lay \lp \nu\rp = \lp l_0,l_1,...,l_L\rp$ for $l_0,l_1,...,l_L,L \in \N$. Lemma \ref{5.3.3}, Item (i), and Lemma \ref{comp_prop} then tell us that:
\begin{align}
&\param \lp \aff_{W,b} \bullet \nu \rp\\ &= \lb \sum^{L-1}_{m=1} l_m \lp l_{m-1}+1\rp\rb + \out \lp \aff_{W,b}\rp \lp l_{L-1}+1\rp \nonumber \\
&= \lb \sum^{L-1}_{m=1} l_m \lp l_{m-1}+1 \rp\rb+ \lb \frac{\out\lp \aff_{W,b}\rp}{l_L}\rb l_L\lp l_{L-1}+1 \rp \nonumber \\
&\les \lb \max \left\{ 1, \frac{\out(\aff_{W,b})}{l_L}\right\}\rb \lb \sum^{L-1}_{m=1} l_m \lp l_{m-1}+1\rp\rb + \lb \max\left\{ 1,\frac{\out\lp \aff_{W,b}\rp}{l_L}\right\}\rb l_L \lp l_{L-1}+1\rp \nonumber\\
&= \lb \max\left\{ 1, \frac{\out \lp \aff_{W,b}\rp}{l_L}\right\}\rb \lb \sum^L_{m=1}l_m \lp l_{m-1} +1\rp\rb = \lb \max\left\{ 1, \frac{\out \lp \aff_{W,b}\rp}{\out\lp \nu\rp}\right\}\rb \param \lp \nu\rp \nonumber
\end{align}
and further that:
\begin{align}
&\param \lp \nu \bullet\aff_{W,b} \rp \\ &= \lb \sum^{L}_{m=2} l_m \lp l_{m-1}+1\rp\rb + l_{1}\lp \inn \lp \aff_{W,b}\rp+1\rp \nonumber \\
&= \lb \sum^{L}_{m=2} l_m \lp l_{m-1}+1 \rp\rb+ \lb \frac{\inn \lp \aff_{W,b}\rp+1}{l_0+1}\rb l_1\lp l_{0}+1 \rp \nonumber \\
&\les \lb \max \left\{ 1, \frac{\inn(\aff_{W,b})+1}{l_0+1}\right\}\rb \lb \sum^{L}_{m=2} l_m \lp l_{m-1}+1\rp\rb + \lb \max\left\{ 1,\frac{\inn\lp \aff_{W,b}\rp+1}{l_0+1}\right\}\rb l_1 \lp l_{0}+1\rp \nonumber\\
&= \lb \max\left\{ 1, \frac{\inn \lp \aff_{W,b}\rp+1}{l_0+1}\right\}\rb \lb \sum^L_{m=1}l_m \lp l_{m-1} +1\rp\rb = \lb \max\left\{ 1, \frac{\inn \lp \aff_{W,b}\rp+1}{\inn\lp \nu\rp+1}\right\}\rb \param \lp \nu\rp \nonumber
\end{align}
This completes the proof of the corollary.
\end{proof}
\begin{lemma}\label{aff_stack_is_aff}
Let $\mathfrak{a}_1,\mathfrak{a}_2$ be two affine neural networks as defined in Definition \ref{def:aff}. It is then the case that $\mathfrak{a}_1 \boxminus \mathfrak{a}_2$ is also an affine neural network.
\end{lemma}
\begin{proof}
This follows straightforwardly from Definition \ref{def:stacking}: given that $\mathfrak{a}_1 = \lp \lp W_1,b_1\rp \rp$ and $\mathfrak{a}_2 = \lp \lp W_2,b_2\rp \rp$, their stacking is the neural network $\lp \lp \diag\lp W_1,W_2\rp,b_1 \frown b_2\rp\rp$, which is clearly an affine neural network.
\end{proof}
\section{Sums of ANNs of Same End-widths}
\begin{definition}[The $\cpy_{n,k}$ Network]\label{def:cpy}
We define the neural network $\cpy_{n,k} \in \neu$ for $n,k\in \N$ as the neural network given by:
\begin{align}
\cpy_{n,k} = \aff_{\underbrace{\lb \mathbb{I}_{k} \: \mathbb{I}_k \: \cdots \: \mathbb{I}_k \rb^T}_{n-\text{many}},\mymathbb{0}_{nk}}
\end{align}
where $k$ is the dimension of the vectors being copied and $n$ is the number of copies of the vector being made.
\end{definition}
\begin{remark}
See Listing \ref{affn}.
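As a purely illustrative instance of Definition \ref{def:cpy}: for $n = 2$ and $k = 2$ we have $\cpy_{2,2} = \aff_{\lb \mathbb{I}_2 \: \mathbb{I}_2\rb^T,\mymathbb{0}_{4}}$, and hence, for any $\act \in C\lp \R,\R\rp$ and $x = \lb x_1 \: x_2 \rb^\intercal \in \R^2$, it is the case that $\lp \real_{\act}\lp \cpy_{2,2}\rp\rp\lp x\rp = \lb x_1 \: x_2\: x_1 \: x_2\rb^\intercal$. The network $\sm_{n,k}$ defined next acts analogously in the opposite direction, adding $n$ stacked vectors of size $k$.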
\end{remark}
\begin{lemma}\label{dep_cpy}\label{lem:param_cpy}
Let $n,k \in \N$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item $\dep \lp \cpy_{n,k} \rp = 1$
\item $\param\lp \cpy_{n,k} \rp = nk^2+nk$
\end{enumerate}
\end{lemma}
\begin{proof}
Note that $(i)$ is a consequence of Definition \ref{5.3.1}, and (ii) follows from the structure of $\cpy_{n,k}$: its weight matrix has $nk \cdot k$ entries and its bias vector has $nk$ entries.
\end{proof}
\begin{definition}[The $\sm_{n,k}$ Network]\label{def:sm}
We define the neural network $\sm_{n,k}$ for $n,k \in \N$ as the neural network given by:
\begin{align}
\sm_{n,k} = \aff_{\underbrace{\lb \mathbb{I}_k \: \mathbb{I}_k \: \cdots \: \mathbb{I}_k\rb}_{n-\text{many}}, \mymathbb{0}_{k}}
\end{align}
where $k$ is the dimension of the vectors being added and $n$ is the number of vectors being added.
\end{definition}
\begin{remark}
See again Listing \ref{affn}.
\end{remark}
\begin{lemma}\label{lem:5.5.4}\label{lem:param_sm}
Let $n,k \in \N$. It is then the case that:
\begin{enumerate}[label = (\roman*)]
\item $\dep \lp \sm_{n,k} \rp = 1$
\item $\param\lp \sm_{n,k} \rp = nk^2+k$
\end{enumerate}
\end{lemma}
\begin{proof}
(i) is a consequence of Definition $\ref{5.3.1}$, and (ii) follows from the structure of $\sm_{n,k}$: its weight matrix has $k \cdot nk$ entries and its bias vector has $k$ entries.
\end{proof}
\begin{definition}[Sum of ANNs of the same depth and same end widths]\label{def:nn_sum}
Let $u,v \in \Z$ with $u \leqslant v$. Let $\nu_u,\nu_{u+1},...,\nu_v \in \neu$ satisfy for all $i \in \Z \cap [u,v]$ that $\dep(\nu_i) = \dep(\nu_u)$, $\inn(\nu_i) = \inn(\nu_u)$, and $\out(\nu_i) = \out(\nu_u)$. We then denote by $\oplus^v_{i=u} \nu_i$, or alternatively $\nu_u \oplus\nu_{u+1} \oplus \hdots \oplus\nu_v$, the neural network given by:
\begin{align}\label{5.4.3}
\oplus^v_{i=u}\nu_i \coloneqq \lp \sm_{v-u+1,\out(\nu_u)} \bullet \lb \boxminus^v_{i=u}\nu_i \rb \bullet \cpy_{(v-u+1),\inn(\nu_u)} \rp
\end{align}
\end{definition}
\begin{remark}
For an \texttt{R} implementation, see Listing \ref{nn_sum}.
\end{remark}
\begin{remark}
We may diagrammatically refer to this network as:
\begin{figure}[h]
\begin{center}
\tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt
\begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1]
%uncomment if require: \path (0,433); %set diagram left start at 0, and has height of 433
%Shape: Rectangle [id:dp9509582141653736]
\draw (470,170) -- (540,170) -- (540,210) -- (470,210) -- cycle ;
%Shape: Rectangle [id:dp042468147108538634]
\draw (330,100) -- (400,100) -- (400,140) -- (330,140) -- cycle ;
%Shape: Rectangle [id:dp46427980442406214]
\draw (330,240) -- (400,240) -- (400,280) -- (330,280) -- cycle ;
%Straight Lines [id:da8763809527154822]
\draw (470,170) -- (401.63,121.16) ;
\draw [shift={(400,120)}, rotate = 35.54] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ;
%Straight Lines [id:da9909123473315302]
\draw (470,210) -- (401.63,258.84) ;
\draw [shift={(400,260)}, rotate = 324.46] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ;
%Straight Lines [id:da8497218496635237]
\draw (570,190) -- (542,190) ;
\draw [shift={(540,190)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) ..
(0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Shape: Rectangle [id:dp11197066111784415] \draw (210,170) -- (280,170) -- (280,210) -- (210,210) -- cycle ; %Straight Lines [id:da5201326815013356] \draw (330,120) -- (281.41,168.59) ; \draw [shift={(280,170)}, rotate = 315] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da4370325799656589] \draw (330,260) -- (281.41,211.41) ; \draw [shift={(280,210)}, rotate = 45] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da012890543438617508] \draw (210,190) -- (182,190) ; \draw [shift={(180,190)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; % Text Node \draw (481,182.4) node [anchor=north west][inner sep=0.75pt] {$\mathsf{Cpy}_{n}{}_{,}{}_{k}$}; % Text Node \draw (351,110.4) node [anchor=north west][inner sep=0.75pt] {$\nu _{1}$}; % Text Node \draw (351,252.4) node [anchor=north west][inner sep=0.75pt] {$\nu _{2}$}; % Text Node \draw (574,180.4) node [anchor=north west][inner sep=0.75pt] {$x$}; % Text Node \draw (441,132.4) node [anchor=north west][inner sep=0.75pt] {$x$}; % Text Node \draw (437,232.4) node [anchor=north west][inner sep=0.75pt] {$x$}; % Text Node \draw (221,180.4) node [anchor=north west][inner sep=0.75pt] {$\mathsf{Sum}_{n}{}_{,}{}_{k}$}; \end{tikzpicture} \end{center} \caption{Neural Network diagram of a neural network sum.} \end{figure} \end{remark} \subsection{Neural Network Sum Properties} \begin{lemma}\label{paramsum} Let $\nu_1, \nu_2 \in \neu$ satisfy that $\dep(\nu_1) = \dep(\nu_2) = L$, $\inn(\nu_1) = \inn(\nu_2)$, and $\out(\nu_1) = \out(\nu_2)$, and $\lay(\nu_1) = \lp l_{1,1},l_{1,2},...l_{1,L} \rp$ and $\lay \lp \nu_2 \rp = \lp l_{2,1}, l_{2,2},...,l_{2,L} \rp $ it is then the case that: \begin{align} \param \lp \nu_1 \oplus \nu_2 \rp &= \param \lp \aff_{\lb \mathbb{I}_{\out(\nu_2)} \: \mathbb{I}_{\out(\nu_2)}\rb, \mymathbb{0}_{\out(\nu_2)} }\bullet \lb \nu_1 \boxminus \nu_2\rb \bullet \aff_{\lb\mathbb{I}_{\inn(\nu_1)}\: \mathbb{I}_{\inn(\nu_1)}\rb^T,\mymathbb{0}_{2\cdot\inn(\nu_1)}} \rp \\ &\les \frac{1}{2}\lp \param \lp \nu_1\rp + \param \lp \nu_2\rp\rp^2\nonumber \end{align} \end{lemma} \begin{proof} Note that by Lemma \ref{paramofparallel} we have that: \begin{align} \param \lp \nu_1 \boxminus \nu_2 \rp = \frac{1}{2}\lp \param \lp \nu_1\rp + \param \lp \nu_2\rp\rp^2 \end{align} Note also that since $\cpy$ and $\sm$ are affine neural networks, from Corollary \ref{affcor} we get that: \begin{align}\label{(5.5.6)} \param \lp \lb \nu_1 \boxminus \nu_2\rb \bullet \cpy_{2,\inn(\nu_1)}\rp &\les \max \left\{ 1, \frac{\inn\lp \nu_1\rp+1}{2\inn\lp \nu_1\rp+1}\right\} \frac{1}{2}\lp \param \lp \nu_1\rp + \param \lp \nu_2\rp\rp^2 \nonumber\\ &= \frac{1}{2}\lp \param \lp \nu_1\rp + \param \lp \nu_2\rp\rp^2 \end{align} and further that: \begin{align} \param \lp \sm_{2,\out \lp \nu_1 \boxminus \nu_2\rp} \bullet \lb \nu_1 \boxminus \nu_2\rb \bullet \cpy_{2,\inn\lp \nu_1\rp} \rp &\les \lb \max\left\{ 1, \frac{\out \lp \aff_{W,b}\rp}{2\out\lp \nu_1\rp}\right\}\rb \frac{1}{2} \lp \param \lp \nu_1\rp + \param \lp \nu_2 \rp\rp^2 \nonumber \\ &= \frac{1}{2}\lp \param 
\lp \nu_1\rp + \param \lp \nu_2\rp\rp^2
\end{align}
\end{proof}
\begin{corollary}\label{corsum}
Let $n\in \N$. Let $\nu_1,\nu_2,...,\nu_n \in \neu$ satisfy that $\lay \lp \nu_1\rp = \lay \lp \nu_2\rp= \cdots =\lay \lp \nu_n\rp$. It is then the case that:
\begin{align}
\param \lp \bigoplus_{i=1}^n \nu_i\rp \les n^2 \cdot \param \lp \nu_1\rp
\end{align}
\end{corollary}
\begin{proof}
Let $\lay \lp \nu_1\rp = \lp l_0,l_1,...,l_L\rp$ where for all $i \in \{0,1,...,L \}$ it is the case that $l_i,L \in \N$. Corollary \ref{cor:sameparal} then tells us that:
\begin{align}
\param \lp \boxminus_{i=1}^n \nu_i\rp \les n^2 \param \lp \nu_1\rp
\end{align}
Then from Corollary \ref{affcor} and (\ref{(5.5.6)}) we get that:
\begin{align}
\param \lp \lb \boxminus_{i=1}^n \nu_i \rb\bullet \cpy_{n,\inn\lp \nu_1\rp}\rp \les n^2\param \lp \nu_1\rp
\end{align}
And further that:
\begin{align}
\param \lp \sm_{n,\out \lp \nu_1\rp} \bullet \lb \boxminus_{i=1}^n \nu_i\rb \bullet \cpy_{n,\inn\lp \nu_1\rp} \rp \les n^2\param \lp\nu_1 \rp
\end{align}
\end{proof}
\begin{lemma}\label{depth_prop}
Let $\nu_1, \nu_2 \in \neu$ satisfy that $\dep(\nu_1) = \dep(\nu_2) = L$, $\inn(\nu_1) = \inn(\nu_2)$, and $\out(\nu_1) = \out(\nu_2)$, and let $\lay(\nu_1) = \lp l_{1,0},l_{1,1},l_{1,2},...,l_{1,L} \rp$ and $\lay \lp \nu_2 \rp = \lp l_{2,0},l_{2,1}, l_{2,2},...,l_{2,L} \rp $. It is then the case that:
\begin{align}
\dep \lp \nu_1 \oplus \nu_2 \rp =L
\end{align}
\end{lemma}
\begin{proof}
Note that $\dep \lp \cpy_{n,k} \rp = 1 = \dep\lp \sm_{n,k} \rp$ for all $n,k \in \N$. Note also that $\dep \lp \nu_1 \boxminus \nu_2 \rp = \dep\lp \nu_1 \rp = \dep \lp \nu_2 \rp $ and that for $\nu,\mu \in \neu$ it is the case that $\dep \lp \nu \bullet \mu\rp = \dep\lp \nu \rp + \dep \lp \mu \rp-1$. Thus:
\begin{align}
\dep\lp \nu_1 \oplus \nu_2 \rp &= \dep \lp \aff_{\lb \mathbb{I}_{\out(\nu_2)} \: \mathbb{I}_{\out(\nu_2)}\rb, \mymathbb{0}_{\out(\nu_2)} }\bullet \lb \nu_1 \boxminus \nu_2\rb \bullet \aff_{\lb\mathbb{I}_{\inn(\nu_1)}\: \mathbb{I}_{\inn(\nu_1)}\rb^T,\mymathbb{0}_{2\cdot\inn(\nu_1)}} \rp\nonumber \\ &= 1 + L - 1 + 1 - 1 = L\nonumber
\end{align}
\end{proof}
\begin{lemma}\label{5.4.6}
Let $\nu_1,\nu_2 \in \neu$, such that $\dep(\nu_1) = \dep(\nu_2)= L$, $\inn(\nu_1) = \inn(\nu_2) = l_0$, and $\out(\nu_1) = \out(\nu_2) = l_L$. It is then the case, for all $\act \in C\lp \R,\R\rp$, that $\real_{\act}(\nu_1 \oplus \nu_2) = \real_{\act}(\nu_2 \oplus \nu_1)$, i.e., the instantiated sum of ANNs of the same depth and same end widths is commutative.
\end{lemma}
\begin{proof}
Let $\nu_1 = \lp (W_1,b_1),(W_2,b_2),...,(W_L,b_L) \rp$ and let $\nu_2 = \lp (W'_1,b'_1),(W'_2,b'_2),...,(W_L', b_L') \rp $. Note that Definition $\ref{5.2.5}$ then tells us that:
\begin{align}
\nu_1 \boxminus \nu_2 = \lp\lp \begin{bmatrix} W_1 & 0 \\ 0 & W_1' \end{bmatrix}, \begin{bmatrix} b_1 \\ b_1' \end{bmatrix} \rp,\lp \begin{bmatrix} W_2 & 0 \\ 0 & W_2' \end{bmatrix}, \begin{bmatrix} b_2\\ b_2' \end{bmatrix}\rp,..., \right. \nonumber\\ \left.
\lp \begin{bmatrix} W_L & 0 \\ 0 & W_L' \end{bmatrix}, \begin{bmatrix} b_L \\ b_L' \end{bmatrix} \rp \rp \nonumber \end{align} Note also that by Claims $\ref{5.4.4}$ and $\ref{5.4.5}$ and Definition \ref{5.3.1} we know that: \begin{align} \aff_{\lb \mathbb{I}_{\inn \lp \nu_2 \rp } \: \mathbb{I}_{\inn \lp \nu_2 \rp} \rb^T,\mymathbb{0}_{2\inn(\nu_2),1}} = \lp \begin{bmatrix} \mathbb{I}_{\inn(\nu_2)} \\ \mathbb{I}_{\inn(\nu_2)} \end{bmatrix},\mymathbb{0}_{2\inn(\nu_2),1}\rp \end{align} and: \begin{align} \aff_{\lb \mathbb{I}_{\out(\nu_1)} \: \mathbb{I}_{\out(\nu_1)} \rb,\mymathbb{0}_{2\out(\nu_1),1}} = \lp \begin{bmatrix} \mathbb{I}_{\out(\nu_1)} \: \mathbb{I}_{\out(\nu_1)} \end{bmatrix} ,\mymathbb{0}_{2\out(\nu_1),1} \rp \end{align} Applying Definition \ref{5.2.1}, specifically the second case, (\ref{5.4.3}) and ($\ref{5.4.4}$) yields that: \begin{align} &\lb \nu_1 \boxminus \nu_2 \rb \bullet \aff_{\lb \mathbb{I}_{\inn \lp \nu_2 \rp } \: \mathbb{I}_{\inn \lp \nu_2 \rp} \rb^T,\mymathbb{0}_{2\inn(\nu_2),1}} \nonumber \\ &= \lp \lp \begin{bmatrix} W_1 & 0 \\ 0 & W'_1 \end{bmatrix} \begin{bmatrix} \mathbb{I}_{\inn(\nu_1)} \\ \mathbb{I}_{\inn(\nu_1)} \end{bmatrix}, \begin{bmatrix} b_1 \\ b_1' \end{bmatrix} \rp, \lp \begin{bmatrix} W_2 & 0 \\ 0 & W'_2 \end{bmatrix}, \begin{bmatrix} b_2 \\ b_2' \end{bmatrix} \rp \right.,..., \nonumber \left. \lp \begin{bmatrix} W_L & 0 \\ 0 & W'_L \end{bmatrix}, \begin{bmatrix} b_L \\ b_L' \end{bmatrix} \rp \rp \nonumber \\ &= \lp \lp \begin{bmatrix} W_1\\ W'_1 \end{bmatrix} ,\begin{bmatrix} b_1 \\ b_1' \end{bmatrix} \rp, \lp \begin{bmatrix} W_2 & 0\\ 0 & W'_2 \end{bmatrix}, \begin{bmatrix} b_2 \\ b_2' \end{bmatrix} \rp \right.,..., \nonumber \left. \lp \begin{bmatrix} W_L & 0\\ 0 & W'_L \end{bmatrix}, \begin{bmatrix} b_L \\ b_L' \end{bmatrix} \rp \rp \nonumber \end{align} Applying Claim \ref{5.4.5} and especially the third case of Definition \ref{5.2.1} to to the above then gives us: \begin{align} &\aff_{\lb \mathbb{I}_{\out(\nu_1)} \: \mathbb{I}_{\out(\nu_1)} \rb,0}\bullet \lb \nu_1 \boxminus \nu_2 \rb \bullet \aff_{\lb \mathbb{I}_{\inn \lp \nu_2 \rp } \: \mathbb{I}_{\inn \lp \nu_2 \rp} \rb^T,0} = \nonumber\\ &\lp \lp \begin{bmatrix} W_1 \\ W'_1 \end{bmatrix} ,\begin{bmatrix} B_1 \\ B_1' \end{bmatrix} \rp, \lp \begin{bmatrix} W_2 & 0\\ 0 & W'_2 \end{bmatrix} ,\begin{bmatrix} b_2 \\ b_2' \end{bmatrix} \rp \right.,..., \nonumber \\ &\left. \lp \begin{bmatrix} \mathbb{I}_{\out(\nu_2)} \: \mathbb{I}_{\out(\nu_2)} \end{bmatrix}\begin{bmatrix} W_L & 0 \\ 0 & W'_L \end{bmatrix}, \begin{bmatrix} \mathbb{I}_{\out(\nu_2)} \: \mathbb{I}_{\out(\nu_2)} \end{bmatrix}\begin{bmatrix} b_L \\ b_L' \end{bmatrix} \rp \rp \nonumber \\ & =\lp \lp \begin{bmatrix} W_1\\ W'_1 \end{bmatrix} ,\begin{bmatrix} b_1 \\ b_1' \end{bmatrix} \rp, \lp \begin{bmatrix} W_2 & 0\\ 0 & W'_2 \end{bmatrix}, \begin{bmatrix} b_2 \\ b_2' \end{bmatrix} \rp \right.,..., \left. \lp \begin{bmatrix} W_L \quad W'_L \label{5.4.10} \end{bmatrix}, b_L + b_L' \rp \rp \end{align} Now note that: \begin{align} \nu_2 \boxminus \nu_1 = \lp\lp \begin{bmatrix} W_1' & 0 \\ 0 & W_1 \end{bmatrix}, \begin{bmatrix} b_1' \\ b_1 \end{bmatrix} \rp,\lp \begin{bmatrix} W_2' & 0 \\ 0 & W_2 \end{bmatrix}, \begin{bmatrix} b_2'\\ b_2 \end{bmatrix}\rp,..., \right. \nonumber\\ \left. 
\lp \begin{bmatrix} W_L' & 0 \\ 0 & W_L \end{bmatrix}, \begin{bmatrix} b_L' \\ b_L \end{bmatrix} \rp \rp \nonumber \end{align} And thus: \begin{align} &\aff_{\lb \mathbb{I}_{\out(\nu_2)} \: \mathbb{I}_{\out(\nu_2)} \rb,0}\bullet \lb \nu_2 \boxminus \nu_1 \rb \bullet \aff_{\lb \mathbb{I}_{\inn \lp \nu_1 \rp } \: \mathbb{I}_{\inn \lp \nu_1 \rp} \rb^T,0} \nonumber\\ &= \lp \lp \begin{bmatrix} W'_1\\ W_1 \end{bmatrix} ,\begin{bmatrix} b'_1 \\ b_1 \end{bmatrix} \rp, \lp \begin{bmatrix} W'_2 & 0\\ 0 & W_2 \end{bmatrix}, \begin{bmatrix} b'_2 \\ b_2 \end{bmatrix} \rp \right.,..., \left. \lp \begin{bmatrix} W'_L \quad W_L \end{bmatrix}, \begin{bmatrix} b_L' + b_L \end{bmatrix} \rp \rp \label{5.4.11} \end{align} Let $x \in \R^{\inn(\nu_1)}$, note then that: \begin{align} \begin{bmatrix} W_1 \\ W'_1 \end{bmatrix}x + \begin{bmatrix} b_1\\ b'_1 \end{bmatrix} = \begin{bmatrix} W_1x+b_1 \\ W'_1x+b_1' \end{bmatrix} \nonumber \end{align} The full instantiation of (\ref{5.4.10}) with activation function $\fa \in C \lp \R, \R\rp$ is then given by: \begin{align} \begin{bmatrix} W_L \quad W'_L \end{bmatrix}\begin{bmatrix} \act\lp W_{L-1}(...\act(W_2\lp \act\lp W_1x+b_1 \rp\rp + b_2) + ... )+ b_{L-1} \rp\\ \act\lp W'_{L-1}(...\act(W'_2\lp\act\lp W'_1x + b'_1 \rp \rp + b'_2)+...)+b'_{L-1}\rp \end{bmatrix} + b_L+b'_L \label{5.4.12} \end{align} The full instantiation of (\ref{5.4.11}) is then given by: \begin{align} \begin{bmatrix} W_L' \quad W_L \end{bmatrix}\begin{bmatrix} \act\lp W'_{L-1}(...\act(W'_2\lp \act \lp W'_1x+b'_1 \rp\rp + b'_2) + ... )+ b'_{L-1}\rp \\ \act\lp W_{L-1}(...\act(W_2 \lp\act\lp W_1x + b_1 \rp\rp + b_2)+...)+b_{L-1} \rp \end{bmatrix} + b_L+b'_L \label{5.4.13} \end{align} Since (\ref{5.4.12}) and (\ref{5.4.13}) are the same this proves that $\nu_1 \oplus \nu_2 = \nu_2 \oplus \nu_1$. \end{proof} \begin{remark} This is a special case of \cite[Lemma~3.28]{Grohs_2022}. \end{remark} \begin{lemma}\label{5.4.7} Let $ l_0,l_1,...,l_L \in \N$. Let $\nu \in \neu$ with $\lay(\nu) = \lp l_0,l_1,...,l_L \rp$. There then exists a neural network $\zero_{l_0,l_1,...,l_L} \in \neu$ such that $\real(\nu \oplus \zero_{l_0,l_1,...,l_L}) = \real(\zero_{l_0,l_1,...,l_L} \oplus \nu) = \nu $. \end{lemma} \begin{proof} Let $\nu = \lp \lp W_1, b_1 \rp, \lp W_2, b_2 \rp,..., \lp W_L,b_L \rp \rp$, where $W_1 \in \R^{l_1\times l_0}$, $b_1 \in \R^{l_1}$, $W_2 \in \R^{l_2 \times l_1}$, $b_2 \in \R^{l_2},...,W_L \in \R^{l_L \times l_{L-1}}$, $b_L \in \R^{l_L}$. Denote by $\zero_{l_0,l_1,...,l_L}$ the neural network which for all $l_0,l_1,...,l_L \in \N$ is given by: \begin{align} \zero_{l_0,l_1,...,l_L} = \lp \lp \mymathbb{0}_{l_1, l_0}, \mymathbb{0}_{l_1} \rp, \lp \mymathbb{0}_{l_2,l_1},\mymathbb{0}_{l_2} \rp,...,\lp \mymathbb{0}_{l_{L},l_{L-1}}, \mymathbb{0}_{l_L} \rp \rp \end{align} Thus, by (\ref{5.4.12}), we have that: \begin{align} \real(\zero_{l_0,l_1,...,l_L} \oplus\nu) &= \begin{bmatrix} 0 \quad W_L \end{bmatrix} \begin{bmatrix} 0 \nonumber \\ W_{L-1}(...(W_2 \lp W_1x+b_1 \rp +b_2)+...)+b_{L-1} \end{bmatrix} + b_L \\ &= W_L(W_{L-1}(...W_2\lp W_1x+b_1 \rp +b_2)+...)+b_{L-1})+b_L \end{align} \begin{align} \real(\nu \oplus \zero_{l_0,l_1,...,l_L}) &= \begin{bmatrix} W_L \quad 0 \end{bmatrix} \begin{bmatrix} W_{L-1}(...(W_2 \lp W_1x+b_1 \rp +b_2)+...)+b_{L-1} \nonumber \\ 0 \end{bmatrix} + b_L \\ &= W_L(W_{L-1}(...W_2\lp W_1x+b_1 \rp +b_2)+...)+b_{L-1})+b_L \end{align} And finally: \begin{align}\label{5.4.17} \real(\nu) = W_L(W_{L-1}(...W_2\lp W_1x+b_1 \rp +b_2)+...)+b_{L-1})+b_L \end{align} This completes the proof. 
\end{proof}
\begin{lemma}\label{5.4.8} Let $\nu_1,\nu_2,\nu_3 \in \neu$ have the same depth $L$, the same starting width $l_0$, and the same finishing width $l_L$. It is then the case, for all $\act \in C\lp \R,\R\rp$, that $\real_{\act}\lp \lp \nu_1 \oplus \nu_2 \rp \oplus \nu_3 \rp = \real_{\act} \lp \nu_1 \oplus \lp \nu_2 \oplus \nu_3 \rp \rp$, i.e. the operation $\oplus$ is associative under instantiation with a continuous activation function.
\end{lemma}
\begin{proof} Let $\nu_1 = \lp \lp W^1_1,b^1_1 \rp, \lp W^1_2,b^1_2 \rp, ..., \lp W^1_L,b^1_L \rp \rp$, $\nu_2 = \lp \lp W^2_1,b^2_1 \rp, \lp W^2_2,b^2_2 \rp,..., \lp W^2_L, b^2_L \rp \rp$, and $\nu_3 = \lp \lp W^3_1,b^3_1 \rp ,\lp W^3_2,b^3_2 \rp,..., \lp W^3_L,b^3_L \rp \rp$. Then (\ref{5.4.12}) tells us that:
\begin{align} \lp \real_{\act}\lp \nu_1 \oplus \nu_2\rp \rp \lp x \rp =\begin{bmatrix} W^1_L \quad W^2_L \end{bmatrix}\begin{bmatrix} \act\lp W^1_{L-1}\lp...\act\lp W^1_2\act\lp W^1_1x+b^1_1 \rp + b^1_2 \rp + ... \rp + b^1_{L-1}\rp \\ \act\lp W^2_{L-1}\lp...\act\lp W^2_2 \act\lp W^2_1x + b^2_1 \rp + b^2_2 \rp +...\rp+b^2_{L-1}\rp \end{bmatrix} + b^1_L+b^2_L \nonumber \end{align}
And thus:
\begin{align}\label{5.4.18} &\lp \real_{\act} \lp \lp \nu_1 \oplus \nu_2 \rp \oplus \nu_3 \rp \rp \lp x \rp = \nonumber\\ &\begin{bmatrix} \mathbb{I}_{l_L} \quad W^3_L \end{bmatrix}\begin{bmatrix} \begin{bmatrix} W^1_L \quad W^2_L \end{bmatrix}\begin{bmatrix} \act\lp W^1_{L-1}\lp...\act\lp W^1_2\act\lp W^1_1x+b^1_1 \rp + b^1_2 \rp + ... \rp + b^1_{L-1}\rp \\ \act\lp W^2_{L-1}\lp...\act\lp W^2_2 \act\lp W^2_1x + b^2_1 \rp + b^2_2 \rp +...\rp+b^2_{L-1}\rp \end{bmatrix} + b^1_L+b^2_L \\ \act\lp W^3_{L-1}\lp...\act\lp W^3_2 \act\lp W^3_1x + b^3_1 \rp + b^3_2 \rp +...\rp+b^3_{L-1}\rp \end{bmatrix} +b^3_L \end{align}
Similarly, we have that:
\begin{align}\label{5.4.19} &\lp \real_{\act} \lp \nu_1 \oplus \lp \nu_2 \oplus \nu_3 \rp \rp \rp \lp x \rp = \nonumber\\ &\begin{bmatrix} W^1_L & \mathbb{I}_{l_L} \end{bmatrix}\begin{bmatrix} \act\lp W^1_{L-1}\lp...\act\lp W^1_2\act\lp W^1_1x+b^1_1 \rp + b^1_2 \rp + ... \rp + b^1_{L-1}\rp \\ \begin{bmatrix} W^2_L \quad W^3_L \end{bmatrix}\begin{bmatrix} \act\lp W^2_{L-1}\lp...\act\lp W^2_2\act\lp W^2_1x+b^2_1 \rp + b^2_2 \rp + ... \rp + b^2_{L-1}\rp \\ \act\lp W^3_{L-1}\lp...\act\lp W^3_2 \act\lp W^3_1x + b^3_1 \rp + b^3_2 \rp +...\rp+b^3_{L-1}\rp \end{bmatrix} + b^2_L+b^3_L \end{bmatrix} +b^1_L \end{align}
Expanding the block matrix products shows that (\ref{5.4.18}) and (\ref{5.4.19}) are the same, which completes the proof. \end{proof}
\begin{definition}[Commutative Semi-group] A set $X$ equipped with a binary operation $*$ is called a commutative semi-group if:
\begin{enumerate}[label = (\roman*)] \item for all $x,y,z \in X$ it is the case that $(x *y)*z = x*(y*z)$, and \item for all $x,y \in X$ it is the case that $x*y=y*x$ \end{enumerate}
\end{definition}
\begin{theorem} For a fixed depth and fixed end-widths, the instantiations of neural networks $\nu \in \neu$ form a commutative semi-group under the operation $\oplus$.
\end{theorem}
\begin{proof} This is a consequence of Lemmas \ref{5.4.6}, \ref{5.4.7}, and \ref{5.4.8}. \end{proof}
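\begin{remark}
The operations above translate directly into code. What follows is a minimal, standalone \texttt{R} sketch, independent of the Listings referenced elsewhere in this work and using hypothetical helper names such as \texttt{nn\_sum} and \texttt{inst}, of the sum $\oplus$ of two networks of equal depth and equal end-widths, together with numerical checks of commutativity (Lemma \ref{5.4.6}), associativity (Lemma \ref{5.4.8}), and the fact, established in Lemma \ref{5.5.11} below, that instantiation turns $\oplus$ into pointwise addition of functions.
\begin{verbatim}
# Sketch only: a network is a list of layers, each layer a list(W, b).
# Hypothetical helpers (not the Listings in the appendix); assumes depth >= 2.
random_nn <- function(layout) {
  # layout = c(l0, l1, ..., lL); builds a network with that architecture
  L <- length(layout) - 1
  lapply(seq_len(L), function(k)
    list(W = matrix(rnorm(layout[k + 1] * layout[k]), layout[k + 1], layout[k]),
         b = rnorm(layout[k + 1])))
}
inst <- function(nu, x, act = function(z) pmax(z, 0)) {
  # instantiation of nu with activation act (ReLU by default)
  L <- length(nu)
  for (k in seq_len(L - 1)) x <- act(nu[[k]]$W %*% x + nu[[k]]$b)
  drop(nu[[L]]$W %*% x + nu[[L]]$b)
}
nn_sum <- function(nu, mu) {
  # the sum of two networks of equal depth and equal end-widths:
  # stack the first layers, block-diagonal the hidden layers, concatenate the last
  L <- length(nu)
  out <- vector("list", L)
  out[[1]] <- list(W = rbind(nu[[1]]$W, mu[[1]]$W), b = c(nu[[1]]$b, mu[[1]]$b))
  if (L > 2) for (k in 2:(L - 1)) out[[k]] <- list(
    W = rbind(cbind(nu[[k]]$W, matrix(0, nrow(nu[[k]]$W), ncol(mu[[k]]$W))),
              cbind(matrix(0, nrow(mu[[k]]$W), ncol(nu[[k]]$W)), mu[[k]]$W)),
    b = c(nu[[k]]$b, mu[[k]]$b))
  out[[L]] <- list(W = cbind(nu[[L]]$W, mu[[L]]$W), b = nu[[L]]$b + mu[[L]]$b)
  out
}

set.seed(1)
nu1 <- random_nn(c(3, 5, 4, 2)); nu2 <- random_nn(c(3, 7, 6, 2))
nu3 <- random_nn(c(3, 2, 3, 2)); x <- rnorm(3)

# instantiation of the sum equals the sum of instantiations
all.equal(inst(nn_sum(nu1, nu2), x), inst(nu1, x) + inst(nu2, x))
# commutativity and associativity hold at the level of instantiations
all.equal(inst(nn_sum(nu1, nu2), x), inst(nn_sum(nu2, nu1), x))
all.equal(inst(nn_sum(nn_sum(nu1, nu2), nu3), x),
          inst(nn_sum(nu1, nn_sum(nu2, nu3)), x))
\end{verbatim}
Note that, as tuples, $\nu_1 \oplus \nu_2$ and $\nu_2 \oplus \nu_1$ differ by a reordering of blocks; equality only holds after instantiation, which is what the checks above probe.
\end{remark}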
\begin{lemma}\label{5.5.11}\label{nn_sum_is_sum_nn} Let $\nu, \mu \in \neu$ have the same depth and the same end-widths. It is then the case, for all $\act \in C\lp \R,\R\rp$, that $\real_{\act} \lp \nu \oplus \mu \rp = \real_{\act}\lp \nu \rp + \real_{\act}\lp \mu \rp$.
\end{lemma}
\begin{proof} Let $\nu = \lp \lp W_1,b_1 \rp, \lp W_2,b_2 \rp,...,\lp W_L,b_L \rp \rp$ and $\mu = \lp \lp W'_1,b'_1 \rp, \lp W_2',b_2'\rp,...,\lp W_L',b_L' \rp \rp $. Note now that by Definition \ref{def:inst} we have that:
\begin{align}\label{5.5.20} \lp \real_{\act}\lp \nu \rp \rp \lp x \rp = W_L \act \lp W_{L-1}(...\act (W_2 \act \lp W_1x+b_1 \rp + b_2) + ... )+ b_{L-1}\rp + b_L \end{align}
And:
\begin{align} \lp \real_{\act}\lp \mu \rp \rp \lp x \rp = W'_L\act \lp W'_{L-1}(...\act (W_2'\act \lp W_1'x+b_1' \rp + b_2') + ... )+ b_{L-1}'\rp + b_L' \end{align}
In addition, by (\ref{5.4.12}), because of the block matrix structure of the weights of our summands:
\begin{align} \real_{\act}\lp \nu \oplus \mu \rp \lp x \rp&=\begin{bmatrix} W_L \quad W'_L \end{bmatrix}\begin{bmatrix} \act \lp W_{L-1}(...\act ( W_2\act \lp W_1x+b_1 \rp + b_2) + ... )+ b_{L-1} \rp \\ \act \lp W'_{L-1}(...\act( W'_2 \act \lp W'_1x + b'_1 \rp + b'_2)+...)+b'_{L-1} \rp \end{bmatrix} + b_L+b'_L \nonumber\\ &= W_L \act \lp W_{L-1}(...\act (W_2 \act \lp W_1x+b_1 \rp + b_2) + ... )+ b_{L-1}\rp + b_L \nonumber \\ &+ W'_L\act \lp W'_{L-1}(...\act (W_2'\act \lp W_1'x+b_1' \rp + b_2') + ... )+ b_{L-1}'\rp + b_L' \nonumber \\ &=\real_{\act} \lp \nu\rp \lp x \rp + \real_{\act} \lp \mu\rp \lp x \rp \end{align}
This proves the lemma. \end{proof}
\begin{lemma}\label{nn_sum_cont} Let $n\in \N$ and let $\nu_1,\nu_2,...,\nu_n \in \neu$ have the same depth and the same end-widths. It is then the case, for all $\act \in C\lp \R, \R\rp$, that:
\begin{align} \real_{\act}\lp \bigoplus^n_{i=1} \nu_i \rp = \sum^n_{i=1} \real_{\act} \lp \nu_i\rp \end{align}
\end{lemma}
\begin{proof} This is a consequence of a finite number of applications of Lemma \ref{5.5.11}. \end{proof}
\subsection{Sum of ANNs of Unequal Depth But Same End-widths}
\begin{definition}[Sum of ANNs of different depths but same end-widths] Let $n\in \N$ and let $\nu_1,\nu_2,...,\nu_n \in \neu$ have the same end-widths. We define the neural network $\dplus_{i=1}^n\nu_i \in \neu$, the sum of neural networks of unequal depth, as:
\begin{align} \dplus^n_{i=1}\nu_i \coloneqq \lp \sm_{n,\out(\nu_1)} \bullet \lb \DDiamond^n_{i=1}\nu_i \rb \bullet \cpy_{n,\inn(\nu_1)} \rp \end{align}
\end{definition}
\begin{lemma}\label{lem:diamondplus} Let $\nu_1,\nu_2 \in \neu$ have the same end-widths. It is then the case, for all $x \in \R^{\inn(\nu_1)}$, that:
\begin{align} \real_{\rect}\lp \nu_1 \dplus \nu_2\rp \lp x\rp = \real_{\rect}\lp \nu_1\rp \lp x \rp + \real_{\rect}\lp \nu_2\rp \lp x \rp \end{align}
\end{lemma}
\begin{proof} Note that Lemma \ref{6.2.2} tells us that for all $n\in \N$ and all $x$ it is the case that $\real_{\rect} \lp \tun_n\rp \lp x\rp = x$. This, combined with Lemma \ref{comp_prop}, tells us that for all $n\in \N$ and all $\nu \in \neu$ it is the case that:
\begin{align} \real_{\rect} \lp \tun_n \bullet \nu \rp \lp x\rp = \real_{\rect} \lp \nu \rp \lp x\rp \end{align}
Thus, this means that:
\begin{align} \real_{\rect} \lp \nu_1 \dplus \nu_2\rp \lp x \rp &= \real_{\rect}\lp \sm_{2,\out(\nu_1)} \bullet \lb \nu_1 \DDiamond \nu_2\rb \bullet \cpy_{2,\inn(\nu_1)} \rp \lp x \rp \nonumber\\ &= \real_{\rect}\lp \nu_1\rp \lp x \rp + \real_{\rect} \lp \nu_2\rp \lp x \rp \end{align}
This then proves the lemma. \end{proof}
\begin{lemma} Let $n \in \N$ and let $\nu_1,\nu_2,...,\nu_n \in \neu$ have the same end-widths. It is then the case, for all $x \in \R^{\inn(\nu_1)}$, that:
\begin{align} \real_{\rect}\lp \dplus^n_{i=1}\nu_i\rp \lp x\rp = \sum^n_{i=1}\real_{\rect}\lp \nu_i\rp \lp x \rp \end{align}
\end{lemma}
\begin{proof} This is a consequence of a finite number of applications of Lemma \ref{lem:diamondplus}.
\end{proof} \begin{remark} We may represent this kind of sum as the neural network diagram shown below: \begin{figure}[h] \begin{center} \tikzset{every picture/.style={line width=0.75pt}} %set default line width to 0.75pt \begin{tikzpicture}[x=0.75pt,y=0.75pt,yscale=-1,xscale=1] %uncomment if require: \path (0,433); %set diagram left start at 0, and has height of 433 %Shape: Rectangle [id:dp9509582141653736] \draw (470,170) -- (540,170) -- (540,210) -- (470,210) -- cycle ; %Shape: Rectangle [id:dp042468147108538634] \draw (200,100) -- (400,100) -- (400,140) -- (200,140) -- cycle ; %Shape: Rectangle [id:dp46427980442406214] \draw (330,240) -- (400,240) -- (400,280) -- (330,280) -- cycle ; %Straight Lines [id:da8763809527154822] \draw (470,170) -- (401.63,121.16) ; \draw [shift={(400,120)}, rotate = 35.54] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da9909123473315302] \draw (470,210) -- (401.63,258.84) ; \draw [shift={(400,260)}, rotate = 324.46] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da8497218496635237] \draw (570,190) -- (542,190) ; \draw [shift={(540,190)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Shape: Rectangle [id:dp11197066111784415] \draw (80,170) -- (150,170) -- (150,210) -- (80,210) -- cycle ; %Straight Lines [id:da5201326815013356] \draw (200,130) -- (151.56,168.75) ; \draw [shift={(150,170)}, rotate = 321.34] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da4370325799656589] \draw (330,260) -- (312,260) ; \draw [shift={(310,260)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Straight Lines [id:da012890543438617508] \draw (80,190) -- (52,190) ; \draw [shift={(50,190)}, rotate = 360] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. (10.93,3.29) ; %Shape: Rectangle [id:dp2321426611089945] \draw (200,240) -- (310,240) -- (310,280) -- (200,280) -- cycle ; %Straight Lines [id:da03278204116412775] \draw (200,260) -- (151.41,211.41) ; \draw [shift={(150,210)}, rotate = 45] [color={rgb, 255:red, 0; green, 0; blue, 0 } ][line width=0.75] (10.93,-3.29) .. controls (6.95,-1.4) and (3.31,-0.3) .. (0,0) .. controls (3.31,0.3) and (6.95,1.4) .. 
(10.93,3.29) ;
% Text Node
\draw (481,182.4) node [anchor=north west][inner sep=0.75pt] {$\mathsf{Cpy}_{n}{}_{,}{}_{k}$};
% Text Node
\draw (301,110.4) node [anchor=north west][inner sep=0.75pt] {$\nu _{1}$};
% Text Node
\draw (351,252.4) node [anchor=north west][inner sep=0.75pt] {$\nu _{2}$};
% Text Node
\draw (574,180.4) node [anchor=north west][inner sep=0.75pt] {$x$};
% Text Node
\draw (441,132.4) node [anchor=north west][inner sep=0.75pt] {$x$};
% Text Node
\draw (437,232.4) node [anchor=north west][inner sep=0.75pt] {$x$};
% Text Node
\draw (91,180.4) node [anchor=north west][inner sep=0.75pt] {$\mathsf{Sum}_{n}{}_{,}{}_{k}$};
% Text Node
\draw (238,252.4) node [anchor=north west][inner sep=0.75pt] {$\mathsf{Tun}$};
\end{tikzpicture}
\caption{Neural network diagram of a neural network sum of unequal depth networks.}
\end{center}
\end{figure}
\end{remark}
\section{Linear Combinations of ANNs and Their Properties}
\begin{definition}[Scalar left-multiplication with an ANN]\label{slm} We denote by $(\cdot ) \triangleright (\cdot ): \R \times \neu \rightarrow \neu$ the function satisfying for all $\lambda \in \R$ and $\nu \in \neu$ that $\lambda \triangleright \nu = \aff_{\lambda \mathbb{I}_{\out(\nu)},0} \bullet \nu$.
\end{definition}
\begin{definition}[Scalar right-multiplication with an ANN] We denote by $(\cdot) \triangleleft (\cdot): \neu \times \R \rightarrow \neu$ the function satisfying for all $\nu \in \neu$ and $\lambda \in \R$ that $\nu \triangleleft \lambda = \nu \bullet \aff_{\lambda \mathbb{I}_{\inn(\nu)},0}$.
\end{definition}
\begin{remark} Note that although $\lambda \in \R$ is a scalar, the object appearing in a neural network diagram is a network; properly speaking it must always be labeled $\lambda \triangleright$ or $\triangleleft\lambda$, and we shall do so whenever scalar multiplication comes up in any neural network diagrams. This is by analogy with, for example, $\log_\lambda$ or $\sqrt[\lambda ]{}$ for $\lambda \neq 0$, where the base or index $\lambda$ is always written, except for $\lambda = 10$ in the case of the logarithm and $\lambda = 2$ in the case of the root.
\end{remark}
\begin{remark} For an \texttt{R} implementation, see Listing \ref{scalar_mult}.
\end{remark}
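\begin{remark}
As a complement to Listing \ref{scalar_mult}, the following minimal standalone \texttt{R} sketch (hypothetical helper names \texttt{scalar\_left} and \texttt{scalar\_right}, with the same list-of-layers representation used in the earlier sketch) illustrates that, under the composition of an affine network with $\nu$, $\lambda \triangleright \nu$ amounts to rescaling the final layer of $\nu$, while $\nu \triangleleft \lambda$ amounts to rescaling the first weight matrix, in line with Lemmas \ref{5.6.3} and \ref{5.6.4} below.
\begin{verbatim}
# Sketch only; hypothetical helpers, not the Listings in the appendix.
random_nn <- function(layout) {
  L <- length(layout) - 1
  lapply(seq_len(L), function(k)
    list(W = matrix(rnorm(layout[k + 1] * layout[k]), layout[k + 1], layout[k]),
         b = rnorm(layout[k + 1])))
}
inst <- function(nu, x, act = function(z) pmax(z, 0)) {
  L <- length(nu)
  for (k in seq_len(L - 1)) x <- act(nu[[k]]$W %*% x + nu[[k]]$b)
  drop(nu[[L]]$W %*% x + nu[[L]]$b)
}
scalar_left <- function(lambda, nu) {   # lambda |> nu = Aff_{lambda I, 0} . nu
  L <- length(nu)                       # rescales the final layer
  nu[[L]]$W <- lambda * nu[[L]]$W
  nu[[L]]$b <- lambda * nu[[L]]$b
  nu
}
scalar_right <- function(nu, lambda) {  # nu <| lambda = nu . Aff_{lambda I, 0}
  nu[[1]]$W <- lambda * nu[[1]]$W       # rescales the first weight matrix only
  nu
}

set.seed(2)
nu <- random_nn(c(4, 6, 5, 3)); x <- rnorm(4); lambda <- -2.5
all.equal(inst(scalar_left(lambda, nu), x), lambda * inst(nu, x))  # left mult.
all.equal(inst(scalar_right(nu, lambda), x), inst(nu, lambda * x)) # right mult.
\end{verbatim}
\end{remark}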
\begin{lemma}\label{5.6.3} Let $\lambda \in \R$ and $\nu \in \neu$. It is then the case that:
\begin{enumerate}[label = (\roman*)] \item $\lay(\lambda \triangleright \nu) = \lay(\nu)$ \item For all $\act \in C(\R, \R)$ that $\real_{\act}(\lambda \triangleright \nu) \in C \lp \R^{\inn(\nu)}, \R^{\out(\nu)} \rp $ \item For all $\act \in C(\R,\R)$, and $x \in \R^{\inn(\nu)}$ that:
\begin{align} \lp \real_{\act} \lp \lambda \triangleright \nu \rp \rp \lp x \rp = \lambda \lp \real_{\act}(\nu) \rp \lp x \rp \end{align}
\end{enumerate}
\end{lemma}
\begin{proof} Let $\nu \in \neu$ be such that $\lay(\nu) = \lp l_0, l_1,...,l_L \rp$ and $\dep(\nu) = L$ where $l_0,l_1,...,l_L,L \in \N$. Then Item (i) of Lemma \ref{5.3.2} tells us that:
\begin{align} \lay \lp \aff_{\lambda \mathbb{I}_{\out(\nu)},0}\rp = \lp \out(\nu), \out(\nu) \rp \end{align}
This and Item (i) from Lemma \ref{5.3.3} gives us that:
\begin{align} \lay \lp \lambda \triangleright \nu \rp = \lay \lp \aff_{\lambda \mathbb{I}_{\out(\nu)},0} \bullet \nu \rp = \lp l_0, l_1,...,l_{L-1}, \out(\nu) \rp = \lay(\nu) \end{align}
Which proves (i). Items (ii)\textemdash(iii) of Lemma \ref{5.3.2} then prove that for all $\act \in C(\R,\R)$, $x \in \R^{\inn(\nu)}$, it is the case that $\real_{\act} \lp \lambda \triangleright \nu \rp \in C \lp \R^{\inn(\nu)},\R^{\out(\nu)} \rp$ with:
\begin{align} \lp \real_{\act} \lp \lambda \triangleright \nu \rp \rp \lp x \rp &= \lp \real_{\act} \lp \aff_{\lambda \mathbb{I}_{\out(\nu)},0} \bullet \nu \rp \rp \lp x \rp \nonumber\\ &= \lambda \mathbb{I}_{\out(\nu)} \lp \lp \real_{\act} \lp \nu \rp \rp \lp x \rp \rp = \lambda \lp \lp \real_{\act} \lp \nu \rp \rp \lp x \rp \rp \end{align}
This establishes Items (ii)\textemdash(iii), completing the proof. \end{proof}
\begin{lemma}\label{5.6.4} Let $\lambda \in \R$ and $\nu \in \neu$. It is then the case that:
\begin{enumerate}[label = (\roman*)] \item $\lay(\nu \triangleleft \lambda) = \lay(\nu)$ \item For all $\act \in C \lp \R, \R \rp$ that $\real_{\act}(\nu \triangleleft \lambda) \in C \lp \R^{\inn(\nu)}, \R^{\out(\nu)} \rp$ \item For all $\act \in C \lp \R, \R \rp$, and $x \in \R^{\inn(\nu)}$ that:
\begin{align} \lp \real_{\act} \lp \nu \triangleleft \lambda \rp \rp \lp x \rp = \lp \real_{\act}(\nu) \rp \lp \lambda x \rp \end{align}
\end{enumerate}
\end{lemma}
\begin{proof} Let $\nu \in \neu$ be such that $\lay(\nu) = \lp l_0, l_1,...,l_L \rp$ and $\dep(\nu) = L$ where $l_0,l_1,...,l_L, L \in \N$. Then Item (i) of Lemma \ref{5.3.2} tells us that:
\begin{align} \lay \lp \aff_{\lambda \mathbb{I}_{\inn(\nu)},0} \rp = \lp \inn(\nu), \inn(\nu) \rp \end{align}
This and Item (iv) of Lemma \ref{5.3.3} tells us that:
\begin{align} \lay(\nu \triangleleft\lambda) = \lay \lp \nu \bullet \aff_{\lambda \mathbb{I}_{\inn(\nu)},0}\rp = \lp \inn(\nu), l_1,l_2,...,l_L \rp = \lay(\nu) \end{align}
Which proves $(i)$. Items (v)\textemdash(vi) of Lemma \ref{5.3.3} then prove that for all $\act \in C(\R,\R)$, $x \in \R^{\inn(\nu)}$, it is the case that $\real_{\act} \lp \nu \triangleleft \lambda \rp \in C\lp \R^{\inn(\nu)},\R^{\out(\nu)} \rp$ with:
\begin{align} \lp \real_{\act} \lp \nu \triangleleft \lambda \rp \rp \lp x \rp &= \lp \real_{\act} \lp \nu \bullet \aff_{\lambda \mathbb{I}_{\inn(\nu)},0} \rp \rp \lp x \rp \nonumber\\ &= \lp \real_{\act} \lp \nu \rp \rp \lp \lp \real_{\act} \lp \aff_{\lambda \mathbb{I}_{\inn(\nu)},0} \rp \rp \lp x \rp \rp \nonumber\\ &= \lp \real_{\act} \lp \nu \rp \rp \lp \lambda x \rp \end{align}
This completes the proof. \end{proof}
\begin{lemma}\label{scalar_right_mult_distribution} Let $\nu,\mu \in \neu$ have the same depth and the same end-widths, and let $\lambda \in \R$. It is then the case, for all $\act \in C \lp \R, \R \rp$ and $x \in \R^{\inn(\nu)}$, that:
\begin{align} \real_{\act} \lp \lp \nu \oplus \mu \rp \triangleleft \lambda \rp \lp x \rp &= \real_{\act} \lp \lp \nu \triangleleft \lambda \rp \oplus \lp \mu \triangleleft \lambda \rp \rp \lp x \rp \nonumber\\ &= \lp \real_{\act}\lp \nu \rp \rp \lp \lambda x \rp + \lp \real_{\act} \lp \mu \rp \rp \lp \lambda x \rp \nonumber \end{align}
\end{lemma}
\begin{proof} Let $\nu = \lp \lp W_1,b_1 \rp, \lp W_2,b_2 \rp,...,\lp W_L,b_L \rp \rp$ and $\mu = \lp \lp W'_1,b'_1 \rp, \lp W'_2,b'_2 \rp,...,\lp W'_L,b'_L \rp \rp$. Then from Lemma \ref{5.6.4} and (\ref{5.4.12}) we have that:
\begin{align} &\lp \real_{\act}\lp \lp \nu \oplus \mu \rp \triangleleft \lambda \rp \rp \lp x \rp \nonumber\\ &= \lp \real_{\act} \lp \nu \oplus \mu \rp \rp \lp \lambda x \rp \nonumber\\ &= \begin{bmatrix} W_L \quad W'_L \end{bmatrix}\begin{bmatrix} \act \lp W_{L-1}(...\act (W_2 \act \lp W_1\lambda x+b_1 \rp + b_2) + ... )+ b_{L-1}\rp \\ \act \lp W'_{L-1}(...\act (W'_2 \act \lp W'_1\lambda x+b'_1 \rp + b'_2) + ... )+ b'_{L-1}\rp \end{bmatrix} + b_L+b'_L \nonumber \end{align}
Note that:
\begin{align} \lp \real_{\act} \lp \nu \rp \rp \lp \lambda x \rp = W_L \act \lp W_{L-1}(...\act (W_2 \act \lp W_1\lambda x+b_1 \rp + b_2) + ... )+ b_{L-1}\rp + b_L \end{align}
and that:
\begin{align} \lp \real_{\act} \lp \mu \rp \rp \lp \lambda x \rp = W'_L\act \lp W'_{L-1}(...\act (W'_2 \act \lp W'_1\lambda x+b'_1 \rp + b'_2) + ... )+ b'_{L-1}\rp + b'_L \end{align}
This, together with Lemma \ref{5.5.11}, completes the proof. \end{proof}
\begin{lemma}\label{scalar_left_mult_distribution} Let $\nu,\mu \in \neu$ have the same depth and the same end-widths, and let $\lambda \in \R$. It is then the case, for all $\act \in C \lp \R, \R \rp$ and $x \in \R^{\inn(\nu)}$, that:
\begin{align} \real_{\act} \lp \lambda \triangleright\lp \nu \oplus \mu \rp \rp \lp x \rp &= \real_{\act} \lp \lp \lambda \triangleright\nu \rp \oplus \lp \lambda \triangleright\mu \rp \rp \lp x \rp \nonumber\\ &= \lambda \cdot \lp \real_{\act}\lp \nu \rp \rp \lp x \rp + \lambda \cdot \lp \real_{\act} \lp \mu \rp \rp \lp x \rp \nonumber \end{align}
\end{lemma}
\begin{proof} Let $\nu = \lp \lp W_1,b_1 \rp, \lp W_2,b_2 \rp,...,\lp W_L,b_L \rp \rp$ and $\mu = \lp \lp W'_1,b'_1 \rp, \lp W'_2,b'_2 \rp,...,\lp W'_L,b'_L \rp \rp$. Then from Lemma \ref{5.6.3} and (\ref{5.4.12}) we have that:
\begin{align} & \real_{\act}\lp \lambda \triangleright \lp \nu \oplus \mu \rp \rp \lp x \rp \nonumber\\ &= \lambda \cdot \lp \real_{\act} \lp \nu \oplus \mu \rp\rp \lp x \rp \nonumber\\ &= \lambda \cdot \begin{bmatrix} W_L \quad W'_L \end{bmatrix}\begin{bmatrix} \act \lp W_{L-1}(...\act (W_2 \act \lp W_1 x+b_1 \rp + b_2) + ... )+ b_{L-1}\rp \\ \act \lp W'_{L-1}(...\act (W'_2 \act \lp W'_1x+b'_1 \rp + b'_2) + ... )+ b'_{L-1}\rp \end{bmatrix} + \lambda \lp b_L+b'_L \rp \nonumber \end{align}
Note that:
\begin{align} \lambda\cdot\lp \real_{\act} \lp \nu \rp \rp \lp x \rp = \lambda \lp W_L \act \lp W_{L-1}(...\act (W_2 \act \lp W_1x+b_1 \rp + b_2) + ... )+ b_{L-1}\rp + b_L \rp \end{align}
and that:
\begin{align} \lambda \cdot \lp \real_{\act} \lp \mu \rp \rp \lp x \rp = \lambda \lp W'_L\act \lp W'_{L-1}(...\act (W'_2 \act \lp W'_1 x+b'_1 \rp + b'_2) + ... )+ b'_{L-1}\rp + b'_L \rp \end{align}
This, together with Lemma \ref{5.5.11}, completes the proof. \end{proof}
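\begin{remark}
The constructions in the next two lemmas, linear combinations of networks with affinely transformed inputs, can again be sketched concretely. The following standalone \texttt{R} snippet (hypothetical helper names, same list-of-layers representation as in the earlier sketches) builds $\mu = \oplus_{i}\, c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I},B_i} \rp$ for a handful of same-architecture networks, absorbing the input shift $B_i$ into the first-layer bias, and checks numerically that its instantiation is $\sum_i c_i \lp \real_{\act}\lp \nu_i \rp \rp \lp x + B_i \rp$.
\begin{verbatim}
# Sketch only; hypothetical helpers, not the Listings in the appendix.
random_nn <- function(layout) {
  L <- length(layout) - 1
  lapply(seq_len(L), function(k)
    list(W = matrix(rnorm(layout[k + 1] * layout[k]), layout[k + 1], layout[k]),
         b = rnorm(layout[k + 1])))
}
inst <- function(nu, x, act = function(z) pmax(z, 0)) {
  L <- length(nu)
  for (k in seq_len(L - 1)) x <- act(nu[[k]]$W %*% x + nu[[k]]$b)
  drop(nu[[L]]$W %*% x + nu[[L]]$b)
}
nn_sum <- function(nu, mu) {            # oplus, equal depth and end-widths
  L <- length(nu); out <- vector("list", L)
  out[[1]] <- list(W = rbind(nu[[1]]$W, mu[[1]]$W), b = c(nu[[1]]$b, mu[[1]]$b))
  if (L > 2) for (k in 2:(L - 1)) out[[k]] <- list(
    W = rbind(cbind(nu[[k]]$W, matrix(0, nrow(nu[[k]]$W), ncol(mu[[k]]$W))),
              cbind(matrix(0, nrow(mu[[k]]$W), ncol(nu[[k]]$W)), mu[[k]]$W)),
    b = c(nu[[k]]$b, mu[[k]]$b))
  out[[L]] <- list(W = cbind(nu[[L]]$W, mu[[L]]$W), b = nu[[L]]$b + mu[[L]]$b)
  out
}
scalar_left <- function(lambda, nu) {   # lambda |> nu: rescale the final layer
  L <- length(nu)
  nu[[L]]$W <- lambda * nu[[L]]$W; nu[[L]]$b <- lambda * nu[[L]]$b
  nu
}
shift_input <- function(nu, B) {        # nu . Aff_{I, B}: absorb B into layer 1
  nu[[1]]$b <- drop(nu[[1]]$W %*% B) + nu[[1]]$b
  nu
}

set.seed(3)
layout <- c(3, 5, 5, 2); n <- 4
nus <- lapply(seq_len(n), function(i) random_nn(layout))
cs  <- rnorm(n); Bs <- lapply(seq_len(n), function(i) rnorm(3)); x <- rnorm(3)

mu <- Reduce(nn_sum, lapply(seq_len(n), function(i)
  scalar_left(cs[i], shift_input(nus[[i]], Bs[[i]]))))

lhs <- inst(mu, x)
rhs <- Reduce(`+`, lapply(seq_len(n), function(i)
  cs[i] * inst(nus[[i]], x + Bs[[i]])))
all.equal(lhs, rhs)
\end{verbatim}
\end{remark}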
\begin{lemma}\label{5.6.5} Let $u,v \in \Z$ with $u \leqslant v$ and $n = v-u+1$. Let $c_u,c_{u+1},..., c_v \in \R$. Let $\nu_u, \nu_{u+1},...,\nu_v, \mu \in \neu$ and $B_{u}, B_{u+1},...,B_v \in \R^{\inn(\nu_u)}$ satisfy that $\lay (\nu_u) = \lay(\nu_{u+1}) = ...= \lay(\nu_v)$ and further that:
\begin{align} \mu = \lb \oplus^v_{i=u} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rb \end{align}
It then holds:
\begin{enumerate}[label = (\roman*)] \item That:
\begin{align} \lay (\mu) &= \lp \inn(\nu_u), \sum^v_{i=u} \wid_1 \lp \nu_u \rp , \sum^v_{i=u} \wid_2 \lp \nu_u \rp,..., \sum^v_{i=u} \wid_{\dep(\nu_u)-1} \lp \nu_u \rp , \out(\nu_u) \rp \nonumber\\ &= \lp \inn(\nu_u), n\wid_1(\nu_u), n\wid_2(\nu_u),...,n\wid_{\dep(\nu_u)-1}(\nu_u), \out(\nu_u) \rp \nonumber \end{align}
\item that for all $\act \in C \lp \R ,\R \rp$, it is the case that $\real_{\act} (\mu) \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $, and \item for all $\act \in C \lp \R, \R \rp $ and $x \in \R^{\inn(\nu_u)}$ that:
\begin{align} \lp \real_{\act} \lp \mu \rp \rp \lp x \rp = \sum^v_{i=u} c_i \lp \real_{\act} \lp \nu_i \rp \rp \lp x + B_i \rp \end{align}
\end{enumerate}
\end{lemma}
\begin{proof} Assume the hypothesis that $\lay(\nu_u) = \lay(\nu_{u+1}) = ... = \lay(\nu_v)$. Note that Item (i) of Lemma \ref{5.3.2} gives us for all $i \in \{u,u+1,...,v\}$ that:
\begin{align} \lay \lp \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp = \lay \lp \aff_{\mathbb{I}_{\inn(\nu_u)},B_u} \rp = \lp \inn \lp \nu_u \rp, \inn \lp \nu_u \rp \rp \in \N^2 \end{align}
This together with Lemma \ref{comp_prop}, Item (i), assures us that for all $i \in \{ u,u+1,...,v\}$ it is the case that:
\begin{align}\label{5.3.15} \lay\lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp = \lp \inn(\nu_u), \wid_1 \lp \nu_u \rp, \wid_2 \lp \nu_u \rp,..., \wid_{\dep(\nu_u)} \lp \nu_u \rp \rp \end{align}
This and \cite[Lemma~3.14, Item~(i)]{Grohs_2022} tells us that for all $i \in \{u,u+1,...,v\}$ it is the case that:
\begin{align}\label{5.6.13} \lay \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp \rp = \lay \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \end{align}
This, (\ref{5.3.15}), and \cite[Lemma~3.28, Item~(ii)]{Grohs_2022} then yield that:
\begin{align} \lay(\mu) &= \lay \lp \oplus^v_{i=u} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \nonumber\\ &= \lp \inn(\nu_u), \sum^v_{i=u} \wid_1 \lp \nu_u \rp,\sum^v_{i=u} \wid_2 \lp \nu_u \rp,..., \sum^v_{i=u} \wid_{\dep(\nu_u)-1} \lp \nu_u \rp , \out \lp \nu_u \rp \rp \nonumber \\ &= \lp \inn(\nu_u), n\wid_1(\nu_u), n\wid_2 ( \nu_u),...,n\wid_{\dep(\nu_u)-1}(\nu_u), \out(\nu_u) \rp \end{align}
This establishes Item (i).
Items (v) and (vi) from Lemma \ref{5.3.3} tell us that for all $i \in \{ u,u+1,...,v\}$, $\act \in C(\R,\R)$, $x \in \R^{\inn(\nu_u)}$, it is the case that $\real_{\act}\lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $ and further that:
\begin{align} \lp \real_{\act} \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \lp x \rp = \lp \real_{\act} \lp \nu_i \rp \rp \lp x + B_i \rp \end{align}
This along with \cite[Lemma~3.14]{Grohs_2022} ensures that for all $i \in \{u,u+1,...,v\}$, $\act \in C \lp \R, \R \rp$, $x \in \R^{\inn(\nu_u)}$, it is the case that:
\begin{align} \real_{\act} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp \end{align}
and:
\begin{align} \lp \real_{\act} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \lp x \rp = c_i \lp \real_{\act} \lp \nu_i \rp \rp \lp x + B_i \rp \end{align}
Now observe that \cite[Lemma~3.28]{Grohs_2022} and (\ref{5.6.13}) ensure that for all $\act \in C \lp \R, \R \rp$, $x \in \R^{\inn(\nu_u)}$, it is the case that $\real_{\act} \lp \mu \rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp$ and that:
\begin{align} \lp \real_{\act} \lp \mu \rp \rp \lp x \rp &= \lp \real_{\act} \lp \oplus^v_{i=u} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \rp \lp x \rp \nonumber\\ &= \sum^v_{i=u} \lp \real_{\act} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \lp x \rp \nonumber\\ &=\sum^v_{i=u} c_i \lp \real_{\act} \lp \nu_i \rp \rp \lp x+B_i \rp \nonumber \end{align}
This establishes Items (ii)--(iii); thus, the proof is complete. \end{proof}
\begin{lemma}\label{5.6.6} Let $u,v \in \Z$ with $u \leqslant v$ and $n = v-u+1$. Let $c_u,c_{u+1},..., c_v \in \R$. Let $\nu_u, \nu_{u+1},...,\nu_v, \mu \in \neu$ and $B_{u}, B_{u+1},...,B_v \in \R^{\inn(\nu_u)}$ satisfy that $\lay (\nu_u) = \lay(\nu_{u+1}) = ...= \lay(\nu_v)$ and further that:
\begin{align} \mu = \lb \oplus^v_{i=u} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i \rp \rb \end{align}
It then holds:
\begin{enumerate}[label = (\roman*)] \item That:
\begin{align} \lay (\mu) &= \lp \inn(\nu_u), \sum^v_{i=u} \wid_1 \lp \nu_u \rp , \sum^v_{i=u} \wid_2 \lp \nu_u \rp,..., \sum^v_{i=u} \wid_{\dep(\nu_u)-1} \lp \nu_u \rp , \out(\nu_u) \rp \nonumber\\ &= \lp \inn(\nu_u), n\wid_1(\nu_u), n\wid_2(\nu_u),...,n\wid_{\dep(\nu_u)-1}(\nu_u), \out(\nu_u) \rp \end{align}
\item that for all $\act \in C \lp \R ,\R \rp$, it is the case that $\real_{\act} (\mu) \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $, and \item for all $\act \in C \lp \R, \R \rp $ and $x \in \R^{\inn(\nu_u)}$ that:
\begin{align} \lp \real_{\act} \lp \mu \rp \rp \lp x \rp = \sum^v_{i=u} \lp \real_{\act} \lp \nu_i \rp \rp \lp c_ix + B_i \rp \end{align}
\end{enumerate}
\end{lemma}
\begin{proof} Assume the hypothesis that $\lay(\nu_u) = \lay(\nu_{u+1}) = ... = \lay(\nu_v)$.
Note that Item (i) of Lemma \ref{5.3.2} gives us for all $i \in \{u,u+1,...,v\}$ that:
\begin{align} \lay \lp \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp = \lay \lp \aff_{\mathbb{I}_{\inn(\nu_u)},B_u} \rp = \lp \inn \lp \nu_u \rp, \inn \lp \nu_u \rp \rp \in \N^2 \end{align}
Note then that Lemma \ref{comp_prop}, Item (i), tells us that for all $i \in \{u,u+1,...,v\}$ it is the case that:
\begin{align}\label{5.6.22} \lay\lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp = \lp \inn(\nu_u), \wid_1 \lp \nu_u \rp, \wid_2 \lp \nu_u \rp,..., \wid_{\dep(\nu_u)} \lp \nu_u \rp \rp \end{align}
This and Item (i) of Lemma \ref{5.6.4} tells us that for all $i \in \{u,u+1,...,v\}$ it is the case that:
\begin{align}\label{5.6.23} \lay \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp \triangleleft c_i \rp = \lay \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \end{align}
This, (\ref{5.6.22}), and \cite[Lemma~3.28, Item~(ii)]{Grohs_2022} tell us that:
\begin{align} \lay(\mu) &= \lay \lp \oplus^v_{i=u} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i\rp \rp \nonumber\\ &= \lp \inn(\nu_u), \sum^v_{i=u} \wid_1 \lp \nu_u \rp,\sum^v_{i=u} \wid_2 \lp \nu_u \rp,..., \sum^v_{i=u} \wid_{\dep(\nu_u)-1} \lp \nu_u \rp , \out \lp \nu_u \rp \rp \nonumber \\ &= \lp \inn(\nu_u), n\wid_1(\nu_u), n\wid_2 ( \nu_u),...,n\wid_{\dep(\nu_u)-1}(\nu_u), \out(\nu_u) \rp \end{align}
This establishes Item (i). Items (v) and (vi) from Lemma \ref{5.3.3} tell us that for all $i \in \{ u,u+1,...,v\}$, $\act \in C(\R,\R)$, $x \in \R^{\inn(\nu_u)}$, it is the case that $\real_{\act}\lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $ and further that:
\begin{align} \lp \real_{\act} \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \lp x \rp = \lp \real_{\act} \lp \nu_i \rp \rp \lp x + B_i \rp \end{align}
This along with Lemma \ref{5.6.4} ensures that for all $i \in \{u,u+1,...,v\}$, $\act \in C \lp \R, \R \rp$, $x \in \R^{\inn(\nu_u)}$, it is the case that:
\begin{align} \real_{\act} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i\rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp \end{align}
and:
\begin{align} \lp \real_{\act} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i \rp \rp \lp x \rp = \lp \real_{\act} \lp \nu_i \rp \rp \lp c_i x + B_i \rp \end{align}
Now observe that \cite[Lemma~3.28]{Grohs_2022} and (\ref{5.6.23}) ensure that for all $\act \in C \lp \R, \R \rp$, $x \in \R^{\inn(\nu_u)}$, it is the case that $\real_{\act} \lp \mu \rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp$ and that:
\begin{align} \lp \real_{\act} \lp \mu \rp \rp \lp x \rp &= \lp \real_{\act} \lp \oplus^v_{i=u} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i \rp \rp \rp \lp x \rp \nonumber\\ &= \sum^v_{i=u} \lp \real_{\act} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i \rp \rp \lp x \rp \nonumber\\ &=\sum^v_{i=u} \lp \real_{\act} \lp \nu_i \rp \rp \lp c_i x+B_i \rp \nonumber \end{align}
This establishes Items (ii)\textemdash(iii); thus, the proof is complete. \end{proof}
\begin{lemma}\label{5.6.9} Let $L \in \N$, $u,v \in \Z$ with $u\leqslant v$, and $c_u, c_{u+1},...,c_v \in \R$. Let $\nu_u, \nu_{u+1},...,\nu_v, \mu, \mathfrak{I} \in \neu$, $B_u, B_{u+1},...,B_v \in \R^{\inn(\nu_u)}$, and $\act \in C\lp \R, \R \rp$ satisfy for all $j \in \N \cap [u,v]$ that $L = \max_{i\in \N \cap \lb u,v \rb} \dep(\nu_i)$, $\inn(\nu_j) = \inn(\nu_u)$, $\out(\nu_j) = \inn(\mathfrak{I})= \out(\mathfrak{I})$, $\hid(\mathfrak{I}) = 1$, $\real_{\act} (\mathfrak{I}) = \mathbb{I}_\R$, and that:
\begin{align} \mu = \dplus^v_{i = u, \mathfrak{I}} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \end{align}
We then have that:
\begin{enumerate}[label = (\roman*)] \item it holds that $\real_{\act}(\mu) \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $, and that, \item it holds for all $ x \in \R^{\inn(\nu_u)}$ that:
\begin{align} \lp \real_{\act} \lp \mu \rp \rp \lp x \rp = \sum^v_{i=u} c_i \lp \real_{\act} \lp \nu_i \rp \rp \lp x + B_i\rp \end{align}
\end{enumerate}
\end{lemma}
\begin{proof} Note that Item (i) of Lemma \ref{5.6.5} gives the layer architecture of the summands. In addition, Items (v)\textemdash(vi) from Lemma \ref{5.3.3} tell us that for all $i \in \N \cap [u,v]$, $x \in \R^{\inn(\nu_u)}$, it holds that $\real_{\act} \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp $ and further that:
\begin{align} \lp \real_{\act} \lp \nu_i\bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \lp x \rp = \lp \real_{\act} \lp \nu_i \rp \rp \lp x + B_i \rp \end{align}
This, Lemma \ref{5.6.3}, and \cite[Lemma~2.14, Item~(ii)]{grohs2019spacetime} show that for all $i \in \N \cap [u,v]$, $x \in \R^{\inn(\nu_u)}$, it holds that:
\begin{align} \real_{\act} \lp \ex_{L,\mathfrak{I}} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp = \real_{\act}\lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \in C\lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp \end{align}
and:
\begin{align} \lp \real_{\act} \lp \ex_{L,\mathfrak{I}} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \rp \lp x \rp &= \lp \real_{\act} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \lp x \rp \nonumber\\ &= c_i \lp \real_{\act} \lp \nu_i \rp \rp \lp x+B_i \rp \end{align}
This combined with \cite[Lemma~3.28]{Grohs_2022} and (\ref{5.6.13}) demonstrates that for all $x \in \R^{\inn(\nu_u)}$ it holds that $\real_{\act}\lp \mu \rp \in C\lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $ and that:
\begin{align} \lp \real_{\act}\lp \mu \rp \rp \lp x \rp &= \lp \real_{\act} \lp \boxplus^v_{i = u, \mathfrak{I}} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \rp \lp x \rp \nonumber\\ &= \lp \real_{\act} \lp \oplus^v_{i=u} \ex_{L,\mathfrak{I}} \lp c_i \triangleright \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \rp \rp\lp x \rp \nonumber \\ &= \sum^v_{i=u} c_i \lp \real_{\act} \lp \nu_i \rp \rp \lp x+B_i \rp \end{align}
This establishes Items (i)--(ii), thus proving the lemma. \end{proof}
\begin{lemma} Let $L \in \N$, $u,v \in \Z$ with $u\leqslant v$, and $c_u, c_{u+1},...,c_v \in \R$. Let $\nu_u, \nu_{u+1},...,\nu_v, \mu, \mathfrak{I} \in \neu$, $B_u, B_{u+1},...,B_v \in \R^{\inn(\nu_u)}$, and $\act \in C\lp \R, \R \rp$ satisfy for all $j \in \N \cap [u,v]$ that $L = \max_{i\in \N \cap \lb u,v \rb} \dep(\nu_i)$, $\inn(\nu_j) = \inn(\nu_u)$, $\out(\nu_j) = \inn(\mathfrak{I})= \out(\mathfrak{I})$, $\hid(\mathfrak{I}) = 1$, $\real_{\act} (\mathfrak{I}) = \mathbb{I}_\R$, and that:
\begin{align} \mu = \boxplus^v_{i = u, \mathfrak{I}} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i \rp \end{align}
We then have:
\begin{enumerate}[label = (\roman*)] \item it holds that:
\begin{align} \lay(\mu) = \lp \inn(\nu_u ), \sum^v_{i=u}\wid_1 \lp \ex_{L,\mathfrak{I}} \lp \nu_i \rp \rp ,\sum^v_{i=u}\wid_2 \lp \ex_{L,\mathfrak{I}} \lp \nu_i\rp\rp,...,\sum^v_{i=u} \wid_{L-1} \lp \ex_{L,\mathfrak{I}} \lp \nu_i \rp \rp , \out \lp \nu_u \rp \rp \end{align}
\item it holds that $\real_{\act}(\mu) \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $, and that, \item it holds for all $ x \in \R^{\inn(\nu_u)}$ that:
\begin{align} \lp \real_{\act} \lp \mu \rp \rp \lp x \rp = \sum^v_{i=u} \lp \real_{\act} \lp \nu_i \rp \rp \lp c_ix + B_i\rp \end{align}
\end{enumerate}
\end{lemma}
\begin{proof} Note that Item (i) of Lemma \ref{5.6.6} establishes Item (i). In addition, Items (v) and (vi) from Lemma \ref{5.3.3} tell us that for all $i \in \N \cap [u,v]$, $x \in \R^{\inn(\nu_u)}$, it holds that $\real_{\act} \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)}, B_i} \rp \in C \lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)}\rp $ and further that:
\begin{align} \lp \real_{\act} \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \rp \lp x \rp = \lp \real_{\act} \lp \nu_i \rp \rp \lp x + B_i \rp \end{align}
This, Lemma \ref{5.6.4}, and \cite[Lemma~2.14, Item~(ii)]{grohs2019spacetime} show that for all $i \in \N \cap [u,v]$, $x \in \R^{\inn(\nu_u)}$, it holds that:
\begin{align} \real_{\act} \lp \ex_{L,\mathfrak{I}} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i \rp \rp = \real_{\act}\lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i\rp \in C\lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp \end{align}
and:
\begin{align} \lp \real_{\act} \lp \ex_{L,\mathfrak{I}} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i\rp \rp \rp \lp x \rp &= \lp \real_{\act} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i \rp \rp \lp x \rp \nonumber\\ &= \lp \real_{\act} \lp \nu_i \rp \rp \lp c_ix+B_i \rp \end{align}
This, \cite[Lemma~3.28]{Grohs_2022}, and (\ref{5.6.23}) demonstrate that for all $x \in \R^{\inn(\nu_u)}$ it holds that $\real_{\act}\lp \mu \rp \in C\lp \R^{\inn(\nu_u)}, \R^{\out(\nu_u)} \rp $ and that:
\begin{align} \lp \real_{\act}\lp \mu \rp \rp \lp x \rp &= \lp \real_{\act} \lp \boxplus^v_{i = u, \mathfrak{I}} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i\rp \rp \rp \lp x \rp \nonumber\\ &= \lp \real_{\act} \lp \oplus^v_{i=u} \ex_{L,\mathfrak{I}} \lp \lp \nu_i \bullet \aff_{\mathbb{I}_{\inn(\nu_i)},B_i} \rp \triangleleft c_i\rp \rp \rp\lp x \rp \nonumber \\ &= \sum^v_{i=u} \lp \real_{\act} \lp \nu_i \rp \rp \lp c_ix+B_i \rp \end{align}
This completes the proof. \end{proof}
\section{Neural Network Diagrams}
Conceptually, it will be helpful to construct what are called ``neural network diagrams''.
They take inspiration from diagrams typically seen in the literature, for instance, \cite{vaswani_attention_2017}, \cite{arik_tabnet_2021}, and \cite{8099678}. They are constructed as follows. Lines with arrows indicate the flow of data:
\begin{center}
\begin{tikzcd} {} \arrow[rr, "x"] & & {} \\ {} & & {} \arrow[ll, "x"] \end{tikzcd}
\end{center}
Named neural networks are always enclosed in boxes, with their names typeset in a \textsf{sans-serif} font:
\begin{center}
\begin{tikzpicture}
% Create a rectangular node with text inside
\node[draw, rectangle] at (0, 0) {$\aff_{a,b}$};
\end{tikzpicture}
\end{center}
Where possible, we seek to label the arrows going in and going out of a boxed neural network with the appropriate operations that take place:
\begin{center}
\begin{tikzpicture}
% Create a rectangular node with text inside
\node[draw, rectangle] (box) at (0, 0) {$\aff_{a,b}$};
% Draw an arrow from left to right going into the box
\draw[<-] (-2, 0) -- (box.west) node[midway, above] {$ax+b$};
% Draw an arrow from left to right going out of the box
\draw[<-] (box.east) -- (2, 0) node[midway, above] {$x$};
\end{tikzpicture}
\end{center}
It is often more helpful to draw the arrows from right to left, as above. Stacked neural networks are drawn in adjacent boxes:
\begin{center}
\begin{tikzpicture}
% Create the top box with text inside
\node[draw, rectangle] (topbox) at (0, 1) {$\aff_{a,b}$};
% Create the bottom box with text inside
\node[draw, rectangle] (bottombox) at (0, -1) {$\aff_{c,d}$};
% Draw an arrow from left to right going into the top box
\draw[<-] (-2, 1) -- (topbox.west) node[midway, above] {$ax+b$};
% Draw an arrow from left to right going out of the top box
\draw[<-] (topbox.east) -- (2, 1) node[midway, above] {$x$};
% Draw an arrow from left to right going into the bottom box
\draw[<-] (-2, -1) -- (bottombox.west) node[midway, below] {$cx+d$};
% Draw an arrow from left to right going out of the bottom box
\draw[<-] (bottombox.east) -- (2, -1) node[midway, below] {$x$};
\end{tikzpicture}
\end{center}
For neural networks that take in two inputs and give out one output, we use two arrows going in and one arrow going out:
\begin{center}
\begin{tikzpicture}
% Create the rectangular node with text inside
\node[draw, rectangle] (box) at (0, 0) {$\sm_{2,1}$};
% Draw arrow hitting the top right corner of the box
\draw[->] (2, 1) -- (box.north east) node[midway, above right] {$x$};
% Draw arrow hitting the bottom right corner of the box
\draw[->] (2, -1) -- (box.south east) node[midway, below right] {$y$};
% Draw an arrow going out to the left
\draw[->] (box.west) -- (-2, 0) node[midway, above] {$x+y$};
\end{tikzpicture}
\end{center}
For neural networks that take in one input and give out two outputs, we use one arrow going in and two arrows going out:
\begin{center}
\begin{tikzpicture}
% Create the rectangular node with text inside
\node[draw, rectangle] (box) at (0, 0) {$\cpy_{1,2}$};
% Draw arrow hitting the top right corner of the box
\draw[->] (box.north west) -- (-2,1) node[midway, above right] {$x$};
% Draw arrow hitting the bottom right corner of the box
\draw[->] (box.south west) -- (-2,-1) node[midway, below right] {$x$};
% Draw an arrow going out to the left
\draw[->] (2,0) -- (box.east) node[midway, above] {$x$};
\end{tikzpicture}
\end{center}
Thus, taking this all together, the sum of the neural networks $\aff_{a,b},\aff_{c,d} \in \neu$ is given by:
\begin{center}
\begin{tikzpicture}
% Define nodes
\node[draw, rectangle] (top) at (0, 2) {$\aff_{a,b}$};
\node[draw, rectangle] (right) at (2, 0) {$\cpy$};
\node[draw, rectangle] (bottom) at (0, -2) {$\aff_{c,d}$};
\node[draw, rectangle] (left) at (-2, 0) {$\sm$};
% Arrows with labels
\draw[->] (right) -- node[midway, above] {$x$} (top);
\draw[<-] (right) -- node[midway, above] {$x$} (4,0);
\draw[->] (right) -- node[midway, right] {$x$} (bottom);
\draw[->] (top) -- node[midway, left] {$ax+b$} (left);
\draw[->] (bottom) -- node[midway, left] {$cx+d$} (left);
\draw[->] (left) -- node[midway, above] {$ax+b+cx+d$} (-5.5,0);
\end{tikzpicture}
\end{center}
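As noted in the remark following Definition \ref{slm}, scalar multiplication is drawn as its own box, labelled $\lambda \triangleright$ or $\triangleleft\lambda$. For instance, one possible diagram for $\lambda \triangleright \nu = \aff_{\lambda \mathbb{I}_{\out(\nu)},0} \bullet \nu$, drawn in the same right-to-left convention as above, is:
\begin{center}
\begin{tikzpicture}
% The network nu receives the input x
\node[draw, rectangle] (nu) at (2, 0) {$\nu$};
% The scalar multiplication box rescales the output of nu
\node[draw, rectangle] (sc) at (-1.5, 0) {$\lambda \triangleright$};
\draw[<-] (nu.east) -- (4.5, 0) node[midway, above] {$x$};
\draw[->] (nu.west) -- (sc.east) node[midway, above] {$\real_{\act}(\nu)(x)$};
\draw[->] (sc.west) -- (-5, 0) node[midway, above] {$\lambda \real_{\act}(\nu)(x)$};
\end{tikzpicture}
\end{center}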