\chapter{Introduction.}
\section{Motivation}
Artificial neural networks represent a sea change in computing. They have been used successfully in a wide range of applications, from protein folding in \cite{tsaban_harnessing_2022} and knot theory in \cite{davies_signature_2021} to the extraction of data from gravitational waves in \cite{zhao_space-based_2023}.
\\~\\
As neural networks become more ubiquitous, the number of parameters required to train them keeps growing, which poses two problems: such models become inaccessible on low-power devices, and the energy needed to train them grows accordingly; see, for instance, \cite{wu2022sustainable} and \cite{strubell2019energy}. Parameter estimates thus become increasingly crucial in an increasingly climate-challenged world. Precise upper bounds on parameter counts tell us when training becomes wasteful, in some sense, and when, perhaps, different approaches may be needed.
\\~\\
Our goal in this dissertation is threefold:
\begin{enumerate}[label = (\roman*)]
\item Firstly, we will take the Multi-Level Picard (MLP) approximation method, first introduced in \cite{e_multilevel_2019} and \cite{e_multilevel_2021}, and in particular the version of Multi-Level Picard that appears in \cite{hutzenthaler_strong_2021}. We show that dropping the drift term and substantially simplifying the process still yields convergence of the method, polynomial bounds on the number of computations required, and rather nice properties of the approximations, such as integrability and measurability.
\item We will then go on to show that a modified version of the heat equation has a solution that can be represented via a stochastic differential equation by Feynman-Kac, and further that this solution can be approximated by the modified Multi-Level Picard technique of Item (i), under certain simplifying assumptions that follow from dropping the drift term. A substantial amount of this is inspired by \cite{bhj20} and much earlier work in \cite{karatzas1991brownian} and \cite{da_prato_zabczyk_2002}.
\item By far the most significant part of this dissertation is dedicated to expanding and building upon the framework of neural networks that appears in \cite{grohs2019spacetime}. We substantially modify this framework and introduce several new neural network architectures ($\tay, \pwr, \trp, \tun,\etr$, among others), and show, for all of these neural networks, that the parameter count grows only polynomially as the accuracy of our model increases, thus beating the curse of dimensionality. This finally paves the way for giving neural network approximations to the techniques realized in Item (ii). We show that it is not too wasteful (in the sense that parameter counts grow only polynomially) to use neural networks to approximate MLP, which in turn approximates a stochastic differential equation equivalent, via Feynman-Kac, to certain parabolic PDEs.
\end{enumerate}
We end this dissertation by proposing two avenues of further research: one analytical and one algebraic. The framework of understanding neural networks as ordered tuples of ordered pairs may be extended to give neural network approximations of classical PDE approximation techniques such as Runge-Kutta, Adams-Moulton, and Adams-Bashforth. We also propose three conjectures about neural networks as defined in \cite{grohs2019spacetime}, among them that they form a bimodule and that realization is a functor.
This dissertation is broken down into three parts. At the end of each part, we will encounter a tent-pole theorem, and these will eventually lead to the final neural network approximation outcome. The first two tent-pole theorems are Theorem \ref{tentpole_1} and Theorem \ref{thm:3.21}; the culmination of the three tent-pole theorems is the main theorem and end product of this dissertation.
\section{Notation, Definitions \& Basic notions.}
We introduce here basic notations that we will be using throughout this dissertation. Large parts are taken from standard literature, inspired by \textit{Matrix Computations} by Golub and Van Loan \cite{golub2013matrix}, and \textit{Probability: Theory \& Examples} by Rick Durrett \cite{durrett2019probability}.
\subsection{Norms and Inner Products}
\begin{definition}[Euclidean Norm]
Let $\left\|\cdot\right\|_E: \R^d \rightarrow [0,\infty)$ denote the Euclidean norm defined for every $d \in \N_0$ and for all $x= \{x_1,x_2,\cdots, x_d\}\in \R^d$ as:
\begin{align}
\| x\|_E = \lp \sum_{i=1}^d x_i^2 \rp^{\frac{1}{2}}
\end{align}
For the particular case that $d=1$ and where it is clear from context, we will denote $\| \cdot \|_E$ as $|\cdot |$.
\end{definition}
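For instance, purely as an illustration, for $x = \left\{ 3,4 \right\} \in \R^2$ it is the case that $\left\| x \right\|_E = \lp 3^2 + 4^2 \rp^{\frac{1}{2}} = 5$.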
\begin{definition}[Max Norm]
Let $\left\| \cdot \right\|_{\infty}: \R^d \rightarrow [0,\infty )$ denote the max norm defined for every $d \in \N_0$ and for all $x = \left\{ x_1,x_2,\cdots,x_d \right\} \in \R^d$ as:
\begin{align}
\left\| x \right\|_{\infty} = \max_{i \in \{1,2,\cdots,d\}} \left\{\left| x_i \right| \right\}
\end{align}
We will denote the max norm $\left\|\cdot \right\|_{\max}: \R^{m\times n} \rightarrow \lb 0, \infty \rp$ defined for every $m,n \in \N$ and for all $A \in \R^{m\times n}$ as:
\begin{align}
\| A \|_{\max} \coloneqq \max_{\substack {i \in \{1,2,...,m\} \\ j \in \{1,2,...,n\}}} \left| \lb A\rb_{i,j}\right|
\end{align}
\end{definition}
\begin{definition}[Frobenius Norm]
Let $\|\cdot \|_F: \R^{m\times n} \rightarrow [0,\infty)$ denote the Frobenius norm defined for every $m,n \in \N$ and for all $A \in \R^{m\times n}$ as:
\begin{align}
\|A\|_F = \lp \sum^m_{i=1} \sum^n_{j=1} \lb A \rb^2_{i,j} \rp^{\frac{1}{2}}
\end{align}
\end{definition}
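To illustrate the two matrix norms introduced above, consider $A = \begin{bmatrix} 1 & -2 \\ 3 & 4 \end{bmatrix} \in \R^{2\times 2}$; then $\left\| A \right\|_{\max} = 4$ and $\left\| A \right\|_F = \lp 1 + 4 + 9 + 16 \rp^{\frac{1}{2}} = \sqrt{30}$.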
\begin{definition}[Euclidean Inner Product]
Let $\la \cdot, \cdot \ra: \R^d \times \R^d \rightarrow \R$ denote the Euclidean inner product defined for every $d \in \N$, for all $\R^d \ni x = \{x_1,x_2,...,x_d\}$, and for all $\R^d \ni y = \{y_1,y_2,..., y_d\}$ as:
\begin{align}
\la x, y \ra = \sum^d_{i=1} \lp x_i y_i \rp
\end{align}
\end{definition}
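For instance, $\la \left\{ 1,2,3 \right\}, \left\{ 4,5,6 \right\} \ra = 4 + 10 + 18 = 32$. Note in particular that for all $d \in \N$ and all $x \in \R^d$ it is the case that $\la x,x \ra = \left\| x \right\|^2_E$.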
\subsection{Probability Space and Brownian Motion}
\begin{definition}[Probability Space]
A probability space is a triple $\lp \Omega, \mathcal{F}, \mathbb{P} \rp$ where:
\begin{enumerate}[label = (\roman*)]
\item $\Omega$ is a set of outcomes called the \textbf{sample space}.
\item $\mathcal{F}$ is a set of events called the \textbf{event space}, where each event is a set of outcomes from the sample space. More specifically, it is a $\sigma$-algebra on the set $\Omega$.
\item A function $\mathbb{P}: \mathcal{F} \rightarrow [0,1]$ assigning each event in the \textbf{event space} a probability between $0$ and $1$. More specifically, $\mathbb{P}$ is a measure on $\lp \Omega, \mathcal{F} \rp$ with the caveat that the measure of the entire space is $1$, i.e., $\mathbb{P}(\Omega) = 1$.
\end{enumerate}
\end{definition}
\begin{definition}[Random Variable]
Let $(\Omega, \mathcal{F}, \mathbb{P})$ be a probability space, and let $d \in \N_0$. A random variable is a measurable function $\mathcal{X}: \Omega \rightarrow \R^d.$
\end{definition}
\begin{definition}[Expectation]
Given a probability space $\lp \Omega, \mathcal{F}, \mathbb{P} \rp$, the expected value of a random variable $X$, denoted $\E \lb X \rb$ is the Lebesgue integral given by:
\begin{align}
\E\lb X \rb=\int_\Omega X d\mathbb{P}
\end{align}
\end{definition}
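For instance, if $X$ takes only finitely many values $x_1, x_2, \hdots, x_n$ with $\mathbb{P} \lp X = x_i \rp = p_i$, the Lebesgue integral above reduces to the familiar sum $\E \lb X \rb = \sum^n_{i=1} x_i p_i$; for a fair coin flip taking the values $0$ and $1$ with probability $\frac{1}{2}$ each, this gives $\E \lb X \rb = \frac{1}{2}$.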
\begin{definition}[Stochastic Process]
A stochastic process is a family of random variables over a fixed probability space $(\Omega, \mathcal{F}, \mathbb{P})$, indexed over a set, usually $\lb 0, T\rb$ for some $T\in \lp 0,\infty\rp$.
\end{definition}
\begin{definition}[Stochastic Basis]
A stochastic basis is a tuple $\lp \Omega, \mathcal{F}, \mathbb{P}, \mathbb{F} \rp$ where:
\begin{enumerate}[label = (\roman*)]
\item $\lp \Omega, \mathcal{F}, \mathbb{P} \rp$ is a probability space equipped with a filtration $\mathbb{F}$ where,
\item $\mathbb{F}=(\mathcal{F}_i)_{i \in I}$ is a collection of sub-$\sigma$-algebras of $\mathcal{F}$, indexed by a totally ordered set $I$, that is non-decreasing under inclusion, i.e., for all $i,j \in I$ with $i \leqslant j$ it is the case that $\mathcal{F}_i \subseteq \mathcal{F}_j$.
\end{enumerate}
\end{definition}
\begin{definition}[Brownian Motion Over a Stochastic Basis]\label{def:brown_motion}
Given a stochastic basis $(\Omega, \mathcal{F}, \mathbb{P}, \mathbb{F})$ a standard $(\mathbb{F}_t)_{t\in [0,T]}$-Brownian motion $\mathcal{W}_t$ is a mapping $\mathcal{W}_t: [0,T] \times \Omega \rightarrow \R^d$ satisfying:
\begin{enumerate}[label = (\roman*)]
\item $\mathcal{W}_t$ is $\mathcal{F}_t$-measurable for all $t\in [0, T]$.
\item $\mathcal{W}_0 = 0$, $\mathbb{P}$-a.s.
\item $\mathcal{W}_t-\mathcal{W}_s \sim \norm\lp 0,t-s\rp$ whenever $0 \leqslant s < t \leqslant T$.
\item $\mathcal{W}_t-\mathcal{W}_s$ is independent of $\mathcal{F}_s$ whenever $s <t$.
\item The paths that $\mathcal{W}_t$ take are $\mathbb{P}$-a.s. continuous.
\end{enumerate}
\end{definition}
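As a purely illustrative aside, and not part of the formal development, sample paths consistent with Definition \ref{def:brown_motion} are commonly simulated on a uniform time grid by accumulating independent Gaussian increments. The short Python sketch below does exactly that; the horizon, grid size, dimension, and random seed are arbitrary placeholder choices.
\begin{verbatim}
import numpy as np

def simulate_brownian_path(T=1.0, n_steps=1000, d=1, seed=0):
    # Partition [0, T] into n_steps uniform sub-intervals of length dt.
    rng = np.random.default_rng(seed)
    dt = T / n_steps
    # Independent N(0, dt) increments in each of the d coordinates.
    increments = rng.normal(loc=0.0, scale=np.sqrt(dt), size=(n_steps, d))
    # W_0 = 0, and every later value accumulates the increments so far.
    path = np.vstack([np.zeros((1, d)), np.cumsum(increments, axis=0)])
    times = np.linspace(0.0, T, n_steps + 1)
    return times, path
\end{verbatim}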
\begin{definition}[$\lp \mathbb{F}_t \rp _{t\in [0,T]}$-adapted Stochastic Process]
Let $T \in (0,\infty)$. Let $(\Omega, \mathcal{F}, \mathbb{P}, \mathbb{F})$ be a filtered probability space with the filtration indexed over $[0,T]$. Let $(S,\Sigma)$ be a measurable space. Let $\mathcal{X}: [0,T] \times \Omega \rightarrow S$ be a stochastic process. We say that $\mathcal{X}$ is an $(\mathbb{F}_t)_{t\in [0,T]}$-adapted stochastic process if it is the case that $\mathcal{X}_t: \Omega \rightarrow S$ is $(\mathcal{F}_t, \Sigma)$ measurable for each $t \in [0,T]$.
\end{definition}
\begin{definition}[$(\mathbb{F}_t)_{t\in[0,T]}$-adapted stopping time] Let $T \in (0,\infty)$ and let $(\Omega, \mathcal{F}, \mathbb{P}, \mathbb{F})$ be a filtered probability space with the filtration indexed over $[0,T]$. A random variable $\tau: \Omega \rightarrow [0,T]$ is a stopping time if the stochastic process $\mathcal{X} = (\mathcal{X}_t)_{t\in [0,T]}$ defined as:
\begin{align}
\mathcal{X}_t := \begin{cases}
1 : t < \tau \\
0 : t \geqslant \tau
\end{cases}
\end{align}
is adapted to the filtration $\mathbb{F}:= (\mathcal{F}_t )_{t \in [0,T]}$.
\end{definition}
\begin{definition}[Strong Solution of Stochastic Differential Equation]\label{1.9}
Let $d,m \in \N$. Let $\mu: [0,T] \times \R^d \rightarrow \R^d$, $\sigma: [0,T] \times \R^d \rightarrow \R^{d \times m}$ be Borel-measurable. Let $(\Omega, \mathcal{F}, \mathbb{P}, (\mathbb{F}_t)_{t \in [0,T]})$ be a stochastic basis, and let $\mathcal{W}: [0,T] \times \Omega \rightarrow \R^m$ be a standard $(\mathbb{F}_t)_{t\in [0,T]}$-Brownian motion. For all $t \in [0, T]$, $x \in \R^d$, let $\mathcal{X}^{t,x} = (\mathcal{X}^{t,x}_s)_{s\in [t, T]}: [t,T] \times \Omega \rightarrow \R^d$ be an $(\mathbb{F}_s)_{s\in [t, T]}$-adapted stochastic process with continuous sample paths satisfying that for all $t \in [0, T]$ we have $\mathbb{P}$-a.s. that:
\begin{align}\label{1.5}
\mathcal{X}^{t,x}_t = \mathcal{X}_0 + \int^t_0 \mu(r, \mathcal{X}^{t,x}_r)dr + \int^t_0 \sigma(r, \mathcal{X}^{t,x}_r) d\mathcal{W}_r
\end{align}
\medskip
A strong solution to the stochastic differential equation (\ref{1.5}) on the stochastic basis $(\Omega, \mathcal{F}, \mathbb{P}, (\mathbb{F}_t)_{t \in [0,T]})$, with respect to the Brownian motion $\mathcal{W}$ and the initial condition $\mathcal{X}_0 = 0$, is a stochastic process $(\mathcal{X}_t)_{t\in[0,T]}$ satisfying that:
\begin{enumerate}[label = (\roman*)]
\item $\mathcal{X}_t$ is adapted to the filtration $(\mathbb{F}_t)_{t \in [0,T]}$.
\item $\mathbb{P}(\mathcal{X}_0 = 0) =1$.
\item for all $t \in [0,T]$ it is the case that $\mathbb{P} \lp \int^t_0 \lb \| \mu(r, \mathcal{X}^{t,x}_r) \|_E + \|\sigma(r, \mathcal{X}^{t,x}_r) \|_F \rb dr < \infty \rp =1$
\item it holds $\mathbb{P}$-a.s. that $\mathcal{X}$ satisfies the equation:
\begin{align}
\mathcal{X}^{t,x}_t = \mathcal{X}_0 + \int^t_0 \mu(r, \mathcal{X}^{t,x}_r)dr + \int^t_0 \sigma(r, \mathcal{X}^{t,x}_r) d\mathcal{W}_r
\end{align}
\end{enumerate}
\end{definition}
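Although Definition \ref{1.9} is purely measure-theoretic, it may help to keep in mind the classical Euler--Maruyama scheme, which produces discrete-time approximations of such processes. The Python sketch below is only an illustration, with the drift, diffusion, grid, and initial value being arbitrary placeholder choices; it is neither a strong solution itself nor the approximation scheme developed later in this dissertation.
\begin{verbatim}
import numpy as np

def euler_maruyama(mu, sigma, x0, T=1.0, n_steps=1000, seed=0):
    # mu(t, x) in R^d and sigma(t, x) in R^{d x m} are user-supplied callables.
    rng = np.random.default_rng(seed)
    d = x0.shape[0]
    m = sigma(0.0, x0).shape[1]
    dt = T / n_steps
    x = np.empty((n_steps + 1, d))
    x[0] = x0
    for k in range(n_steps):
        t = k * dt
        dW = rng.normal(0.0, np.sqrt(dt), size=m)  # Brownian increment over [t, t + dt]
        x[k + 1] = x[k] + mu(t, x[k]) * dt + sigma(t, x[k]) @ dW
    return x

# Placeholder example: one-dimensional Ornstein-Uhlenbeck-type dynamics.
path = euler_maruyama(mu=lambda t, x: -x,
                      sigma=lambda t, x: 0.5 * np.ones((1, 1)),
                      x0=np.array([1.0]))
\end{verbatim}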
\begin{definition}[Strong Uniqueness Property for Solutions to Stochastic Differential Equations]
We say that the pair $(\mu, \sigma)$ exhibits a strong uniqueness property if, whenever we have two strong solutions $\mathcal{X}$ and $\widetilde{\mathcal{X}}$, with respect to the process $\mathcal{W}$ and the initial condition $\mathcal{X}_0 = 0$, as defined in Definition \ref{1.9}, it is also the case that $\mathbb{P}(\mathcal{X}_t = \widetilde{\mathcal{X}}_t) =1$ for all $t\in [0, T]$.
\end{definition}
\subsection{Lipschitz and Related Notions}
\begin{definition}[Globally Lipschitz Function]\label{def:1.13}
Let $d \in \N_0$. For every $d\in \N_0$, we say a function $f: \R^d \rightarrow \R^d$ is (globally) Lipschitz if there exists an $L \in (0,\infty)$ such that for all $x,y \in \R^d$ it is the case that :
\begin{align}
\left\| f(x)-f(y) \right\|_E \leqslant L \cdot \left\| x-y\right\|_E
\end{align}
The set of globally Lipschitz functions over set $X$ will be denoted $\lip_G(X)$
\end{definition}
\begin{corollary}
Let $d \in \N_0$. For every $d \in \N_0$, a continuous function $f \in C(\R^d,\R^d)$ over a compact set $\mathcal{K} \subsetneq \R^d$ is Lipschitz over that set.
\end{corollary}
\begin{proof}
By Heine--Cantor, $f$ is uniformly continuous over the set $\mathcal{K}$. Fix an arbitrary $\epsilon \in (0,\infty)$ and let $\delta$ be as in the definition of uniform continuity. By compactness we have a finite cover of $\mathcal{K}$ by balls of radius $\delta$, centered around $x_i \in \mathcal{K}$:
\begin{align}
\mathcal{K} \subseteq \bigcup^N_{i=1} B_\delta(x_i)
\end{align}
Note that within a given ball $B_\delta(x_i)$, no point $x_j$ satisfies $|x_i-x_j|> \delta$. Thus, by uniform continuity, we have the following:
\begin{align}
|f(x_i)-f(x_j)| < \epsilon \quad \forall i,j \in \{1,2,...,N\}
\end{align}
and thus let $\mathfrak{L}$ be defined as:
\begin{align}
\mathfrak{L} = \max_{\substack{i,j \in \{1,2,...,N\} \\ i \neq j}} \lv \frac{f(x_i)-f(x_j)}{x_i - x_j} \rv
\end{align}
$\mathfrak{L}$ satisfies the Lipschitz property. To see this, let $x_1,x_2$ be two arbitrary points within $\mathcal{K}$. Let $B_\delta(x_i)$ and $B_\delta(x_j)$ be two balls such that $x_1 \in B_\delta(x_i)$ and $x_2 \in B_\delta(x_j)$. The triangle inequality then yields that:
\begin{align}
\left|f(x_1)-f(x_2)\right| &\leqslant \left|f(x_1)-f(x_i)\right| + \left|f(x_i)-f(x_j)\right| + \left|f(x_j)-f(x_2)\right| \nonumber\\
&\leqslant \left|f(x_i)-f(x_j)\right| + 2\epsilon \nonumber\\
&\leqslant \mathfrak{L}\cdot\left|x_i-x_j\right| + 2\epsilon \nonumber\\
&\leqslant \mathfrak{L}\cdot\left|x_1-x_2\right| + 2\epsilon \nonumber
\end{align}
for all $\epsilon \in (0,\infty)$.
\end{proof}
\begin{definition}[Locally Lipschitz Function]\label{def:1.14}
Let $d \in \N_0$. For every $d \in \N_0$ a function $f: \R^d \rightarrow \R^d$ is locally Lipschitz if for all $x_0 \in \R^d$ there exists a compact set $\mathcal{K} \subseteq \domain(f)$ containing $x_0$, and a constant $L \in (0,\infty)$ for that compact set, such that:
\begin{align}
\sup_{\substack{x,y\in \mathcal{K} \\ x\neq y}} \frac{\left\| f(x)-f(y) \right\|_E}{\left\| x-y \right\|_E} \leqslant L
\end{align}
The set of locally Lipschitz functions over set $X$ will be denoted $\lip_L(X)$.
\end{definition}
\begin{corollary}
A function $f: \R^d \rightarrow \R^d$ that is globally Lipschitz is also locally Lipschitz. More concisely $\lip_G(X) \subsetneq \lip_L(X)$.
\end{corollary}
\begin{proof}
Let $f$ be globally Lipschitz with Lipschitz constant $L \in (0,\infty)$ as in Definition \ref{def:1.13}. Let $x_0 \in \domain(f)$ and let $\mathcal{K} \subseteq \domain(f)$ be any compact set containing $x_0$. For all $x,y \in \mathcal{K}$ with $x \neq y$ it is then the case that $\left\| f(x)-f(y)\right\|_E \leqslant L \cdot \left\| x-y\right\|_E$, and hence the supremum in Definition \ref{def:1.14} is bounded above by $L$. Thus $f$ is locally Lipschitz. That the inclusion is proper follows, for instance, from the fact that the component-wise square $x \mapsto \lp x_1^2, x_2^2, \hdots, x_d^2 \rp$ is locally Lipschitz but not globally Lipschitz.
\end{proof}
\subsection{Kolmogorov Equations}
\begin{definition}[Kolmogorov Equation]
We take our definition from \cite[~(7.0.1)]{da_prato_zabczyk_2002} with $u \curvearrowleft u$, $G \curvearrowleft \sigma$, $F \curvearrowleft \mu$, and $\varphi \curvearrowleft g$, and for our purposes we set $A \equiv 0$. Given a separable Hilbert space $H$ (in our case $\R^d$), and letting $\mu: [0, T] \times \R^d \rightarrow \R^d$, $\sigma:[0, T] \times \R^d \rightarrow \R^{d\times m}$, and $g:\R^d \rightarrow \R$ be at least Lipschitz, a Kolmogorov equation is an equation of the form:
\begin{align}\label{(1.7)}
\begin{cases}
\lp \frac{\partial}{\partial t} u \rp \lp t,x \rp = \frac{1}{2} \Trace \lp \sigma \lp t,x \rp \lb \sigma \lp t,x \rp \rb^* \lp \Hess_x u \rp \lp t,x \rp \rp + \la \mu \lp t,x \rp , \lp \nabla_x u \rp \lp t,x \rp \ra \\
u(0,x) = g(x)
\end{cases}
\end{align}
\end{definition}
\begin{definition}[Strict Solution to Kolmogorov Equation]
Let $d\in \N_0$. For every $d\in \N_0$ a function $u: [0,T] \times \R^d \rightarrow \R$ is a strict solution to (\ref{(1.7)}) if and only if:
\begin{enumerate}[label = (\roman*)]
\item $u \in C^{1,1} \lp \lb 0,T \rb \times \R^d \rp$ and $u(0, \cdot) = g$
\item $u(t, \cdot) \in UC^{1,2}([0,T] \times \R^d, \R)$
\item For all $x \in \domain(A)$, $u(\cdot,x)$ is continuously differentiable on $[0,\infty)$ and satisfies (\ref{(1.7)}).
\end{enumerate}
\end{definition}
\begin{definition}[Generalized Solution to Kolmogorov Equation]
A generalized solution to (\ref{(1.7)}) is defined as:
\begin{align}
u(t,x) = \E \lb g \lp \mathcal{X}^{t,x} \rp \rb
\end{align}
where the stochastic process $\mathcal{X}^{t,x}$ is the solution to the stochastic differential equation, for $x \in \R^d$ and $t \in [0,T]$, given by:
\begin{align}
\mathcal{X}^{t,x} = x + \int^t_0 \mu \lp \mathcal{X}^{t,x}_r \rp dr + \int^t_0 \sigma \lp \mathcal{X}^{t,x}_r \rp dW_r
\end{align}
\end{definition}
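Purely as an illustration of the generalized solution, and not as the approximation scheme developed later in this dissertation, $u(t,x) = \E \lb g \lp \mathcal{X}^{t,x} \rp \rb$ suggests a naive Monte Carlo estimator: simulate many independent Euler--Maruyama paths started at $x$ and average $g$ over their values at time $t$. In the Python sketch below, the drift, diffusion, terminal function, and sample sizes are arbitrary placeholder choices, and the diffusion is taken to be scalar for simplicity.
\begin{verbatim}
import numpy as np

def monte_carlo_u(mu, sigma, g, x, t, n_paths=10000, n_steps=200, seed=0):
    # Estimate u(t, x) = E[ g(X^{t,x}) ] by averaging g over simulated paths.
    rng = np.random.default_rng(seed)
    d = x.shape[0]
    dt = t / n_steps
    X = np.tile(x, (n_paths, 1))             # every path starts at x
    for _ in range(n_steps):
        dW = rng.normal(0.0, np.sqrt(dt), size=(n_paths, d))
        X = X + mu(X) * dt + sigma(X) * dW   # scalar (diagonal) diffusion
    return np.mean(g(X))

# Placeholder example: zero drift, unit diffusion, g(x) = ||x||_E^2 in d = 2,
# for which E[ g(X^{t,x}) ] = ||x||_E^2 + 2t can be checked by hand.
estimate = monte_carlo_u(mu=lambda X: 0.0,
                         sigma=lambda X: 1.0,
                         g=lambda X: np.sum(X ** 2, axis=1),
                         x=np.array([0.0, 0.0]),
                         t=1.0)
\end{verbatim}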
\begin{definition}[Laplace Operator w.r.t. $x$]
Let $d \in \N_0$, and $f\in C^2\lp \R^d,\R \rp$. For every $d\in \N_0$, the Laplace operator $\nabla^2_x : C^2(\R^d,\R) \rightarrow C(\R^d,\R)$ is defined as:
\begin{align}
\Delta_xf = \nabla_x^2f := \nabla \cdot \nabla f = \sum^d_{i=1} \frac{\partial^2 f}{\partial x_i^2}
\end{align}
\end{definition}
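For instance, with $f(x) = \left\| x \right\|^2_E = \sum^d_{i=1} x_i^2$ it is the case that $\frac{\partial^2 f}{\partial x_i^2} = 2$ for every $i \in \{1,2,\hdots,d\}$, whence $\Delta_x f = 2d$.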
\subsection{Linear Algebra Notation and Definitions}
\begin{definition}[Identity, Zero Matrix, and the 1-matrix]
Let $d \in \N$. We will define the identity matrix for every $d \in \N$ as the matrix $\mathbb{I}_d \in \R^{d\times d}$ given by:
\begin{align}
\mathbb{I}_d = \lb \mathbb{I}_d \rb_{i,j} = \begin{cases}
1 & i=j \\
0 & \text{else}
\end{cases}
\end{align}
Note that $\mathbb{I}_1 =1$.
Let $m,n,i,j \in \N$. For every $m,n \in \N$, $i \in \left\{1,2,\hdots,m \right\}$, and $j \in \left\{ 1,2,\hdots,n\right\}$ we define the zero matrix $\mymathbb{0}_{m,n} \in \R^{m\times n}$ as:
\begin{align}
\mymathbb{0}_{m,n} =\lb \mymathbb{0}_{m,n} \rb_{i,j} = 0
\end{align}
Where we only have a column of zeros, it is convenient to denote $\mymathbb{0}_d$ where $d$ is the height of the column.
Let $m,n,i,j \in \N$. For every $m,n \in \N$, $i \in \left\{ 1,2,\hdots,m\right\}$, and $j \in \left\{1,2,\hdots,n \right\}$ we define the matrix of ones $\mymathbb{e}_{m,n} \in \R^{m \times n}$ as:
\begin{align}
\mymathbb{e}_{m,n} = \lb \mymathbb{e}_{m,n} \rb_{i,j} = 1
\end{align}
Where we only have a column of ones, it is convenient to denote $\mymathbb{e}_d$ where $d$ is the height of the column.
\end{definition}
\begin{definition}[Single-entry matrix]
Let $m,n,k,l \in \N$ and let $c\in \R$. For $k \in \N \cap \lb 1,m\rb$ and $l \in \N \cap \lb 1,n\rb$, we will denote by $\mymathbb{k}^{m,n}_{k,l,c} \in \R^{m \times n}$ the matrix defined by:
\begin{align}
\mymathbb{k}^{m,n}_{k,l,c} =\lb \mymathbb{k}^{m,n}_{k,l,c}\rb_{i,j} = \begin{cases}
c &:k=i \land l=j \\
0 &: \text{else}
\end{cases}
\end{align}
\end{definition}
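For instance, $\mymathbb{k}^{2,3}_{1,2,c} = \begin{bmatrix} 0 & c & 0 \\ 0 & 0 & 0 \end{bmatrix}$.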
\begin{definition}[Complex conjugate and transpose]
Let $m,n,i,j \in \N$, and $A \in \mathbb{C}^{m \times n}$. For every $m,n \in \N$, $i \in \left\{1,2,\hdots,m\right\}$ and $j \in \left\{1,2,\hdots, n\right\}$, we denote by $A^* \in \mathbb{C}^{n \times m}$ the matrix:
\begin{align}
A^*\coloneqq \lb A^* \rb _{i,j} = \overline{\lb A \rb _{j,i}}
\end{align}
Where it is clear that we are dealing with real matrices, i.e., $A \in \R^{m\times n}$, we will denote this as $A^\intercal$.
\end{definition}
\begin{definition}[Column and Row Notation]\label{def:1.1.23}
Let $m,n,i,j \in \N$ and let $A \in \R^{m \times n}$. For every $m,n \in \N$ and $i \in \left\{ 1,2,\hdots ,m\right\}$ we denote the $i$-th row as:
\begin{align}
[A]_{i,*} = \begin{bmatrix}
a_{i,1} & a_{i,2} & \cdots & a_{i,n}
\end{bmatrix}
\end{align}
Similarly, for every $m,n \in \N$ and $j \in \left\{ 1,2,\hdots,n\right\}$, we denote the $j$-th column as:
\begin{align}
[A]_{*,j} = \begin{bmatrix}
a_{1,j} \\
a_{2,j} \\
\vdots \\
a_{m,j}
\end{bmatrix}
\end{align}
\end{definition}
\begin{definition}[Component-wise notation]
Let $m,n,i,j \in \N$, and let $A \in \R^{m \times n}$. Let $f: \R \rightarrow \R$. For all $m,n \in \N, i \in \left\{1,2,\hdots,m \right\}$, and $j \in \left\{1,2,\hdots,n \right\}$ we will define $f \lp \lb A \rb_{*,*} \rp \in \R^{m \times n}$ as:
\begin{align}
f\lp \lb A\rb_{*,*}\rp \coloneqq \lb f \lp \lb A\rb_{i,j}\rp \rb_{i,j}
\end{align}
Thus under this notation the component-wise square of $A$ is $\lp \lb A \rb_{*,*}\rp^2$, the component-wise $\sin$ is $\sin\lp \lb A \rb_{*,*}\rp$ and the Hadamard product of $A,B \in \R^{m \times n}$ then becomes $ A \odot B = \ \lb A \rb_{*,*} \times \lb B \rb_{*,*}$.
\end{definition}
\begin{remark}
Where we are dealing with a column vector $x \in \R^{d \times 1}$ and it is evident from the context, we may choose to write $f\lp \lb x\rb_* \rp$.
\end{remark}
\begin{definition}[The Diagonalization Operator]
Let $m_1,m_2,n_1,n_2 \in \N$. Given $A \in \R^{m_1 \times n_1}$ and $B \in \R^{m_2\times n_2}$, we will denote by $\diag\lp A,B\rp$ the matrix:
\begin{align}
\diag\lp A,B\rp = \begin{bmatrix}
A & \mymathbb{0}_{m_1,n_2}\\
\mymathbb{0}_{m_2,n_1}& B
\end{bmatrix}
\end{align}
\end{definition}
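For instance, $\diag\lp \mathbb{I}_2, \mymathbb{e}_{1,2}\rp = \begin{bmatrix} 1 & 0 & 0 & 0 \\ 0 & 1 & 0 & 0 \\ 0 & 0 & 1 & 1 \end{bmatrix}$.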
\begin{remark}
$\diag\lp A_1,A_2,\hdots,A_n\rp$ is defined analogously for a finite set of matrices $A_1,A_2,\hdots,A_n$.
\end{remark}
\begin{definition}[Number of rows and columns notation]
Let $m,n \in \N$ and let $A\in \R^{m \times n}$. Let $\rows:\R^{m \times n} \rightarrow\N$ and $\columns:\R^{m\times n} \rightarrow \N$ be the functions given, respectively, by $\rows\lp A \rp = m$ and $\columns\lp A\rp = n$.
\end{definition}
\subsection{$O$-type Notation and Function Growth}
\begin{definition}[$O$-type notation]
Let $g \in C(\R,\R)$. We say that $f \in C(\R,\R)$ is in $O(g(x))$, denoted $f \in O(g(x))$, if there exists $c\in \lp 0, \infty\rp$ and $x_0 \in \lp 0,\infty\rp$ such that for all $x\in \lb x_0,\infty \rp $ it is the case that:
\begin{align}
0 \leqslant f(x) \leqslant c \cdot g(x)
\end{align}
We say that $f \in \Omega(g(x))$ if there exists $c\in \lp 0,\infty\rp$ and $x_0 \in \lp 0,\infty \rp$ such that for all $x\in \lb x_0, \infty\rp$ it is the case that:
\begin{align}
0 \leqslant cg(x) \leqslant f(x)
\end{align}
We say that $f \in \Theta(g(x))$ if there exists $c_1,c_2,x_0 \in \lp 0,\infty\rp$ such that for all $x \in \lb x_0,\infty\rp$ it is the case that:
\begin{align}
0 \leqslant c_1g(x) \leqslant f(x) \leqslant c_2g(x)
\end{align}
\end{definition}
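For instance, $f(x) = 3x^2 + 5$ satisfies $f \in O(x^2)$, since with $c = 4$ and $x_0 = \sqrt{5}$ it is the case that $0 \leqslant 3x^2 + 5 \leqslant 4x^2$ for all $x \in \lb x_0, \infty \rp$; indeed $f \in \Theta(x^2)$, since additionally $3x^2 \leqslant f(x)$ for all $x \in \lb x_0, \infty \rp$.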
\begin{corollary}[Bounded functions and $O$-type notation]\label{1.1.20.1}
Let $f \in C(\R,\R)$, then:
\begin{enumerate}[label = (\roman*)]
\item if there exists an $M \in (0,\infty)$ such that $0 \leqslant f(x) \leqslant M$ for all $x\in \R$, then $f \in O(1)$.
\item if there exists an $M \in (0,\infty)$ such that $f(x) \geqslant M$ for all $x \in \R$, then $f \in \Omega(1)$.
\item if there exist $M_1,M_2 \in (0,\infty)$ such that $M_1 \leqslant f(x) \leqslant M_2$ for all $x\in \R$, then $f \in \Theta(1)$.
\end{enumerate}
\end{corollary}
\begin{proof}
Assume $f \in C(\R, \R)$, then:
\begin{enumerate}[label = (\roman*)]
\item Assume for all $x \in \R$ it is the case that $0 \leqslant f(x) \leqslant M$ for some $M\in \lp 0,\infty\rp$. Then, taking $c = M$, $g(x) = 1$, and any $x_0\in \lp 0,\infty \rp$, it is the case that $0 \leqslant f(x) \leqslant M \cdot 1$ for all $x\in \lb x_0,\infty \rp$, whence $f \in O(1)$.
\item Assume for all $x \in \R$ it is the case that $f(x) \geqslant M $ for some $M\in \lp 0,\infty\rp$. Then, taking $c = M$, $g(x) = 1$, and any $x_0\in \lp 0,\infty \rp$, it is the case that $0 \leqslant M \cdot 1 \leqslant f(x)$ for all $x\in \lb x_0, \infty \rp$, whence $f \in \Omega(1)$.
\item This is a consequence of items (i) and (ii).
\end{enumerate}
\end{proof}
\begin{corollary}\label{1.1.20.2}
Let $n\in \N_0$ and let $f \in O(x^n)$. It is then also the case that $f \in O \lp x^{n+1} \rp$.
\end{corollary}
\begin{proof}
Let $f \in O(x^n)$. Then there exists $c_0,x_0 \in \lp 0,\infty\rp$, such that for all $x \in \lb x_0,\infty\rp$ it is the case that:
\begin{align}
f(x) \leqslant c_0\cdot x^n
\end{align}
Note however that for all $n\in \N_0$, there also exists $c_1,x_1 \in \lp 0,\infty\rp$ such that for all $x \in \lp x_1,\infty \rp$ it is the case that:
\begin{align}
x^n \leqslant c_1\cdot x^{n+1}
\end{align}
Thus, taken together, this implies that for all $x \in \lb \max \left\{ x_0,x_1\right\},\infty\rp$ it is the case that:
\begin{align}
f(x) \leqslant c_0 \cdot x^n \leqslant c_0\cdot c_1 \cdot x^{n+1}
\end{align}
whence $f \in O\lp x^{n+1}\rp$.
\end{proof}
\begin{definition}[The floor and ceiling functions]
We denote by $\lfloor\cdot \rfloor: \R \rightarrow \Z$ and $\lceil \cdot \rceil: \R \rightarrow\Z$ the functions satisfying for all $x \in \R$ that $\lfloor x \rfloor = \max \lp \Z \cap \lp -\infty,x \rb \rp $ and $\lceil x \rceil = \min \lp \Z \cap \lb x,\infty \rp \rp$.
\end{definition}
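Thus, for example, $\lfloor 2.7 \rfloor = 2$, $\lceil 2.7 \rceil = 3$, $\lfloor -1.5 \rfloor = -2$, $\lceil -1.5 \rceil = -1$, and $\lfloor 3 \rfloor = \lceil 3 \rceil = 3$.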
\subsection{The Concatenation of Vectors \& Functions}
\begin{definition}[Vertical Vector Concatenation]
Let $m,n \in \N$. Let $x= \lb x_1 \: x_2\: \hdots \: x_m \rb^\intercal \in \R^m$ and $y = \lb y_1 \: y_2 \: \hdots \: y_n\rb^\intercal \in \R^n$. For every $m,n \in \N$, we will denote by $x \frown y \in \R^m \times \R^n$ the vector given as:
\begin{align}
\begin{bmatrix}
x_1 \\x_2\\ \vdots \\x_m \\y_1 \\y_2\\ \vdots \\y_n
\end{bmatrix}
\end{align}
\end{definition}
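For instance, for $x = \lb 1 \: 2 \rb^\intercal \in \R^2$ and $y = \lb 3 \: 4 \: 5 \rb^\intercal \in \R^3$ it is the case that $x \frown y = \lb 1 \: 2 \: 3 \: 4 \: 5 \rb^\intercal$.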
\begin{remark}
We will stipulate that when concatenating vectors as $x_1 \frown x_2$, $x_1$ is on top, and $x_2$ is at the bottom.
\end{remark}
\begin{corollary}\label{sum_of_frown_frown_of_sum}
Let $m_1,m_2,n_1,n_2 \in \N$ with $m_1 = n_1$ and $m_2 = n_2$, and let $x \in \R^{m_1}$, $y \in \R^{n_1}$, $\fx\in \R^{m_2}$, and $\fy \in \R^{n_2}$. It is then the case that $\lb x \frown \fx\rb+\lb y \frown \fy\rb = \lb x+y\rb\frown \lb \fx +\fy\rb$.
\end{corollary}
\begin{proof}
This follows straightforwardly from the fact that:
\begin{align}
\lb x \frown \fx \rb + \lb y \frown \fy\rb = \begin{bmatrix}
x_1 \\ x_2 \\ \vdots \\ x_{m_1} \\ \fx_1 \\ \fx_2 \\ \vdots \\ \fx_{m_2}
\end{bmatrix} + \begin{bmatrix}
y_1 \\ y_2 \\ \vdots \\ y_{n_1} \\ \fy_1\\ \fy_2 \\ \vdots \\ \fy_{n_2}
\end{bmatrix} = \begin{bmatrix}
x_1+y_1 \\ x_2 + y_2 \\ \vdots \\ x_{m_1}+y_{n_1} \\ \fx_1+\fy_1 \\ \fx_2 + \fy_2 \\ \vdots \\ \fx_{m_2} + \fy_{n_2}
\end{bmatrix} = \lb x+y\rb\frown \lb \fx +\fy\rb
\end{align}
\end{proof}
\begin{definition}[Function Concatenation]
Let $m_1,n_1,m_2,n_2 \in \N$. Let $f : \R^{m_1} \rightarrow\R^{n_1}$ and $g: \R^{m_2}\rightarrow\R^{n_2}$. We will denote by $f \frown g: \R^{m_1} \times \R^{m_2} \rightarrow \R^{n_1} \times \R^{n_2}$ the function given for all $x = \{ x_1,x_2,\hdots, x_{m_1}\} \in \R^{m_1}$, $\overline{x} = \{ \overline{x}_1,\overline{x}_2,\hdots ,\overline{x}_{m_2}\} \in \R^{m_2}$, and $x \frown \overline{x} =\{x_1,x_2,\hdots,x_{m_1},\overline{x}_1,\overline{x}_2,\hdots,\overline{x}_{m_2}\} \in \R^{m_1} \times \R^{m_2}$ by:
\begin{align}
\begin{bmatrix}
x_1 \\ x_2\\ \vdots \\x_{m_1} \\\overline{x_1} \\\overline{x_2}\\ \vdots \\ \overline{x_{m_2}}
\end{bmatrix} \xrightarrow{\hspace*{1.5cm}}
\begin{bmatrix}
f(x) \\ g(\overline{x})
\end{bmatrix}
\end{align}
\end{definition}
\begin{corollary}\label{concat_fun_fun_concat}
Let $m,n \in \N$. Let $x_1 \in \R^m$,$x_2 \in \R^n$, and $f\in C\lp \R, \R\rp$. It is then the case that $f\lp x_1 \frown x_2\rp = f \lp x_1\rp \frown f\lp x_2\rp$.
\end{corollary}
\begin{proof}
This follows straightforwardly from the definition of function concatenation.
\end{proof}
\begin{lemma}\label{par_cont}
Let $m_1,m_2,n_1, n_2 \in \N$. Let $f \in C\lp \R^{m_1}, \R^{n_1}\rp$ and $g \in C\lp \R^{m_2}, \R^{n_2}\rp$. It is then also the case that $f \frown g \in C \lp \R^{m_1} \times \R^{m_2}, \R^{n_1} \times \R^{n_2}\rp$.
\end{lemma}
\begin{proof}
Equip $\R^{n_1} \times \R^{n_2}$ with the usual product topology, i.e., the topology generated by all products $X \times Y$ of open subsets $X \subseteq \R^{n_1}$ and $Y\subseteq \R^{n_2}$. Since such products form a basis of the product topology, and since preimages commute with arbitrary unions, it suffices to show that $\lp f \frown g \rp^{-1}\lp X \times Y\rp$ is open for every open $X \subseteq \R^{n_1}$ and $Y \subseteq \R^{n_2}$. By the definition of function concatenation it is the case that $\lp f \frown g \rp^{-1}\lp X \times Y\rp = f^{-1}\lp X\rp \times g^{-1}\lp Y\rp$, and by the continuity of $f$ and $g$ the sets $f^{-1}\lp X\rp \subseteq \R^{m_1}$ and $g^{-1}\lp Y\rp \subseteq \R^{m_2}$ are open. Their product is therefore open in $\R^{m_1} \times \R^{m_2}$, proving the lemma.
\end{proof}