author    Jaron Kent-Dobias <jaron@kent-dobias.com>  2025-02-12 22:06:09 -0300
committer Jaron Kent-Dobias <jaron@kent-dobias.com>  2025-02-12 22:06:09 -0300
commit    70aa0dc5a377b8a072ba96b18b9f6adc760df9b5 (patch)
tree      87a0d408959cfa6ad28864d6cb9096e861cdd39e
parent    ae56a721eb63e6e2032e10436a4944d9203aa560 (diff)
More writing and figure creation.
-rw-r--r--  figs/gradient_vs_sgd_1.png  bin 0 -> 75373 bytes
-rw-r--r--  figs/gradient_vs_sgd_2.png  bin 0 -> 76257 bytes
-rw-r--r--  figs/gradient_vs_sgd_3.png  bin 0 -> 97379 bytes
-rw-r--r--  figs/gradient_vs_sgd_4.png  bin 0 -> 135879 bytes
-rw-r--r--  figs/gradient_vs_sgd_5.png  bin 0 -> 156623 bytes
-rw-r--r--  gradient_vs_sgd.xcf         bin 0 -> 249706 bytes
-rw-r--r--  ictp-saifr_colloquium.tex   111
7 files changed, 95 insertions(+), 16 deletions(-)
diff --git a/figs/gradient_vs_sgd_1.png b/figs/gradient_vs_sgd_1.png
new file mode 100644
index 0000000..12bc5c1
--- /dev/null
+++ b/figs/gradient_vs_sgd_1.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_2.png b/figs/gradient_vs_sgd_2.png
new file mode 100644
index 0000000..a4cd977
--- /dev/null
+++ b/figs/gradient_vs_sgd_2.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_3.png b/figs/gradient_vs_sgd_3.png
new file mode 100644
index 0000000..1883cea
--- /dev/null
+++ b/figs/gradient_vs_sgd_3.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_4.png b/figs/gradient_vs_sgd_4.png
new file mode 100644
index 0000000..f3b9c51
--- /dev/null
+++ b/figs/gradient_vs_sgd_4.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_5.png b/figs/gradient_vs_sgd_5.png
new file mode 100644
index 0000000..faab526
--- /dev/null
+++ b/figs/gradient_vs_sgd_5.png
Binary files differ
diff --git a/gradient_vs_sgd.xcf b/gradient_vs_sgd.xcf
new file mode 100644
index 0000000..13f95e0
--- /dev/null
+++ b/gradient_vs_sgd.xcf
Binary files differ
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 92b4904..1d823a7 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -12,6 +12,7 @@
\usepackage{pifont}
\usepackage{graphicx}
\usepackage{xcolor}
+\usepackage{tikz}
\definecolor{ictpblue}{HTML}{0471b9}
\definecolor{ictpgreen}{HTML}{0c8636}
@@ -59,7 +60,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
\begin{columns}
\begin{column}{0.4\textwidth}
@@ -81,7 +83,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
\begin{columns}
\begin{column}{0.4\textwidth}
@@ -109,7 +112,8 @@
\end{columns}
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
\begin{columns}
\begin{column}{0.333\textwidth}
@@ -188,7 +192,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
\begin{columns}
\begin{column}{0.5\textwidth}
@@ -224,7 +229,7 @@
\end{frame}
\begin{frame}
- \frametitle{Machine learning is just curve fitting}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
\begin{columns}
\begin{column}{0.9\textwidth}
@@ -261,7 +266,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, the ugly, and the weird}
\begin{columns}
\begin{column}{0.25\textwidth}
\centering
@@ -347,7 +353,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, the ugly, and the weird}
\begin{columns}
\begin{column}{0.5\textwidth}
@@ -370,7 +377,8 @@
\end{frame}
\begin{frame}
- \frametitle{Overparamaterized curve fitting}
+ \frametitle{Linear least squares}
+ \framesubtitle{Overparameterized solutions are not unique}
\begin{columns}
\begin{column}{0.5\textwidth}
Underparameterized fitting ($M>N$) has a unique minimizing solution
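[Editorial note, not part of the patch: for orientation, the unique minimizer the slide refers to is the textbook normal-equations solution. With $M$ data points, $N$ parameters, and a full-column-rank design matrix $X \in \mathbb{R}^{M \times N}$ ($M > N$),
\[
  a^* = (X^\top X)^{-1} X^\top y .
\]
In the overparameterized case $M < N$ below, $X^\top X$ is singular and this formula no longer applies, which is why the minimizer is not unique.]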
@@ -408,7 +416,8 @@
\end{frame}
\begin{frame}
- \frametitle{Overparamaterized curve fitting and algorithms}
+ \frametitle{Linear least squares}
+ \framesubtitle{Gradient descent and implicit regularization}
\begin{columns}
\begin{column}{0.5\textwidth}
Overparameterized fits found with gradient descent algorithm: take small
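[Editorial note: the frame above finds overparameterized fits by gradient descent, and the new subtitle names the key fact that this implicitly regularizes. A minimal numpy sketch of that fact, with synthetic data and variable names of my choosing rather than the talk's: gradient descent started from zero converges to the minimum-norm interpolating solution.

import numpy as np

rng = np.random.default_rng(0)
M, N = 10, 50                         # M data points, N > M parameters: overparameterized
X = rng.standard_normal((M, N))       # design matrix (basis functions at the data points)
y = rng.standard_normal(M)            # observations

a = np.zeros(N)                       # gradient descent from the origin
lr = 1.0 / np.linalg.norm(X, 2) ** 2  # safe step size, 1 / lambda_max(X^T X)
for _ in range(20000):
    a -= lr * X.T @ (X @ a - y)       # small step along -grad of (1/2) chi^2

a_min_norm = np.linalg.pinv(X) @ y    # minimum-norm solution of X a = y
print(np.allclose(a, a_min_norm, atol=1e-6))  # True: implicit regularization

Starting at zero keeps the iterate in the row space of X, so the limit is the interpolating solution of smallest norm.]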
@@ -439,7 +448,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
\begin{columns}
\begin{column}{0.5\textwidth}
@@ -462,7 +472,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
\centering
\Large\textbf{Polynomial basis}\normalsize
@@ -553,7 +564,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
\centering
\Large\textbf{Absolute value basis\vphantom{y}}\normalsize
@@ -644,7 +656,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
\begin{columns}
\begin{column}{0.5\textwidth}
@@ -667,7 +680,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
\begin{columns}
\begin{column}{0.5\textwidth}
@@ -680,7 +694,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
\begin{columns}
\begin{column}{0.25\textwidth}
\centering
@@ -766,7 +781,8 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
\begin{columns}
\begin{column}{0.5\textwidth}
@@ -784,6 +800,69 @@
\end{frame}
\begin{frame}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ Gradient descent produces poor solutions to many machine learning problems
+
+ \bigskip
+
+ \textbf{BUT:} in practice no one uses plain gradient descent
+
+ \bigskip
+
+ \emph{Stochastic} gradient descent (SGD): follow an approximate gradient of $\chi^2$ computed on small subsets (batches) of the data
+
+ \bigskip
+
+ The approximate gradient takes \emph{fewer} steps to find \emph{better} solutions
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \end{column}
+ \end{columns}
+\end{frame}
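[Editorial note: a minimal sketch of the contrast this frame draws, again with numpy and synthetic data (batch_size and the other names are illustrative choices, not from the talk): each step follows the gradient of chi^2 computed on a small random batch instead of the full dataset.

import numpy as np

rng = np.random.default_rng(1)
M, N, batch_size = 200, 400, 16   # overparameterized: more parameters than data
X = rng.standard_normal((M, N))
y = rng.standard_normal(M)

a = np.zeros(N)
lr = 1e-3
for _ in range(5000):
    idx = rng.choice(M, size=batch_size, replace=False)  # draw a random batch
    a -= lr * X[idx].T @ (X[idx] @ a - y[idx])           # approximate-gradient step

print(np.linalg.norm(X @ a - y))  # full-data residual, driven toward zero

Each step touches only batch_size rows of X, so a gradient evaluation is far cheaper than in full-batch descent; the noise in the batch gradient is what the later slides credit with finding high-entropy solutions.]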
+
+\begin{frame}
+ \begin{columns}
+ \begin{column}{0.9\textwidth}
+ \begin{overprint}
+ \onslide<1>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_1.png}
+ \onslide<2>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_2.png}
+ \onslide<3>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_3.png}
+ \onslide<4>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_4.png}
+ \onslide<5>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_5.png}
+ \end{overprint}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\begin{frame}
+ \centering
+ \begin{tikzpicture}
+ \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}};
+ \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}};
+ \draw (-4,2) node[align=center] {Neural networks\\are good bases\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}};
+ \draw (-4,-2) node[align=center] {Data is sparse\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}};
+ \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}};
+ \end{tikzpicture}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ Structure and geometry of the manifold of ``perfect'' solutions is integral to understanding overparameterized fits
+
+ \medskip
+
+ \textbf{BUT:} extremely little is known outside of the linear case
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \end{column}
+ \end{columns}
+\end{frame}
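[Editorial note: in the linear case the manifold of ``perfect'' solutions can be written down exactly (a standard fact, included here for orientation). For a full-row-rank design matrix $X \in \mathbb{R}^{M \times N}$ with $M < N$,
\[
  \{\, a \in \mathbb{R}^N : Xa = y \,\} = a^* + \ker X ,
\]
an affine subspace of dimension $N - M$, where $a^*$ is any particular solution (for instance the minimum-norm one, $a^* = X^+ y$). Nonlinear models replace this flat subspace with a curved manifold whose geometry is the open question.]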
+
+\begin{frame}
\frametitle{The Euler characteristic \boldmath{$\chi$}}
\begin{columns}
\begin{column}{0.5\textwidth}