From 70aa0dc5a377b8a072ba96b18b9f6adc760df9b5 Mon Sep 17 00:00:00 2001
From: Jaron Kent-Dobias
Date: Wed, 12 Feb 2025 22:06:09 -0300
Subject: More writing and figure creation.

---
 figs/gradient_vs_sgd_1.png | Bin 0 -> 75373 bytes
 figs/gradient_vs_sgd_2.png | Bin 0 -> 76257 bytes
 figs/gradient_vs_sgd_3.png | Bin 0 -> 97379 bytes
 figs/gradient_vs_sgd_4.png | Bin 0 -> 135879 bytes
 figs/gradient_vs_sgd_5.png | Bin 0 -> 156623 bytes
 gradient_vs_sgd.xcf        | Bin 0 -> 249706 bytes
 ictp-saifr_colloquium.tex  | 111 ++++++++++++++++++++++++++++++++++++++-------
 7 files changed, 95 insertions(+), 16 deletions(-)
 create mode 100644 figs/gradient_vs_sgd_1.png
 create mode 100644 figs/gradient_vs_sgd_2.png
 create mode 100644 figs/gradient_vs_sgd_3.png
 create mode 100644 figs/gradient_vs_sgd_4.png
 create mode 100644 figs/gradient_vs_sgd_5.png
 create mode 100644 gradient_vs_sgd.xcf

diff --git a/figs/gradient_vs_sgd_1.png b/figs/gradient_vs_sgd_1.png
new file mode 100644
index 0000000..12bc5c1
Binary files /dev/null and b/figs/gradient_vs_sgd_1.png differ
diff --git a/figs/gradient_vs_sgd_2.png b/figs/gradient_vs_sgd_2.png
new file mode 100644
index 0000000..a4cd977
Binary files /dev/null and b/figs/gradient_vs_sgd_2.png differ
diff --git a/figs/gradient_vs_sgd_3.png b/figs/gradient_vs_sgd_3.png
new file mode 100644
index 0000000..1883cea
Binary files /dev/null and b/figs/gradient_vs_sgd_3.png differ
diff --git a/figs/gradient_vs_sgd_4.png b/figs/gradient_vs_sgd_4.png
new file mode 100644
index 0000000..f3b9c51
Binary files /dev/null and b/figs/gradient_vs_sgd_4.png differ
diff --git a/figs/gradient_vs_sgd_5.png b/figs/gradient_vs_sgd_5.png
new file mode 100644
index 0000000..faab526
Binary files /dev/null and b/figs/gradient_vs_sgd_5.png differ
diff --git a/gradient_vs_sgd.xcf b/gradient_vs_sgd.xcf
new file mode 100644
index 0000000..13f95e0
Binary files /dev/null and b/gradient_vs_sgd.xcf differ
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 92b4904..1d823a7 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -12,6 +12,7 @@
 \usepackage{pifont}
 \usepackage{graphicx}
 \usepackage{xcolor}
+\usepackage{tikz}
 
 \definecolor{ictpblue}{HTML}{0471b9}
 \definecolor{ictpgreen}{HTML}{0c8636}
@@ -59,7 +60,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 
 \begin{columns}
 \begin{column}{0.4\textwidth}
@@ -81,7 +83,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 
 \begin{columns}
 \begin{column}{0.4\textwidth}
@@ -109,7 +112,8 @@
 \end{columns}
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 \begin{columns}
 \begin{column}{0.333\textwidth}
@@ -188,7 +192,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -224,7 +229,7 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Machine learning is just curve fitting}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
 
 \begin{columns}
 \begin{column}{0.9\textwidth}
@@ -261,7 +266,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, the ugly, and the weird}
 \begin{columns}
 \begin{column}{0.25\textwidth}
 \centering
@@ -347,7 +353,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, the ugly, and the weird}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -370,7 +377,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Overparamaterized curve fitting}
+ \frametitle{Linear least squares}
+ \framesubtitle{Overparameterized solutions are not unique}
 \begin{columns}
 \begin{column}{0.5\textwidth}
 Underparameterized fitting ($M>N$) has a unique minimizing solution
@@ -408,7 +416,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Overparamaterized curve fitting and algorithms}
+ \frametitle{Linear least squares}
+ \framesubtitle{Gradient descent and implicit regularization}
 \begin{columns}
 \begin{column}{0.5\textwidth}
 Overparameterized fits found with gradient descent algorithm: take small
@@ -439,7 +448,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -462,7 +472,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \centering
 \Large\textbf{Polynomial basis}\normalsize
@@ -553,7 +564,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \centering
 \Large\textbf{Absolute value basis\vphantom{y}}\normalsize
@@ -644,7 +656,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -667,7 +680,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -680,7 +694,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
 \begin{columns}
 \begin{column}{0.25\textwidth}
 \centering
@@ -766,7 +781,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -783,6 +799,69 @@
 \end{columns}
 \end{frame}
 
+\begin{frame}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ Gradient descent produces poor solutions to many machine learning problems
+
+ \bigskip
+
+ \textbf{BUT:} no one uses gradient descent
+
+ \bigskip
+
+ \emph{Stochastic} gradient descent (SGD): follow an approximate gradient of $\chi^2$ computed using small subsets (batches) of the data
+
+ \bigskip
+
+ The approximate gradient takes \emph{fewer} steps to find \emph{better} solutions
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\begin{frame}
+ \begin{columns}
+ \begin{column}{0.9\textwidth}
+ \begin{overprint}
+ \onslide<1>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_1.png}
+ \onslide<2>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_2.png}
+ \onslide<3>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_3.png}
+ \onslide<4>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_4.png}
+ \onslide<5>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_5.png}
+ \end{overprint}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\begin{frame}
+ \centering
+ \begin{tikzpicture}
+ \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}};
+ \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}};
+ \draw (-4,2) node[align=center] {Neural networks\\are good bases\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}};
+ \draw (-4,-2) node[align=center] {Data is sparse\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}};
+ \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}};
+ \end{tikzpicture}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ The structure and geometry of the manifold of ``perfect'' solutions are integral to understanding overparameterized fits
+
+ \medskip
+
+ \textbf{BUT:} extremely little is known outside of the linear case
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \end{column}
+ \end{columns}
+\end{frame}
+
 \begin{frame}
 \frametitle{The Euler characteristic \boldmath{$\chi$}}
 \begin{columns}
--
cgit v1.2.3-70-g09d2
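
A worked formula, to pin down the objective the slides keep calling $\chi^2$. The symbols $x_i$, $y_i$, and the basis functions $f_j$ are this note's assumptions, not notation taken from the patch:

\[
  \chi^2(a) = \sum_{i=1}^{M} \left( y_i - \sum_{j=1}^{N} a_j f_j(x_i) \right)^2
\]

"Linear" means the fit is linear in the parameters $a_j$, so $\chi^2$ is quadratic and its minimizers can be characterized exactly. The machine-learning case replaces the sum $\sum_j a_j f_j(x_i)$ with a function that depends nonlinearly on the parameters, which is the sense in which machine learning is \emph{non}linear least squares.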
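A minimal numerical sketch (not part of the patch; all names and values are illustrative) of the two claims on the overparameterized slides: with more parameters N than data points M the perfect fits form a manifold rather than a unique point, and gradient descent started from zero implicitly regularizes by landing on the minimum-norm point of that manifold.

import numpy as np

rng = np.random.default_rng(0)
M, N = 5, 20                      # M data points, N parameters: overparameterized
A = rng.standard_normal((M, N))   # basis functions evaluated at the data points
y = rng.standard_normal(M)        # observations

# Every a with A a = y fits perfectly, and adding any vector from the
# (N - M)-dimensional null space of A gives another perfect fit: a
# manifold of solutions, not a unique minimizer.
a_min = np.linalg.pinv(A) @ y     # the minimum-norm perfect fit

# Gradient descent on chi^2(a) = |y - A a|^2 from a = 0 never leaves the
# row space of A, so it converges to that same minimum-norm solution.
a = np.zeros(N)
eta = 1e-2                        # small step size
for _ in range(20000):
    a -= eta * 2 * A.T @ (A @ a - y)

print(np.allclose(A @ a, y))      # True: gradient descent fit the data perfectly
print(np.allclose(a, a_min))      # True: and picked the minimum-norm solution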
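A companion sketch of the SGD slide under the same assumptions: each step follows a gradient of $\chi^2$ estimated from a small random batch of the data rather than the full sum. It illustrates the mechanics only and does not reproduce the comparisons in figs/gradient_vs_sgd_*.png.

import numpy as np

rng = np.random.default_rng(1)
M, N, B = 100, 200, 10            # M data points, N parameters, batch size B
A = rng.standard_normal((M, N))
y = rng.standard_normal(M)

a = np.zeros(N)
eta = 1e-3                        # step size
for _ in range(20000):
    batch = rng.choice(M, size=B, replace=False)  # small random subset of the data
    r = A[batch] @ a - y[batch]                   # residuals on the batch only
    a -= eta * 2 * A[batch].T @ r                 # approximate gradient step

# In the overparameterized regime the model can interpolate, so the batch
# gradients can all vanish together and SGD drives the full chi^2 toward zero.
print(np.mean((A @ a - y) ** 2))  # training error per data point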