From 70aa0dc5a377b8a072ba96b18b9f6adc760df9b5 Mon Sep 17 00:00:00 2001
From: Jaron Kent-Dobias
Date: Wed, 12 Feb 2025 22:06:09 -0300
Subject: More writing and figure creation.

---
 figs/gradient_vs_sgd_1.png | Bin 0 -> 75373 bytes
 figs/gradient_vs_sgd_2.png | Bin 0 -> 76257 bytes
 figs/gradient_vs_sgd_3.png | Bin 0 -> 97379 bytes
 figs/gradient_vs_sgd_4.png | Bin 0 -> 135879 bytes
 figs/gradient_vs_sgd_5.png | Bin 0 -> 156623 bytes
 gradient_vs_sgd.xcf        | Bin 0 -> 249706 bytes
 ictp-saifr_colloquium.tex  | 111 ++++++++++++++++++++++++++++++++++++++-------
 7 files changed, 95 insertions(+), 16 deletions(-)
 create mode 100644 figs/gradient_vs_sgd_1.png
 create mode 100644 figs/gradient_vs_sgd_2.png
 create mode 100644 figs/gradient_vs_sgd_3.png
 create mode 100644 figs/gradient_vs_sgd_4.png
 create mode 100644 figs/gradient_vs_sgd_5.png
 create mode 100644 gradient_vs_sgd.xcf

diff --git a/figs/gradient_vs_sgd_1.png b/figs/gradient_vs_sgd_1.png
new file mode 100644
index 0000000..12bc5c1
Binary files /dev/null and b/figs/gradient_vs_sgd_1.png differ
diff --git a/figs/gradient_vs_sgd_2.png b/figs/gradient_vs_sgd_2.png
new file mode 100644
index 0000000..a4cd977
Binary files /dev/null and b/figs/gradient_vs_sgd_2.png differ
diff --git a/figs/gradient_vs_sgd_3.png b/figs/gradient_vs_sgd_3.png
new file mode 100644
index 0000000..1883cea
Binary files /dev/null and b/figs/gradient_vs_sgd_3.png differ
diff --git a/figs/gradient_vs_sgd_4.png b/figs/gradient_vs_sgd_4.png
new file mode 100644
index 0000000..f3b9c51
Binary files /dev/null and b/figs/gradient_vs_sgd_4.png differ
diff --git a/figs/gradient_vs_sgd_5.png b/figs/gradient_vs_sgd_5.png
new file mode 100644
index 0000000..faab526
Binary files /dev/null and b/figs/gradient_vs_sgd_5.png differ
diff --git a/gradient_vs_sgd.xcf b/gradient_vs_sgd.xcf
new file mode 100644
index 0000000..13f95e0
Binary files /dev/null and b/gradient_vs_sgd.xcf differ
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 92b4904..1d823a7 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -12,6 +12,7 @@
 \usepackage{pifont}
 \usepackage{graphicx}
 \usepackage{xcolor}
+\usepackage{tikz}
 
 \definecolor{ictpblue}{HTML}{0471b9}
 \definecolor{ictpgreen}{HTML}{0c8636}
@@ -59,7 +60,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 
 \begin{columns}
 \begin{column}{0.4\textwidth}
@@ -81,7 +83,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 
 \begin{columns}
 \begin{column}{0.4\textwidth}
@@ -109,7 +112,8 @@
 \end{columns}
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 \begin{columns}
 \begin{column}{0.333\textwidth}
@@ -188,7 +192,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, and the ugly}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, and the ugly}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -224,7 +229,7 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Machine learning is just curve fitting}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
 
 \begin{columns}
 \begin{column}{0.9\textwidth}
@@ -261,7 +266,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, the ugly, and the weird}
 \begin{columns}
 \begin{column}{0.25\textwidth}
 \centering
@@ -347,7 +353,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+ \frametitle{Linear least squares}
+ \framesubtitle{The bad, the good, the ugly, and the weird}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -370,7 +377,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Overparamaterized curve fitting}
+ \frametitle{Linear least squares}
+ \framesubtitle{Overparameterized solutions are not unique}
 \begin{columns}
 \begin{column}{0.5\textwidth}
 Underparameterized fitting ($M>N$) has a unique minimizing solution
@@ -408,7 +416,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Overparamaterized curve fitting and algorithms}
+ \frametitle{Linear least squares}
+ \framesubtitle{Gradient descent and implicit regularization}
 \begin{columns}
 \begin{column}{0.5\textwidth}
 Overparameterized fits found with gradient descent algorithm: take small
@@ -439,7 +448,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -462,7 +472,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \centering
 \Large\textbf{Polynomial basis}\normalsize
@@ -553,7 +564,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \centering
 \Large\textbf{Absolute value basis\vphantom{y}}\normalsize
@@ -644,7 +656,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: choice of basis}
+ \frametitle{Linear least squares}
+ \framesubtitle{Choice of basis}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -667,7 +680,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -680,7 +694,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
 \begin{columns}
 \begin{column}{0.25\textwidth}
 \centering
@@ -766,7 +781,8 @@
 \end{frame}
 
 \begin{frame}
- \frametitle{Curve fitting: sparseness and level of noise}
+ \frametitle{Linear least squares}
+ \framesubtitle{Sparseness and level of noise}
 
 \begin{columns}
 \begin{column}{0.5\textwidth}
@@ -783,6 +799,69 @@
 \end{columns}
 \end{frame}
 
+\begin{frame}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ Gradient descent produces poor solutions to many machine learning problems
+
+ \bigskip
+
+ \textbf{BUT:} no one uses gradient descent
+
+ \bigskip
+
+ \emph{Stochastic} gradient descent (SGD): follow an approximate gradient of $\chi^2$ computed using small subsets (batches) of the data
+
+ \bigskip
+
+ The approximate gradient takes \emph{fewer} steps to find \emph{better} solutions
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\begin{frame}
+ \begin{columns}
+ \begin{column}{0.9\textwidth}
+ \begin{overprint}
+ \onslide<1>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_1.png}
+ \onslide<2>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_2.png}
+ \onslide<3>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_3.png}
+ \onslide<4>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_4.png}
+ \onslide<5>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_5.png}
+ \end{overprint}
+ \end{column}
+ \end{columns}
+\end{frame}
+
+\begin{frame}
+ \centering
+ \begin{tikzpicture}
+ \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}};
+ \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}};
+ \draw (-4,2) node[align=center] {Neural networks\\are good bases\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}};
+ \draw (-4,-2) node[align=center] {Data is sparse\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}};
+ \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}};
+ \end{tikzpicture}
+\end{frame}
+
+\begin{frame}
+ \frametitle{Machine learning is just \emph{non}linear least squares}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ The structure and geometry of the manifold of ``perfect'' solutions are integral to understanding overparameterized fits
+
+ \medskip
+
+ \textbf{BUT:} extremely little is known outside of the linear case
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \end{column}
+ \end{columns}
+\end{frame}
+
 \begin{frame}
 \frametitle{The Euler characteristic \boldmath{$\chi$}}
 \begin{columns}
--
cgit v1.2.3-70-g09d2
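
A worked formula, to pin down the objective the slides keep calling $\chi^2$. The symbols $x_i$, $y_i$, and the basis functions $f_j$ are this note's assumptions, not notation taken from the patch:

\[
  \chi^2(a) = \sum_{i=1}^{M} \left( y_i - \sum_{j=1}^{N} a_j f_j(x_i) \right)^2
\]

"Linear" means the fit is linear in the parameters $a_j$, so $\chi^2$ is quadratic and its minimizers can be characterized exactly. The machine-learning case replaces the sum $\sum_j a_j f_j(x_i)$ with a function that depends nonlinearly on the parameters, which is the sense in which machine learning is \emph{non}linear least squares.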
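A minimal numerical sketch (not part of the patch; all names and values are illustrative) of the two claims on the overparameterized slides: with more parameters N than data points M the perfect fits form a manifold rather than a unique point, and gradient descent started from zero implicitly regularizes by landing on the minimum-norm point of that manifold.

import numpy as np

rng = np.random.default_rng(0)
M, N = 5, 20                      # M data points, N parameters: overparameterized
A = rng.standard_normal((M, N))   # basis functions evaluated at the data points
y = rng.standard_normal(M)        # observations

# Every a with A a = y fits perfectly, and adding any vector from the
# (N - M)-dimensional null space of A gives another perfect fit: a
# manifold of solutions, not a unique minimizer.
a_min = np.linalg.pinv(A) @ y     # the minimum-norm perfect fit

# Gradient descent on chi^2(a) = |y - A a|^2 from a = 0 never leaves the
# row space of A, so it converges to that same minimum-norm solution.
a = np.zeros(N)
eta = 1e-2                        # small step size
for _ in range(20000):
    a -= eta * 2 * A.T @ (A @ a - y)

print(np.allclose(A @ a, y))      # True: gradient descent fit the data perfectly
print(np.allclose(a, a_min))      # True: and picked the minimum-norm solution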
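A companion sketch of the SGD slide under the same assumptions: each step follows a gradient of $\chi^2$ estimated from a small random batch of the data rather than the full sum. It illustrates the mechanics only and does not reproduce the comparisons in figs/gradient_vs_sgd_*.png.

import numpy as np

rng = np.random.default_rng(1)
M, N, B = 100, 200, 10            # M data points, N parameters, batch size B
A = rng.standard_normal((M, N))
y = rng.standard_normal(M)

a = np.zeros(N)
eta = 1e-3                        # step size
for _ in range(20000):
    batch = rng.choice(M, size=B, replace=False)  # small random subset of the data
    r = A[batch] @ a - y[batch]                   # residuals on the batch only
    a -= eta * 2 * A[batch].T @ r                 # approximate gradient step

# In the overparameterized regime the model can interpolate, so the batch
# gradients can all vanish together and SGD drives the full chi^2 toward zero.
print(np.mean((A @ a - y) ** 2))  # training error per data point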