author     Jaron Kent-Dobias <jaron@kent-dobias.com>   2025-02-12 22:06:09 -0300
committer  Jaron Kent-Dobias <jaron@kent-dobias.com>   2025-02-12 22:06:09 -0300
commit     70aa0dc5a377b8a072ba96b18b9f6adc760df9b5 (patch)
tree       87a0d408959cfa6ad28864d6cb9096e861cdd39e
parent     ae56a721eb63e6e2032e10436a4944d9203aa560 (diff)
More writing and figure creation.
-rw-r--r--   figs/gradient_vs_sgd_1.png   bin   0 -> 75373 bytes
-rw-r--r--   figs/gradient_vs_sgd_2.png   bin   0 -> 76257 bytes
-rw-r--r--   figs/gradient_vs_sgd_3.png   bin   0 -> 97379 bytes
-rw-r--r--   figs/gradient_vs_sgd_4.png   bin   0 -> 135879 bytes
-rw-r--r--   figs/gradient_vs_sgd_5.png   bin   0 -> 156623 bytes
-rw-r--r--   gradient_vs_sgd.xcf          bin   0 -> 249706 bytes
-rw-r--r--   ictp-saifr_colloquium.tex    111
7 files changed, 95 insertions, 16 deletions
diff --git a/figs/gradient_vs_sgd_1.png b/figs/gradient_vs_sgd_1.png
new file mode 100644
index 0000000..12bc5c1
--- /dev/null
+++ b/figs/gradient_vs_sgd_1.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_2.png b/figs/gradient_vs_sgd_2.png
new file mode 100644
index 0000000..a4cd977
--- /dev/null
+++ b/figs/gradient_vs_sgd_2.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_3.png b/figs/gradient_vs_sgd_3.png
new file mode 100644
index 0000000..1883cea
--- /dev/null
+++ b/figs/gradient_vs_sgd_3.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_4.png b/figs/gradient_vs_sgd_4.png
new file mode 100644
index 0000000..f3b9c51
--- /dev/null
+++ b/figs/gradient_vs_sgd_4.png
Binary files differ
diff --git a/figs/gradient_vs_sgd_5.png b/figs/gradient_vs_sgd_5.png
new file mode 100644
index 0000000..faab526
--- /dev/null
+++ b/figs/gradient_vs_sgd_5.png
Binary files differ
diff --git a/gradient_vs_sgd.xcf b/gradient_vs_sgd.xcf
new file mode 100644
index 0000000..13f95e0
--- /dev/null
+++ b/gradient_vs_sgd.xcf
Binary files differ
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 92b4904..1d823a7 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -12,6 +12,7 @@
 \usepackage{pifont}
 \usepackage{graphicx}
 \usepackage{xcolor}
+\usepackage{tikz}
 
 \definecolor{ictpblue}{HTML}{0471b9}
 \definecolor{ictpgreen}{HTML}{0c8636}
@@ -59,7 +60,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the bad, the good, and the ugly}
+  \frametitle{Linear least squares}
+  \framesubtitle{The bad, the good, and the ugly}
 
   \begin{columns}
     \begin{column}{0.4\textwidth}
@@ -81,7 +83,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the bad, the good, and the ugly}
+  \frametitle{Linear least squares}
+  \framesubtitle{The bad, the good, and the ugly}
 
   \begin{columns}
     \begin{column}{0.4\textwidth}
@@ -109,7 +112,8 @@
   \end{columns}
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the bad, the good, and the ugly}
+  \frametitle{Linear least squares}
+  \framesubtitle{The bad, the good, and the ugly}
 
   \begin{columns}
     \begin{column}{0.333\textwidth}
@@ -188,7 +192,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the bad, the good, and the ugly}
+  \frametitle{Linear least squares}
+  \framesubtitle{The bad, the good, and the ugly}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
@@ -224,7 +229,7 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Machine learning is just curve fitting}
+  \frametitle{Machine learning is just \emph{non}linear least squares}
 
   \begin{columns}
     \begin{column}{0.9\textwidth}
@@ -261,7 +266,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+  \frametitle{Linear least squares}
+  \framesubtitle{The bad, the good, the ugly, and the weird}
 
   \begin{columns}
     \begin{column}{0.25\textwidth}
       \centering
@@ -347,7 +353,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+  \frametitle{Linear least squares}
+  \framesubtitle{The bad, the good, the ugly, and the weird}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
@@ -370,7 +377,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Overparamaterized curve fitting}
+  \frametitle{Linear least squares}
+  \framesubtitle{Overparameterized solutions are not unique}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
       Underparameterized fitting ($M>N$) has a unique minimizing solution
@@ -408,7 +416,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Overparamaterized curve fitting and algorithms}
+  \frametitle{Linear least squares}
+  \framesubtitle{Gradient descent and implicit regularization}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
       Overparameterized fits found with gradient descent algorithm: take small
@@ -439,7 +448,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: choice of basis}
+  \frametitle{Linear least squares}
+  \framesubtitle{Choice of basis}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
@@ -462,7 +472,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: choice of basis}
+  \frametitle{Linear least squares}
+  \framesubtitle{Choice of basis}
 
   \centering
   \Large\textbf{Polynomial basis}\normalsize
@@ -553,7 +564,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: choice of basis}
+  \frametitle{Linear least squares}
+  \framesubtitle{Choice of basis}
 
   \centering
   \Large\textbf{Absolute value basis\vphantom{y}}\normalsize
@@ -644,7 +656,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: choice of basis}
+  \frametitle{Linear least squares}
+  \framesubtitle{Choice of basis}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
@@ -667,7 +680,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: sparseness and level of noise}
+  \frametitle{Linear least squares}
+  \framesubtitle{Sparseness and level of noise}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
@@ -680,7 +694,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: sparseness and level of noise}
+  \frametitle{Linear least squares}
+  \framesubtitle{Sparseness and level of noise}
 
   \begin{columns}
     \begin{column}{0.25\textwidth}
       \centering
@@ -766,7 +781,8 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: sparseness and level of noise}
+  \frametitle{Linear least squares}
+  \framesubtitle{Sparseness and level of noise}
 
   \begin{columns}
     \begin{column}{0.5\textwidth}
@@ -784,6 +800,69 @@
 \end{frame}
 
 \begin{frame}
+  \frametitle{Machine learning is just \emph{non}linear least squares}
+  \begin{columns}
+    \begin{column}{0.5\textwidth}
+      Gradient descent produces poor solutions to many machine learning problems
+
+      \bigskip
+
+      \textbf{BUT:} no one uses gradient descent
+
+      \bigskip
+
+      \emph{Stochastic} gradient descent (SGD): follow approximated gradient of $\chi^2$ calculated using small subsets (batches) of the data
+
+      \bigskip
+
+      Approximated gradient takes \emph{fewer} steps to find \emph{better} solutions
+    \end{column}
+    \begin{column}{0.5\textwidth}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}
+  \begin{columns}
+    \begin{column}{0.9\textwidth}
+      \begin{overprint}
+        \onslide<1>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_1.png}
+        \onslide<2>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_2.png}
+        \onslide<3>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_3.png}
+        \onslide<4>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_4.png}
+        \onslide<5>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_5.png}
+      \end{overprint}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}
+  \centering
+  \begin{tikzpicture}
+    \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}};
+    \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}};
+    \draw (-4,2) node[align=center] {Neural networks\\are good bases\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}};
+    \draw (-4,-2) node[align=center] {Data is sparse\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}};
+    \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}};
+  \end{tikzpicture}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Machine learning is just \emph{non}linear least squares}
+  \begin{columns}
+    \begin{column}{0.5\textwidth}
+      Structure and geometry of manifold of "perfect" solutions integral to understanding overparameterized fits
+
+      \medskip
+
+      \textbf{BUT:} extremely little is known outside of the linear case
+    \end{column}
+    \begin{column}{0.5\textwidth}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}
   \frametitle{The Euler characteristic \boldmath{$\chi$}}
   \begin{columns}
     \begin{column}{0.5\textwidth}
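For reference, a minimal Python sketch of the comparison the new slides set up: full-batch gradient descent versus minibatch SGD on an overparameterized least-squares fit with an absolute-value basis, as in the slides' figures. The data set, number of basis functions, step size, batch size, and step count below are illustrative assumptions, not values taken from the slides or the repository.

```python
# Illustrative sketch (not code from this repository): compare full-batch
# gradient descent with minibatch stochastic gradient descent on an
# overparameterized linear least-squares fit. All sizes and rates are
# hypothetical choices for the demo.
import numpy as np

rng = np.random.default_rng(0)

# Sparse, noisy data: N points, fit with M > N absolute-value basis functions.
N, M = 10, 50
x = np.sort(rng.uniform(-1, 1, N))
y = np.abs(x) + 0.1 * rng.normal(size=N)       # noisy samples of |x|

centers = rng.uniform(-1, 1, M)
A = np.abs(x[:, None] - centers[None, :])      # design matrix, N x M

def chi2(a):
    r = A @ a - y
    return r @ r

def grad(a, rows=slice(None)):
    # Gradient of chi^2 restricted to a subset (batch) of the data rows.
    Ab, yb = A[rows], y[rows]
    return 2 * Ab.T @ (Ab @ a - yb)

def descend(batch_size=None, eta=1e-3, steps=20000):
    a = np.zeros(M)                            # start from the origin
    for _ in range(steps):
        if batch_size is None:
            rows = slice(None)                 # full-batch gradient descent
        else:
            rows = rng.choice(N, batch_size, replace=False)  # SGD minibatch
        a -= eta * grad(a, rows)
    return a

a_gd = descend()                               # gradient descent
a_sgd = descend(batch_size=2)                  # stochastic gradient descent

print("chi^2  GD:", chi2(a_gd), " SGD:", chi2(a_sgd))
print("|a|    GD:", np.linalg.norm(a_gd), " SGD:", np.linalg.norm(a_sgd))
```

Because the model is linear and the iteration starts from zero, full-batch gradient descent converges toward the minimum-norm interpolating solution, which is the implicit regularization the revised slides refer to; the SGD run only illustrates the batching mechanics, not a claim about which solution it selects.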