diff options
author | Jaron Kent-Dobias <jaron@kent-dobias.com> | 2025-02-11 12:23:44 -0300 |
---|---|---|
committer | Jaron Kent-Dobias <jaron@kent-dobias.com> | 2025-02-11 12:23:44 -0300 |
commit | 01a22225f2d207f04df595290e0e5c742a29ccee (patch) | |
tree | c7a885772aeaacd77978775bc1ebe9d50f9e9668 /ictp-saifr_colloquium.tex | |
parent | 64a3acf60804cfa2e504f695526c75c625640973 (diff) | |
download | ictp-saifr_colloquium-01a22225f2d207f04df595290e0e5c742a29ccee.tar.gz ictp-saifr_colloquium-01a22225f2d207f04df595290e0e5c742a29ccee.tar.bz2 ictp-saifr_colloquium-01a22225f2d207f04df595290e0e5c742a29ccee.zip |
More work on presentation
Diffstat (limited to 'ictp-saifr_colloquium.tex')
-rw-r--r-- | ictp-saifr_colloquium.tex | 150 |
1 file changed, 146 insertions, 4 deletions
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex index 2d621f2..dc152f8 100644 --- a/ictp-saifr_colloquium.tex +++ b/ictp-saifr_colloquium.tex @@ -227,7 +227,7 @@ \frametitle{Machine learning is just curve fitting} \begin{columns} - \begin{column}{0.8\textwidth} + \begin{column}{0.9\textwidth} Number of data points $M$ is big: all images on the internet \medskip @@ -243,15 +243,157 @@ \medskip - $\chi^2(\pmb a\mid\text{data})$ is called \emph{cost} or \emph{objective function} + $\chi^2(\pmb a\mid\text{data})$ is called the \emph{cost} or \emph{loss function} \medskip - $\chi^2(\pmb a^*\mid\text{data})$ is call the \emph{training error} + $\chi^2(\pmb a^*\mid\text{data})$ is called the \emph{training error} \medskip - MSE is called \emph{test} or \emph{generalization error} + MSE is called the \emph{test} or \emph{generalization error} + + \bigskip + + \textbf{BUT:} machine learning uses many more parameters $N$ than data points $M$ + \end{column} + \end{columns} +\end{frame} + +\begin{frame} + \frametitle{Curve fitting: the bad, the good, the ugly, and the weird} + \begin{columns} + \begin{column}{0.25\textwidth} + \centering + $M=40$, $N=2$ + + \bigskip + + \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf} + + \bigskip + + Underfit + + \smallskip + + $\chi^2$ is large + + \smallskip + + Best fit has \emph{high bias} \phantom{variance} + \end{column} + \begin{column}{0.25\textwidth} + \centering + $M=40$, $N=7$ + + \bigskip + + \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf} + + \bigskip + + Good fit! + + \smallskip + + $\chi^2$ is moderate + + \smallskip + + Best fit has \emph{low variance} and \emph{low bias} + \end{column} + \begin{column}{0.25\textwidth} + \centering + $M=40$, $N=40$ + + \bigskip + + \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf} + + \bigskip + + Overfit + + \smallskip + + $\chi^2$ is zero + + \smallskip + + Best fit has \emph{high variance} + \end{column} + \begin{column}{0.25\textwidth} + \centering + $M=40$, $N=80$ + + \bigskip + + \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf} + + \bigskip + + Good fit? + + \smallskip + + $\chi^2$ is zero + + \smallskip + + Best fit has \emph{low variance} and \emph{low bias} + \end{column} + \end{columns} +\end{frame} + +\begin{frame} + \frametitle{Curve fitting: the bad, the good, the ugly, and the weird} + + \begin{columns} + \begin{column}{0.5\textwidth} + \centering + $M=40$, $N=80$ + + \bigskip + + \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf} + \end{column} + \begin{column}{0.5\textwidth} + \centering + \includegraphics[width=\textwidth]{figs/fit_bias-variance2_poly.pdf} + + \bigskip + + Bias--variance trade-off is blown up! + \end{column} + \end{columns} +\end{frame} + +\begin{frame} + \frametitle{Overparamaterized curve fitting} + \begin{columns} + \begin{column}{0.5\textwidth} + Underparameterized fitting ($M>N$) has a unique minimizing solution + + \medskip + + Overparameterized fits are not unique: $M$ constraints + \[ + 0=y_i-\hat f(x_i\mid\pmb a)\qquad\text{for all $1\leq i\leq M$} + \] + plus $N$ unknowns $\pmb a=[a_1,\ldots, a_N]$ gives a manifold of $N-M$ dimensions + + \medskip + + What leads to the `good' solutions instead of `bad' ones? + \end{column} + \begin{column}{0.5\textwidth} + \centering + $M=40$, $N=80$ + + \bigskip + + \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf} \end{column} \end{columns} \end{frame} |