Diffstat (limited to 'ictp-saifr_colloquium.tex')
-rw-r--r--  ictp-saifr_colloquium.tex  150
1 file changed, 146 insertions, 4 deletions
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 2d621f2..dc152f8 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -227,7 +227,7 @@
\frametitle{Machine learning is just curve fitting}
\begin{columns}
- \begin{column}{0.8\textwidth}
+ \begin{column}{0.9\textwidth}
Number of data points $M$ is big: all images on the internet
\medskip
@@ -243,15 +243,157 @@
\medskip
- $\chi^2(\pmb a\mid\text{data})$ is called \emph{cost} or \emph{objective function}
+ $\chi^2(\pmb a\mid\text{data})$ is called the \emph{cost} or \emph{loss function}
\medskip
- $\chi^2(\pmb a^*\mid\text{data})$ is call the \emph{training error}
+ $\chi^2(\pmb a^*\mid\text{data})$ is called the \emph{training error}
\medskip
- MSE is called \emph{test} or \emph{generalization error}
+ MSE is called the \emph{test} or \emph{generalization error}
+
+ \bigskip
+
+ \textbf{BUT:} machine learning uses many more parameters $N$ than data points $M$
+ \end{column}
+ \end{columns}
+\end{frame}
+
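A quick restatement of the fitting problem those terms refer to may help; this is a sketch in the slide's own notation ($M$ data pairs $(x_i,y_i)$, $N$ parameters $\pmb a$, model $\hat f(x\mid\pmb a)$), not text from the commit:
\[
\chi^2(\pmb a\mid\text{data})=\sum_{i=1}^{M}\bigl(y_i-\hat f(x_i\mid\pmb a)\bigr)^2,
\qquad
\pmb a^*=\operatorname*{argmin}_{\pmb a}\,\chi^2(\pmb a\mid\text{data}),
\]
and the test (generalization) error is the same squared deviation averaged over data \emph{not} used in the fit, $\mathrm{MSE}=\mathbb{E}\bigl[(y-\hat f(x\mid\pmb a^*))^2\bigr]$.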
+\begin{frame}
+ \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+ \begin{columns}
+ \begin{column}{0.25\textwidth}
+ \centering
+ $M=40$, $N=2$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf}
+
+ \bigskip
+
+ Underfit
+
+ \smallskip
+
+ $\chi^2$ is large
+
+ \smallskip
+
+ Best fit has \emph{high bias} \phantom{variance}
+ \end{column}
+ \begin{column}{0.25\textwidth}
+ \centering
+ $M=40$, $N=7$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf}
+
+ \bigskip
+
+ Good fit!
+
+ \smallskip
+
+ $\chi^2$ is moderate
+
+ \smallskip
+
+ Best fit has \emph{low variance} and \emph{low bias}
+ \end{column}
+ \begin{column}{0.25\textwidth}
+ \centering
+ $M=40$, $N=40$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf}
+
+ \bigskip
+
+ Overfit
+
+ \smallskip
+
+ $\chi^2$ is zero
+
+ \smallskip
+
+ Best fit has \emph{high variance}
+ \end{column}
+ \begin{column}{0.25\textwidth}
+ \centering
+ $M=40$, $N=80$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
+
+ \bigskip
+
+ Good fit?
+
+ \smallskip
+
+ $\chi^2$ is zero
+
+ \smallskip
+
+ Best fit has \emph{low variance} and \emph{low bias}
+ \end{column}
+ \end{columns}
+\end{frame}
+
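The panel figures are named fit_*_poly.pdf, which suggests a polynomial model linear in its $N$ coefficients; under that assumption (mine, not stated on the slide) each panel is an ordinary least-squares problem and the quoted $\chi^2$ values follow from the shape of the design matrix:
\[
\hat f(x\mid\pmb a)=\sum_{j=1}^{N}a_j\,x^{\,j-1},
\qquad
\chi^2(\pmb a)=\lVert\pmb y-X\pmb a\rVert^2,
\qquad
X_{ij}=x_i^{\,j-1}.
\]
For $N<M$ the system is overdetermined and the residual is generically nonzero; at $N=M=40$ the matrix $X$ is square and, for distinct $x_i$, invertible, so the fit interpolates every data point and $\chi^2=0$; for $N=80>M$ the system is underdetermined and $\chi^2=0$ is attained by a whole family of coefficient vectors.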
+\begin{frame}
+ \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ \centering
+ $M=40$, $N=80$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{figs/fit_bias-variance2_poly.pdf}
+
+ \bigskip
+
+ Bias--variance trade-off is blown up!
+ \end{column}
+ \end{columns}
+\end{frame}
+
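The bias and variance labels above, and the trade-off said to be blown up on this slide, refer to the standard decomposition of the test error; as a reminder (a textbook identity, not part of the commit), for data generated as $y=f(x)+\varepsilon$ with noise variance $\sigma^2$,
\[
\mathbb{E}\bigl[(y-\hat f(x\mid\pmb a^*))^2\bigr]
=\underbrace{\bigl(f(x)-\mathbb{E}[\hat f(x\mid\pmb a^*)]\bigr)^2}_{\text{bias}^2}
+\underbrace{\operatorname{Var}\bigl[\hat f(x\mid\pmb a^*)\bigr]}_{\text{variance}}
+\sigma^2,
\]
with the expectations taken over draws of the training set. In the classical picture bias falls and variance grows as $N$ increases; the figure presumably shows the test error coming back down once $N$ exceeds $M$, which is what makes the trade-off break down.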
+\begin{frame}
+ \frametitle{Overparameterized curve fitting}
+ \begin{columns}
+ \begin{column}{0.5\textwidth}
+ Underparameterized fitting ($M>N$) has a unique minimizing solution
+
+ \medskip
+
+ Overparameterized fits are not unique: the $M$ constraints
+ \[
+ 0=y_i-\hat f(x_i\mid\pmb a)\qquad\text{for all $1\leq i\leq M$}
+ \]
+ on the $N$ unknowns $\pmb a=[a_1,\ldots,a_N]$ leave a solution manifold of dimension $N-M$
+
+ \medskip
+
+ What leads to the `good' solutions instead of `bad' ones?
+ \end{column}
+ \begin{column}{0.5\textwidth}
+ \centering
+ $M=40$, $N=80$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
\end{column}
\end{columns}
\end{frame}
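One standard way to make the closing question concrete, for a model that is linear in $\pmb a$ (my assumption; the slide leaves the question open), is to note that among the zero-training-error fits on that $(N-M)$-dimensional manifold there is a canonical one, the minimum-norm solution
\[
\pmb a^*=X^{+}\pmb y=\operatorname*{argmin}_{X\pmb a=\pmb y}\ \lVert\pmb a\rVert,
\]
with $X$ the design matrix of the linear model and $X^{+}$ its Moore--Penrose pseudoinverse; gradient descent on $\chi^2$ started from $\pmb a=\pmb 0$ converges to exactly this point, so the optimizer itself expresses a preference among the interpolating fits. This is one textbook answer for linear least squares, not necessarily the one the talk goes on to develop.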