Diffstat (limited to 'ictp-saifr_colloquium.tex')
-rw-r--r-- | ictp-saifr_colloquium.tex | 150
1 file changed, 146 insertions, 4 deletions
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 2d621f2..dc152f8 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -227,7 +227,7 @@
 \begin{frame}
   \frametitle{Machine learning is just curve fitting}
   \begin{columns}
-    \begin{column}{0.8\textwidth}
+    \begin{column}{0.9\textwidth}
       Number of data points $M$ is big: all images on the internet

       \medskip
@@ -243,15 +243,157 @@

       \medskip

-      $\chi^2(\pmb a\mid\text{data})$ is called \emph{cost} or \emph{objective function}
+      $\chi^2(\pmb a\mid\text{data})$ is called the \emph{cost} or \emph{loss function}

       \medskip

-      $\chi^2(\pmb a^*\mid\text{data})$ is call the \emph{training error}
+      $\chi^2(\pmb a^*\mid\text{data})$ is called the \emph{training error}

       \medskip

-      MSE is called \emph{test} or \emph{generalization error}
+      MSE is called the \emph{test} or \emph{generalization error}
+
+      \bigskip
+
+      \textbf{BUT:} machine learning uses many more parameters $N$ than data points $M$
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+  \begin{columns}
+    \begin{column}{0.25\textwidth}
+      \centering
+      $M=40$, $N=2$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf}
+
+      \bigskip
+
+      Underfit
+
+      \smallskip
+
+      $\chi^2$ is large
+
+      \smallskip
+
+      Best fit has \emph{high bias} \phantom{variance}
+    \end{column}
+    \begin{column}{0.25\textwidth}
+      \centering
+      $M=40$, $N=7$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf}
+
+      \bigskip
+
+      Good fit!
+
+      \smallskip
+
+      $\chi^2$ is moderate
+
+      \smallskip
+
+      Best fit has \emph{low variance} and \emph{low bias}
+    \end{column}
+    \begin{column}{0.25\textwidth}
+      \centering
+      $M=40$, $N=40$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf}
+
+      \bigskip
+
+      Overfit
+
+      \smallskip
+
+      $\chi^2$ is zero
+
+      \smallskip
+
+      Best fit has \emph{high variance}
+    \end{column}
+    \begin{column}{0.25\textwidth}
+      \centering
+      $M=40$, $N=80$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
+
+      \bigskip
+
+      Good fit?
+
+      \smallskip
+
+      $\chi^2$ is zero
+
+      \smallskip
+
+      Best fit has \emph{low variance} and \emph{low bias}
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Curve fitting: the bad, the good, the ugly, and the weird}
+
+  \begin{columns}
+    \begin{column}{0.5\textwidth}
+      \centering
+      $M=40$, $N=80$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
+    \end{column}
+    \begin{column}{0.5\textwidth}
+      \centering
+      \includegraphics[width=\textwidth]{figs/fit_bias-variance2_poly.pdf}
+
+      \bigskip
+
+      Bias--variance trade-off is blown up!
+    \end{column}
+  \end{columns}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Overparameterized curve fitting}
+  \begin{columns}
+    \begin{column}{0.5\textwidth}
+      Underparameterized fitting ($M>N$) has a unique minimizing solution
+
+      \medskip
+
+      Overparameterized fits are not unique: the $M$ constraints
+      \[
+        0=y_i-\hat f(x_i\mid\pmb a)\qquad\text{for all $1\leq i\leq M$}
+      \]
+      on the $N$ unknowns $\pmb a=[a_1,\ldots,a_N]$ leave a solution manifold of dimension $N-M$
+
+      \medskip
+
+      What leads to the `good' solutions instead of `bad' ones?
+    \end{column}
+    \begin{column}{0.5\textwidth}
+      \centering
+      $M=40$, $N=80$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
+    \end{column}
+  \end{columns}
+\end{frame}
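
The numbers on the new slides are easy to reproduce. Below is a minimal numpy sketch of the four fits ($M=40$ points, $N=2,7,40,80$ parameters); it is not the script behind the figs/fit_*_poly.pdf figures, and the target function, noise level, and Legendre polynomial basis are assumptions, since the slides only fix M and N.

# Minimal sketch of the four fits from the slides; the target function,
# noise level, and Legendre basis are assumptions, not taken from the deck.
import numpy as np
from numpy.polynomial.legendre import legvander

rng = np.random.default_rng(0)
M = 40                                       # number of data points
x = np.sort(rng.uniform(-1, 1, M))           # training inputs
y = np.cos(3 * np.pi * x / 2) + 0.2 * rng.normal(size=M)  # noisy samples

x_test = np.linspace(-1, 1, 1000)            # dense grid for the test error
y_test = np.cos(3 * np.pi * x_test / 2)

for N in (2, 7, 40, 80):                     # underfit, good, overfit, overparameterized
    A = legvander(x, N - 1)                  # M x N design matrix
    # lstsq minimizes chi^2; for N > M it returns the minimum-norm
    # exact fit, one point on the (N - M)-dimensional solution manifold
    a, *_ = np.linalg.lstsq(A, y, rcond=None)
    chi2 = np.sum((A @ a - y) ** 2)          # training error
    mse = np.mean((legvander(x_test, N - 1) @ a - y_test) ** 2)
    print(f"N = {N:3d}: training chi^2 = {chi2:8.3f}, test MSE = {mse:8.3f}")

For $N>M$ the minimum-norm solution that lstsq picks is only one, implicitly regularized, choice among the zero-training-error fits the last slide asks about, so how `good' the $N=80$ curve looks depends on that implicit bias and on the basis.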
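
The manifold claim on the last slide can also be checked directly: with $M=40$ constraints on $N=80$ unknowns the design matrix has a 40-dimensional null space, and adding any null-space vector to an exact fit leaves $\chi^2$ at zero. A self-contained sketch under the same assumptions as above:

# Walking along the solution manifold: any null-space perturbation of an
# exact fit still interpolates the data but changes the fitted curve.
import numpy as np
from numpy.polynomial.legendre import legvander

rng = np.random.default_rng(0)
M, N = 40, 80
x = np.sort(rng.uniform(-1, 1, M))
y = np.cos(3 * np.pi * x / 2) + 0.2 * rng.normal(size=M)

A = legvander(x, N - 1)                          # M x N, generically rank M
a_star, *_ = np.linalg.lstsq(A, y, rcond=None)   # minimum-norm exact fit

# The last N - M right singular vectors of A span its null space: A @ v = 0.
_, _, Vt = np.linalg.svd(A)
null_basis = Vt[M:]                              # (N - M) x N

a_other = a_star + 5.0 * null_basis[0]           # another point on the manifold
print(np.allclose(A @ a_star, y), np.allclose(A @ a_other, y))  # True True
print(np.linalg.norm(a_star), np.linalg.norm(a_other))  # min-norm fit is smaller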