Diffstat (limited to 'ictp-saifr_colloquium.tex')
-rw-r--r-- | ictp-saifr_colloquium.tex | 183
1 file changed, 175 insertions, 8 deletions
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 83c3226..8611f15 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -1,4 +1,4 @@
-\documentclass[aspectratio=169,usenames,dvipsnames,fleqn]{beamer}
+\documentclass[aspectratio=169,usenames,dvipsnames]{beamer}
 
 \setbeamerfont{title}{family=\bf}
 \setbeamerfont{frametitle}{family=\bf}
@@ -59,27 +59,194 @@
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the good, the bad, and the weird}
+  \frametitle{Curve fitting: the bad, the good, and the ugly}
 
   \begin{columns}
-    \begin{column}{0.5\textwidth}
+    \begin{column}{0.4\textwidth}
+      You have $M$ data points $(x_1,y_1),\ldots,(x_M,y_M)$
+
+      \bigskip
+
+      Perhaps a noisy sample of a ground truth function
+      $y_i=f(x_i)+\xi$
+    \end{column}
+    \begin{column}{0.6\textwidth}
+      \begin{overprint}
+        \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_data.pdf}
+        \onslide<2>\includegraphics[width=\columnwidth]{figs/fit_data_truth.pdf}
+        \onslide<3>\includegraphics[width=\columnwidth]{figs/fit_data.pdf}
+      \end{overprint}
     \end{column}
   \end{columns}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Curve fitting: the bad, the good, and the ugly}
+
   \begin{columns}
-    \begin{column}{0.5\textwidth}
+    \begin{column}{0.4\textwidth}
+      Pick a basis of $N$ functions $b_1(x), \ldots, b_N(x)$
+
+      \bigskip
+
+      Approximate the ground truth
+      \[
+        \hat f(x\mid a_1,\ldots, a_N)=\sum_{j=1}^Na_jb_j(x)
+      \]
+
+      Find $a_1, \ldots, a_N$ minimizing
+      \[
+        \chi^2
+        =\sum_{i=1}^M\left(y_i-\sum_{j=1}^Na_jb_j(x_i)\right)^2
+      \]
+    \end{column}
+    \begin{column}{0.6\textwidth}
+      \begin{overprint}
+        \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_basis_poly.pdf}
+      \end{overprint}
+    \end{column}
+  \end{columns}
+\end{frame}
+\begin{frame}
+  \frametitle{Curve fitting: the bad, the good, and the ugly}
+
+  \begin{columns}
+    \begin{column}{0.333\textwidth}
+      \centering
+      $M=40$, $N=2$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf}
+
+      \bigskip
+
+      Underfit
+
+      \smallskip
+
+      Too few parameters
+
+      \smallskip
+
+      $\chi^2$ is large
+
+      \smallskip
+
+      Best fit is \emph{biased}
+    \end{column}
+    \begin{column}{0.333\textwidth}
+      \centering
+      $M=40$, $N=7$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf}
+
+      \bigskip
+
+      Good fit!
+
+      \smallskip
+
+      Right number of parameters
+
+      \smallskip
+
+      $\chi^2$ is moderate
+
+      \smallskip
+
+      \vphantom{Best fit}
+    \end{column}
+    \begin{column}{0.333\textwidth}
+      \centering
+      $M=40$, $N=40$
+
+      \bigskip
+
+      \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf}
+
+      \bigskip
+
+      Overfit
+
+      \smallskip
+
+      Too many parameters
+
+      \smallskip
+
+      $\chi^2$ is zero
+
+      \smallskip
+
+      Best fit has \emph{high variance}
     \end{column}
   \end{columns}
 \end{frame}
 
 \begin{frame}
-  \frametitle{Curve fitting: the good, the bad, and the weird}
+  \frametitle{Curve fitting: the bad, the good, and the ugly}
 
   \begin{columns}
-    \begin{column}{0.33\textwidth}
+    \begin{column}{0.5\textwidth}
+      Knowing the ground truth, fit error is
+      \[
+        \text{MSE}=\int dx\left(f(x)-\sum_{j=1}^Na_jb_j(x)\right)^2
+      \]
+
+      \smallskip
+
+      Trade-off between \emph{bias} and \emph{variance}:
+      \begin{itemize}
+        \item \textbf{Bias} reflects missing qualitative features of the data
+        \item \textbf{Variance} reflects strong dependence on the noise
+      \end{itemize}
     \end{column}
-    \begin{column}{0.33\textwidth}
+    \begin{column}{0.5\textwidth}
+      \includegraphics[width=\columnwidth]{figs/fit_bias-variance_poly.pdf}
+
+      \medskip
+
+      \includegraphics[width=0.32\columnwidth]{figs/fit_underfit_poly.pdf}
+      \hfill
+      \includegraphics[width=0.32\columnwidth]{figs/fit_goodfit_poly.pdf}
+      \hfill
+      \includegraphics[width=0.32\columnwidth]{figs/fit_overfit_poly.pdf}
+
+      \smallskip\small
+
+      \hspace{2em}$N=2$ \hfill $N=7$ \hfill $N=40$ \hspace{1.2em}
     \end{column}
-    \begin{column}{0.33\textwidth}
+  \end{columns}
+\end{frame}
+
+\begin{frame}
+  \frametitle{Machine learning is just curve fitting}
+
+  \begin{columns}
+    \begin{column}{0.8\textwidth}
+      Number of data points $M$ is big: all images on the internet
+
+      \medskip
+
+      Ground truth function is unknown: probability the image contains a cat
+
+      \medskip
+
+      Fit function is a neural network:
+      \[
+        \hat f(\mathbf x\mid B_1,\ldots,B_L)=\sigma\left(B_L \sigma\left( B_{L-1}\cdots\sigma\left(B_2\sigma (B_1\mathbf x)\right)\cdots\right)\right)
+      \]
+
+      \medskip
+
+      $\chi^2$ is called \emph{training error}
+
+      \medskip
+
+      MSE is called \emph{test} or \emph{generalization error}
     \end{column}
   \end{columns}
 \end{frame}
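
A minimal numpy sketch of the fit these slides set up: M = 40 noisy samples, a polynomial basis, and coefficients chosen to minimize chi^2. The ground-truth function sin(3x), the noise level 0.1, and the sampling interval [-1, 1] are illustrative assumptions, not taken from the deck (its figures are prebuilt PDFs under figs/).

import numpy as np

rng = np.random.default_rng(0)

def f(x):
    return np.sin(3 * x)                  # hypothetical stand-in for the ground truth

M = 40                                    # number of data points, as on the slides
x = np.sort(rng.uniform(-1, 1, M))
y = f(x) + 0.1 * rng.standard_normal(M)   # y_i = f(x_i) + xi

for N in (2, 7, 40):                      # underfit, good fit, overfit
    B = np.vander(x, N, increasing=True)      # M x N design matrix, b_j(x) = x^(j-1)
    a, *_ = np.linalg.lstsq(B, y, rcond=None) # coefficients minimizing chi^2 = |y - B a|^2
    chi2 = np.sum((y - B @ a) ** 2)
    print(f"N = {N:2d}   chi^2 = {chi2:.3e}")

With N = M = 40 the design matrix is square and generically invertible, so the best fit essentially interpolates the data and chi^2 drops to numerical zero, as the third panel claims.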
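The bias-variance slide scores each fit against the known ground truth via the MSE integral. Under the same illustrative assumptions as the sketch above, this evaluates each fit on a dense grid, with the grid average standing in for the integral over x (up to the interval length).

import numpy as np

rng = np.random.default_rng(0)

def f(x):                                 # assumed ground truth, as above
    return np.sin(3 * x)

x = np.sort(rng.uniform(-1, 1, 40))
y = f(x) + 0.1 * rng.standard_normal(40)

xs = np.linspace(-1, 1, 1000)             # dense evaluation grid
for N in (2, 7, 40):
    B = np.vander(x, N, increasing=True)
    a, *_ = np.linalg.lstsq(B, y, rcond=None)
    fhat = np.vander(xs, N, increasing=True) @ a
    mse = np.mean((f(xs) - fhat) ** 2)    # grid estimate of the MSE integral
    print(f"N = {N:2d}   MSE = {mse:.3e}")

The small-N fit misses the shape of the curve (bias), the N = 40 fit chases the noise and swings wildly between samples (variance), and an intermediate N minimizes their sum, which is the trade-off plotted in figs/fit_bias-variance_poly.pdf.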
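The last frame's fit function is a chain of linear maps and elementwise nonlinearities. Here is a tiny forward pass in the same nested form, sigma(B_L sigma(... sigma(B_1 x))); the layer widths and the choice sigma = tanh are assumptions for illustration, since the slide leaves sigma and the B_l unspecified (and real networks usually also carry bias vectors, omitted here).

import numpy as np

rng = np.random.default_rng(1)

def sigma(z):
    return np.tanh(z)                     # assumed elementwise nonlinearity

# Hypothetical widths: input dimension 4, two hidden layers, scalar output.
widths = [4, 8, 8, 1]
Bs = [rng.standard_normal((m, n)) for n, m in zip(widths, widths[1:])]

def fhat(x, Bs):
    # sigma(B_L sigma(B_{L-1} ... sigma(B_1 x) ...))
    for B in Bs:
        x = sigma(B @ x)
    return x

print(fhat(rng.standard_normal(4), Bs))   # output of shape (1,)

Training then means minimizing the training error (the slides' chi^2) over the entries of the B_l, while the quantity one actually cares about is the test or generalization error, the analogue of the MSE above, measured on data the fit has not seen.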