Diffstat (limited to 'ictp-saifr_colloquium.tex')
 ictp-saifr_colloquium.tex | 183 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 175 insertions(+), 8 deletions(-)
diff --git a/ictp-saifr_colloquium.tex b/ictp-saifr_colloquium.tex
index 83c3226..8611f15 100644
--- a/ictp-saifr_colloquium.tex
+++ b/ictp-saifr_colloquium.tex
@@ -1,4 +1,4 @@
-\documentclass[aspectratio=169,usenames,dvipsnames,fleqn]{beamer}
+\documentclass[aspectratio=169,usenames,dvipsnames]{beamer}
\setbeamerfont{title}{family=\bf}
\setbeamerfont{frametitle}{family=\bf}
@@ -59,27 +59,194 @@
\end{frame}
\begin{frame}
- \frametitle{Curve fitting: the good, the bad, and the weird}
+ \frametitle{Curve fitting: the bad, the good, and the ugly}
\begin{columns}
- \begin{column}{0.5\textwidth}
+ \begin{column}{0.4\textwidth}
+ You have $M$ data points $(x_1,y_1),\ldots,(x_M,y_M)$
+
+ \bigskip
+
+ Perhaps a noisy sample of a ground truth function
+ $y_i=f(x_i)+\xi_i$, where $\xi_i$ is noise
+ \end{column}
+ \begin{column}{0.6\textwidth}
+ \begin{overprint}
+ \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_data.pdf}
+ \onslide<2>\includegraphics[width=\columnwidth]{figs/fit_data_truth.pdf}
+ \onslide<3>\includegraphics[width=\columnwidth]{figs/fit_data.pdf}
+ \end{overprint}
\end{column}
\end{columns}
+\end{frame}
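+
+% Not part of the talk: a minimal numpy sketch of this data model, assuming a
+% ground truth f(x) = sin(2*pi*x) and Gaussian noise (both hypothetical; the
+% actual f and noise level behind the figures are not specified here).
+%
+%   import numpy as np
+%
+%   rng = np.random.default_rng(0)
+%   M = 40                                 # number of data points
+%   f = lambda x: np.sin(2 * np.pi * x)    # assumed ground truth
+%   x = np.sort(rng.uniform(0, 1, M))      # sample points x_1, ..., x_M
+%   y = f(x) + 0.1 * rng.normal(size=M)    # y_i = f(x_i) + xi_i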
+
+\begin{frame}
+ \frametitle{Curve fitting: the bad, the good, and the ugly}
+
\begin{columns}
- \begin{column}{0.5\textwidth}
+ \begin{column}{0.4\textwidth}
+ Pick a basis of $N$ functions $b_1(x), \ldots, b_N(x)$
+
+ \bigskip
+
+ Approximate the ground truth
+ \[
+ \hat f(x\mid a_1,\ldots, a_N)=\sum_{j=1}^Na_jb_j(x)
+ \]
+
+ Find $a_1, \ldots, a_N$ minimizing
+ \[
+ \chi^2
+ =\sum_{i=1}^M\left(y_i-\sum_{j=1}^Na_jb_j(x_i)\right)^2
+ \]
+ \end{column}
+ \begin{column}{0.6\textwidth}
+ \begin{overprint}
+ \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_basis_poly.pdf}
+ \end{overprint}
+ \end{column}
+ \end{columns}
+\end{frame}
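+
+% Not part of the talk: since \hat f is linear in the a_j, minimizing \chi^2
+% is a linear least-squares problem, solved by the normal equations
+% (B^T B) a = B^T y with design matrix B_{ij} = b_j(x_i). A numpy sketch,
+% assuming the monomial basis b_j(x) = x^{j-1}:
+%
+%   import numpy as np
+%
+%   def lstsq_fit(x, y, N):
+%       """Coefficients a_1, ..., a_N minimizing chi^2."""
+%       B = np.vander(x, N, increasing=True)        # B[i, j] = x_i**j
+%       a, *_ = np.linalg.lstsq(B, y, rcond=None)   # minimizes ||y - B a||^2
+%       return a
+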
+\begin{frame}
+ \frametitle{Curve fitting: the bad, the good, and the ugly}
+
+ \begin{columns}
+ \begin{column}{0.333\textwidth}
+ \centering
+ $M=40$, $N=2$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf}
+
+ \bigskip
+
+ Underfit
+
+ \smallskip
+
+ Too few parameters
+
+ \smallskip
+
+ $\chi^2$ is large
+
+ \smallskip
+
+ Best fit is \emph{biased}
+ \end{column}
+ \begin{column}{0.333\textwidth}
+ \centering
+ $M=40$, $N=7$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf}
+
+ \bigskip
+
+ Good fit!
+
+ \smallskip
+
+ Right number of parameters
+
+ \smallskip
+
+ $\chi^2$ is moderate
+
+ \smallskip
+
+ \vphantom{Best fit}
+ \end{column}
+ \begin{column}{0.333\textwidth}
+ \centering
+ $M=40$, $N=40$
+
+ \bigskip
+
+ \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf}
+
+ \bigskip
+
+ Overfit
+
+ \smallskip
+
+ Too many parameters
+
+ \smallskip
+
+ $\chi^2$ is zero
+
+ \smallskip
+
+ Best fit has \emph{high variance}
\end{column}
\end{columns}
\end{frame}
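+
+% Not part of the talk: reproducing this sweep, self-contained, under the same
+% assumed data model and monomial basis as the sketches above. chi^2 shrinks
+% as N grows and hits ~0 at N = M, where the fit interpolates the data.
+%
+%   import numpy as np
+%
+%   rng = np.random.default_rng(0)
+%   x = np.sort(rng.uniform(0, 1, 40))
+%   y = np.sin(2 * np.pi * x) + 0.1 * rng.normal(size=40)
+%   for N in (2, 7, 40):
+%       B = np.vander(x, N, increasing=True)
+%       a, *_ = np.linalg.lstsq(B, y, rcond=None)
+%       print(N, np.sum((y - B @ a) ** 2))   # large, moderate, ~0
+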
\begin{frame}
- \frametitle{Curve fitting: the good, the bad, and the weird}
+ \frametitle{Curve fitting: the bad, the good, and the ugly}
\begin{columns}
- \begin{column}{0.33\textwidth}
+ \begin{column}{0.5\textwidth}
+ Knowing the ground truth, the fit error is
+ \[
+ \text{MSE}=\int dx\left(f(x)-\sum_{j=1}^Na_jb_j(x)\right)^2
+ \]
+
+ \smallskip
+
+ Trade-off between \emph{bias} and \emph{variance}:
+ \begin{itemize}
+ \item \textbf{Bias} reflects missing qualitative features of the data
+ \item \textbf{Variance} reflects strong dependence on the noise
+ \end{itemize}
\end{column}
- \begin{column}{0.33\textwidth}
+ \begin{column}{0.5\textwidth}
+ \includegraphics[width=\columnwidth]{figs/fit_bias-variance_poly.pdf}
+
+ \medskip
+
+ \includegraphics[width=0.32\columnwidth]{figs/fit_underfit_poly.pdf}
+ \hfill
+ \includegraphics[width=0.32\columnwidth]{figs/fit_goodfit_poly.pdf}
+ \hfill
+ \includegraphics[width=0.32\columnwidth]{figs/fit_overfit_poly.pdf}
+
+ \smallskip\small
+
+ \hspace{2em}$N=2$ \hfill $N=7$ \hfill $N=40$ \hspace{1.2em}
\end{column}
- \begin{column}{0.33\textwidth}
+ \end{columns}
+\end{frame}
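+
+% Not part of the talk: estimating the MSE integral on a dense grid (same
+% assumed data model; np.mean over a uniform grid approximates \int dx on
+% [0, 1]). The sweep typically shows the trade-off: large MSE at N = 2
+% (bias), small near N = 7, large again at N = 40 (variance).
+%
+%   import numpy as np
+%
+%   rng = np.random.default_rng(0)
+%   f = lambda x: np.sin(2 * np.pi * x)
+%   x = np.sort(rng.uniform(0, 1, 40))
+%   y = f(x) + 0.1 * rng.normal(size=40)
+%   grid = np.linspace(0, 1, 1000)
+%   for N in (2, 7, 40):
+%       a, *_ = np.linalg.lstsq(np.vander(x, N, increasing=True), y, rcond=None)
+%       fhat = np.vander(grid, N, increasing=True) @ a
+%       print(N, np.mean((f(grid) - fhat) ** 2))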
+
+\begin{frame}
+ \frametitle{Machine learning is just curve fitting}
+
+ \begin{columns}
+ \begin{column}{0.8\textwidth}
+ Number of data points $M$ is big: all images on the internet
+
+ \medskip
+
+ Ground truth function is unknown: the probability that the image contains a cat
+
+ \medskip
+
+ Fit function is a neural network:
+ \[
+ \hat f(\mathbf x\mid B_1,\ldots,B_L)=\sigma\left(B_L \sigma\left( B_{L-1}\cdots\sigma\left(B_2\sigma (B_1\mathbf x)\right)\cdots\right)\right)
+ \]
+
+ \medskip
+
+ $\chi^2$ is called \emph{training error}
+
+ \medskip
+
+ MSE is called \emph{test} or \emph{generalization error}
\end{column}
\end{columns}
\end{frame}
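+
+% Not part of the talk: the displayed network alternates linear maps B_l with
+% a componentwise nonlinearity \sigma. A numpy sketch (tanh and the layer
+% shapes are assumptions):
+%
+%   import numpy as np
+%
+%   def sigma(z):
+%       return np.tanh(z)            # componentwise nonlinearity
+%
+%   def nn_forward(x, Bs):
+%       for B in Bs:                 # apply B_1 first, B_L last
+%           x = sigma(B @ x)
+%       return x
+%
+%   rng = np.random.default_rng(0)
+%   Bs = [rng.normal(size=(64, 784)),   # hypothetical layer shapes
+%         rng.normal(size=(32, 64)),
+%         rng.normal(size=(1, 32))]
+%   print(nn_forward(rng.normal(size=784), Bs))   # scalar "cat" score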