\documentclass[aspectratio=169,usenames,dvipsnames]{beamer} \setbeamerfont{title}{family=\bf} \setbeamerfont{frametitle}{family=\bf} \setbeamerfont{normal text}{family=\rm} \setbeamertemplate{navigation symbols}{} \setbeamercolor{titlelike}{parent=structure,fg=cyan} \usepackage{enumitem} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{pifont} \usepackage{graphicx} \usepackage{xcolor} \usepackage{tikz} \definecolor{ictpblue}{HTML}{0471b9} \definecolor{ictpgreen}{HTML}{0c8636} \definecolor{mb}{HTML}{5e81b5} \definecolor{my}{HTML}{e19c24} \definecolor{mg}{HTML}{8fb032} \definecolor{mr}{HTML}{eb6235} \setbeamercolor{titlelike}{parent=structure,fg=ictpblue} \setbeamercolor{itemize item}{fg=ictpblue} \usepackage[ style=phys, eprint=true, maxnames = 100, terseinits=true ]{biblatex} \addbibresource{ictp-saifr_colloquium.bib} \title{ Fitting with more parameters than data points\\\normalsize Topology of the solutions to overparameterized problems } \author{\textbf{Jaron Kent-Dobias}\\Simons--FAPESP Young Investigator} \date{19 February 2025} \begin{document} \begin{frame} \maketitle \vspace{-6pc} \begin{minipage}[c]{10pc} \centering \includegraphics[height=6pc]{figs/ift-unesp.png} \end{minipage} \hfill\begin{minipage}[c]{10pc} \centering \includegraphics[height=6pc]{figs/logo-ictp-saifr.jpg} \end{minipage} \vspace{2pc} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.4\textwidth} You have $M$ data points $(x_1,y_1),\ldots,(x_M,y_M)$ \bigskip Perhaps a noisy sample of a ground truth function $y_i=f(x_i)+\xi$ \end{column} \begin{column}{0.6\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_data.pdf} \onslide<2>\includegraphics[width=\columnwidth]{figs/fit_data_truth.pdf} \onslide<3>\includegraphics[width=\columnwidth]{figs/fit_data.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.4\textwidth} Pick a basis of $N$ functions \[b_1(x), \ldots, b_N(x)\] \smallskip Approximate the ground truth \[ \hat f(x\mid a_1,\ldots, a_N)=\sum_{j=1}^Na_jb_j(x) \] Find $\pmb a=[a_1, \ldots, a_N]$ minimizing \[ \chi^2(\pmb a\mid\pmb x,\pmb y) =\sum_{i=1}^M\left(y_i-\hat f(x_i\mid\pmb a)\right)^2 \] \end{column} \begin{column}{0.6\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_basis_poly.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.333\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf} \bigskip Underfit \smallskip Too few parameters \smallskip $\chi^2$ is large \smallskip Best fit is \emph{biased} \end{column} \begin{column}{0.333\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf} \bigskip Good fit! 
\smallskip Right number of parameters \smallskip $\chi^2$ is moderate \smallskip \vphantom{Best fit} \end{column} \begin{column}{0.333\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf} \bigskip Overfit \smallskip Too many parameters \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{high variance} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.5\textwidth} Knowing the ground truth, fit error is \[ \text{MSE}=\int dx\left(f(x)-\hat f(x\mid\pmb a)\right)^2 \] \smallskip Trade-off between \emph{bias} and \emph{variance}: \begin{itemize} \item \textbf{Bias} reflects missing qualitative features of the data \item \textbf{Variance} reflects strong dependence on the noise \end{itemize} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_bias-variance_poly.pdf} \medskip \includegraphics[width=0.32\columnwidth]{figs/fit_underfit_poly.pdf} \hfill \includegraphics[width=0.32\columnwidth]{figs/fit_goodfit_poly.pdf} \hfill \includegraphics[width=0.32\columnwidth]{figs/fit_overfit_poly.pdf} \smallskip\small \hspace{2em}$N=2$ \hfill $N=7$ \hfill $N=40$ \hspace{1.2em} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Machine learning is just \emph{non}linear least squares} \begin{columns} \begin{column}{0.9\textwidth} Number of data points $M$ is big: all images on the internet \medskip Ground truth function is unknown: Probability the image contains a cat \medskip Fit function is a neural network: \[ \hat f(\pmb x\mid B_1,\ldots B_L)=\sigma\left(B_L \sigma\left( B_{L-1}\cdots\sigma\left(B_2\sigma (B_1\pmb x)\right)\cdots\right)\right) \] \medskip $\chi^2(\pmb a\mid\text{data})$ is called the \emph{cost} or \emph{loss function} \medskip $\chi^2(\pmb a^*\mid\text{data})$ is called the \emph{training error} \medskip MSE is called the \emph{test} or \emph{generalization error} \bigskip \textbf{BUT:} machine learning uses many more parameters $N$ than data points $M$ \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, the ugly, and the weird} \begin{columns} \begin{column}{0.25\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf} \bigskip Overfit \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{high variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=80$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf} \bigskip Good fit? 
\smallskip
$\chi^2$ is zero
\smallskip
Best fit has \emph{low variance} and \emph{low bias}
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Linear least squares}
\framesubtitle{The bad, the good, the ugly, and the weird}
\begin{columns}
\begin{column}{0.5\textwidth}
\centering
$M=40$, $N=80$
\bigskip
\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
\end{column}
\begin{column}{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/fit_bias-variance2_poly.pdf}
\bigskip
Bias--variance trade-off is blown up!
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Linear least squares}
\framesubtitle{Overparameterized solutions are not unique}
\begin{columns}
\begin{column}{0.5\textwidth}
Underparameterized fitting ($M>N$) has a unique minimizing solution
\medskip
Overparameterized fits are not unique: $\chi^2=0$ gives $M$ constraints
\[
0=y_i-\hat f(x_i\mid\pmb a)\qquad\text{for all $1\leq i\leq M$}
\]
on $N$ unknowns $\pmb a=[a_1,\ldots, a_N]$, leaving a manifold of $N-M$ dimensions
\medskip
What leads to the `good' solutions instead of `bad' ones?
\end{column}
\begin{column}{0.5\textwidth}
\centering
$M=40$, $N=80$
\bigskip
\begin{overprint}
\onslide<1>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
\onslide<2>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_1.pdf}
\onslide<3>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_2.pdf}
\onslide<4>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_3.pdf}
\onslide<5>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_6.pdf}
\onslide<6>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_7.pdf}
\onslide<7>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_8.pdf}
\onslide<8>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_9.pdf}
\end{overprint}
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Linear least squares}
\framesubtitle{Gradient descent and implicit regularization}
\begin{columns}
\begin{column}{0.5\textwidth}
Overparameterized fits are found with the gradient descent algorithm: take small steps in the direction $-\nabla\chi^2$ until $\|\nabla\chi^2\|<\epsilon$
\medskip
Result of descent depends on initial condition: what $\pmb a$ to start with?
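\medskip
For concreteness, a minimal sketch of one descent step, with iteration index $t$ and an assumed small step size $\eta$ (neither is specified here):
\[
\pmb a^{(t+1)}=\pmb a^{(t)}-\eta\,\nabla\chi^2\big(\pmb a^{(t)}\mid\text{data}\big)
\]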
\medskip \textbf{Unexpected fact:} gradient descent with small initialization equivalent to unique optimum in \emph{regularized} problem: \[ \chi^2_\text{eff}(\pmb a\mid\text{data})=\chi^2(\pmb a\mid\text{data})+\lambda\|\pmb a\|^2 \] \tiny\fullcite{Neyshabur_2017_Implicit} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_gradient_1.pdf} \onslide<2>\includegraphics[width=\columnwidth]{figs/fit_gradient_2.pdf} \onslide<3>\includegraphics[width=\columnwidth]{figs/fit_gradient_3.pdf} \onslide<4>\includegraphics[width=\columnwidth]{figs/fit_gradient_4.pdf} \onslide<5>\includegraphics[width=\columnwidth]{figs/fit_gradient_5.pdf} \onslide<6>\includegraphics[width=\columnwidth]{figs/fit_gradient_6.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \begin{columns} \begin{column}{0.5\textwidth} \centering \textbf{Polynomial basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_basis_poly.pdf} \end{column} \begin{column}{0.5\textwidth} \centering \textbf{Absolute value basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_basis_abs.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \centering \Large\textbf{Polynomial basis}\normalsize \bigskip \begin{columns} \begin{column}{0.25\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf} \bigskip Overfit \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{high variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=80$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf} \bigskip Good fit? \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \centering \Large\textbf{Absolute value basis\vphantom{y}}\normalsize \bigskip \begin{columns} \begin{column}{0.25\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_abs.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_abs.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_abs.pdf} \bigskip Good fit? \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=80$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_abs.pdf} \bigskip Good fit? 
\smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \begin{columns} \begin{column}{0.5\textwidth} \centering \textbf{Polynomial basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_bias-variance2_poly.pdf} \end{column} \begin{column}{0.5\textwidth} \centering \textbf{Absolute value basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_bias-variance_abs.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Sparseness and level of noise} \begin{columns} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_data.pdf} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_data_abs2.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Sparseness and level of noise} \begin{columns} \begin{column}{0.25\textwidth} \centering $M=10$, $N=4$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_abs2.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=10$, $N=6$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_abs2.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=10$, $N=10$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_abs2.pdf} \bigskip Good fit? \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=10$, $N=15$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_abs2.pdf} \bigskip Better fit?! 
\smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Sparseness and level of noise} \begin{columns} \begin{column}{0.5\textwidth} \centering $M=10$, $N=15$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_abs2.pdf} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_bias-variance_abs2.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Machine learning is just \emph{non}linear least squares} \begin{columns} \begin{column}{0.5\textwidth} Gradient descent produces poor solutions to many machine learning problems \bigskip \textbf{BUT:} no one uses gradient descent \bigskip \emph{Stochastic} gradient descent (SGD): follow approximated gradient of $\chi^2$ calculated using small subsets (batches) of the data \bigskip Approximated gradient takes \emph{fewer} steps to find \emph{better} solutions \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\textwidth]{figs/beneventano_2023.png} \medskip\tiny \fullcite{Beneventano_2023_On} \end{column} \end{columns} \end{frame} \begin{frame} \begin{columns} \begin{column}{0.9\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_1.png} \onslide<2>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_2.png} \onslide<3>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_3.png} \onslide<4>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_4.png} \onslide<5>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_5.png} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \centering \begin{tikzpicture} \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}}; \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}}; \draw (-4,2) node[align=center] {Neural networks\\are good models\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}}; \draw (-4,-2) node[align=center] {Data is sparse, low-noise,\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}}; \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}}; \end{tikzpicture} \end{frame} \begin{frame} \frametitle{Machine learning is just \emph{non}linear least squares} \begin{columns} \begin{column}{0.5\textwidth} Structure and geometry of manifold of ``perfect'' solutions integral to understanding overparameterized fits \medskip \textbf{BUT:} little is known outside the linear case \medskip State of the art: sample two points, relax an elastic line between them \bigskip \textbf{Can we develop better ways to understand the solution space in nonlinear problems?} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=0.52\textwidth]{figs/garipov_2018.png} \hfill \includegraphics[width=0.46\textwidth]{figs/draxler_2018.png} \tiny\medskip \fullcite{Garipov_2018_Loss} \medskip \fullcite{Draxler_2018_Essentially} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \begin{columns} \begin{column}{0.9\textwidth} Data: $x_i\mapsto J^i$, $N\times\cdots\times N$ $p$-tensors of Gaussian random numbers\\ \hphantom{Data: }$y_i\mapsto V_0$, a constant \\ \medskip Parameters: $\pmb a$ is an $N$-dimensional vector constrained to live on 
the sphere \\ \medskip Model: $\hat f(J\mid\pmb a)$ is the homogeneous degree-$p$ polynomial \[ \hat f(J\mid \pmb a) =\sum_{i_1=1}^N\cdots\sum_{i_p=1}^NJ_{i_1,\ldots,i_p}a_{i_1}\cdots a_{i_p} \] Cost function: the usual \[ \chi^2(\pmb a\mid J^1,\ldots,J^M) =\sum_{i=1}^M\left[ V_0-\hat f(J^i\mid\pmb a) \right]^2 \] \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \begin{columns} \begin{column}{0.33\textwidth} \textbf{When does \boldmath{$\chi^2$} have any solutions?} \medskip \tiny \fullcite{Fyodorov_2019_A} \smallskip \fullcite{Fyodorov_2020_Counting} \smallskip \fullcite{Fyodorov_2022_Optimization} \smallskip \fullcite{Montanari_2024_On} \end{column} \begin{column}{0.33\textwidth} \textbf{When does \boldmath{$\chi^2$} have suboptimal minima that hinder optimization?} \medskip \tiny\fullcite{Kent-Dobias_2024_Conditioning} \medskip \normalsize \textbf{How does gradient descent behave?} \medskip \tiny \fullcite{Urbani_2023_A} \smallskip \fullcite{Kamali_2023_Dynamical} \smallskip \fullcite{Montanari_2023_Solving} \normalsize \end{column} \begin{column}{0.33\textwidth} \textbf{How does stochastic gradient descent behave?} \medskip \tiny \fullcite{Kamali_2023_Stochastic} \bigskip \normalsize \textbf{What does the manifold of perfect solutions look like?} \medskip This presentation! \medskip \tiny \fullcite{Kent-Dobias_2024_On} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \begin{columns} \begin{column}{0.85\textwidth} Solutions form $D=N-M-1$ dimensional manifold: \[ \Omega=\left\{ \pmb a\in\mathbb R^N\mid\|\pmb a\|^2=N, V_0=\hat f(J^i\mid\pmb a)\;\forall\;1\leq i\leq M \right\} \] How to count number of components? We don't know \medskip Very good at counting isolated minima: \[ \#_{\substack{\text{critical}\\\text{points}}}=\int d\pmb x\, \delta\big(\nabla h(\pmb x)\big)\, \big|\det\operatorname{Hess}h(\pmb x)\big| \] \smallskip \tiny\fullcite{Kent-Dobias_2023_How} \normalsize \medskip \textbf{Today:} how to compute the \emph{Euler characteristic} of the solution manifold \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \begin{columns} \begin{column}{0.5\textwidth} The Euler characteristic $\chi(\Omega)$ is a topological invariant of a manifold $\Omega$ \medskip Defined by tiling the manifold, then taking the alternating sum \begin{align*} \chi(\Omega_{\text{cow}}) &= {\only<2,5->{\color{Red}}\#_\text{vertices}} &&\hspace{-1em}- {\only<3,5->{\color{ictpgreen}}\#_\text{edges}} &&\hspace{-1em}+ {\only<4,5->{\color{ictpblue}}\#_\text{faces}} \\ &\color{White}\only<2->{\color{Black}}= {\only<2,5->{\color{Red}}2904} &&\hspace{-1em}\color{White}\only<3->{\color{Black}}- {\only<3,5->{\color{ictpgreen}}8706} &&\hspace{-1em}\color{White}\only<4->{\color{Black}}+ {\only<4,5->{\color{ictpblue}}5804} \\ &\color{White}\only<5->{\color{Black}}=2 \end{align*} \[ \color{White}\only<6->{\color{Black}}\chi(\Omega_\text{football}) ={\only<6->{\color{Red}}60}-{\only<6->{\color{ictpgreen}}90}+{\only<6->{\color{ictpblue}}32}=2 \] \color{White}\only<7>{\color{Black}}Cow is homeomorphic to a sphere \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1,5>\includegraphics[width=\textwidth]{figs/cow.png} \onslide<2>\includegraphics[width=\textwidth]{figs/cow_vert.png} \onslide<3>\includegraphics[width=\textwidth]{figs/cow_edge.png} \onslide<4>\includegraphics[width=\textwidth]{figs/cow_face.png} 
\onslide<6->\hspace{2em}\includegraphics{figs/Football_Pallo_valmiina-cropped.jpg} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \framesubtitle{Characteristics of the characteristic} \begin{columns} \begin{column}{0.5\textwidth} For closed, connected 2-dimensional manifolds, related to genus $g$ by $\chi=2-2g$ \medskip General properties: \vspace{-0.5em} \[ \chi(\Omega)=0 \text{ for odd-dimensional $\Omega$} \] \vspace{-1.6em} \[ \chi(S^D)=2\text{ for even }D \] \[ \chi(\Omega_1\sqcup\Omega_2)=\chi(\Omega_1)+\chi(\Omega_2) \] \[ \chi(\Omega_1\times\Omega_2)=\chi(\Omega_1)\times\chi(\Omega_2) \] \smallskip Examples: \vspace{-0.5em} \[\chi(M\text{ even-$D$ spheres})=2M\] \vspace{-1.6em} \[\chi(S^1\times\text{anything})=0\] \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\textwidth]{figs/genus.png} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \framesubtitle{Computing the Euler characteristic} \begin{columns} \begin{column}{0.5\textwidth} Morse theory: gradient flow on an arbitrary ``height'' function $h$ makes a complex \begin{align*} \chi(\Omega) &= {\only<2,5>{\color{Red}}\#_\text{vertices}} - {\only<3,5>{\color{ictpgreen}}\#_\text{edges}} + {\only<4,5>{\color{ictpblue}}\#_\text{faces}} +\cdots \\ &= {\only<6>{\color{ictpblue}}\#_\text{index 0}} - {\only<6>{\color{ictpgreen}}\#_\text{index 1}} + {\only<6>{\color{Red}}\#_\text{index 2}} +\cdots \\ &=\sum_{i=0}^D(-1)^i\#_\text{index i} \end{align*} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\textwidth]{figs/other_sphere.png} \onslide<2>\includegraphics[width=\textwidth]{figs/other_sphere_vert.png} \onslide<3>\includegraphics[width=\textwidth]{figs/other_sphere_edge.png} \onslide<4>\includegraphics[width=\textwidth]{figs/other_sphere_face.png} \onslide<5>\includegraphics[width=\textwidth]{figs/other_sphere_all.png} \onslide<6>\includegraphics[width=\textwidth]{figs/other_sphere_crit.png} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \framesubtitle{Computing the Euler characteristic} \begin{columns} \begin{column}{0.6\textwidth} \[ \Omega=\left\{ \pmb a\in\mathbb R^N\mid\|\pmb a\|^2=N, V_0=\hat f(J^i\mid\pmb a)\;\forall\;1\leq i\leq M \right\} \] Pick whatever height function $h:\Omega\to\mathbb R$ you like: $h(\pmb a)=\frac1N\pmb a_0\cdot\pmb a$ for arbitrary $\pmb a_0$ \[ \#_{\substack{\text{critical}\\\text{points}}} =\int_\Omega d\pmb x\, \delta\big(\nabla h(\pmb x)\big)\, \big|\det\operatorname{Hess}h(\pmb x)\big| \] \[ \hspace{-1em}\operatorname{sgn}\big(\det\operatorname{Hess}(\pmb x)\big) = \operatorname{sgn}\left(\prod_{i=1}^D\lambda_i\right) =(-1)^{\text{index}} \] \[ \chi(\Omega) =\sum_{i=0}^D(-1)^i\#_\text{index i} =\int_\Omega d\pmb x\,\delta\big(\nabla h(\pmb x)\big)\,\det\operatorname{Hess}h(\pmb x) \] \end{column} \begin{column}{0.4\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\textwidth]{figs/function_1.png} \onslide<2>\includegraphics[width=\textwidth]{figs/function_2.png} \onslide<3>\includegraphics[width=\textwidth]{figs/function_3.png} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Results} \begin{columns} \begin{column}{0.5\textwidth} $M$ data points, $N$ parameters, $\alpha=M/N$ \[ V_0=\hat f(J\mid \pmb a) 
=\sum_{i_1=1}^N\cdots\sum_{i_p=1}^NJ_{i_1,\ldots,i_p}a_{i_1}\cdots a_{i_p} \] Simplest case: $p=1$, $\Omega$ is intersection of random hyperplanes with the sphere \medskip Results in $\chi(\Omega)=2$ or $\chi(\Omega)=0$ depending on whether solutions exist \medskip \hspace{2em}\includegraphics[width=0.33\textwidth]{figs/connected.pdf} \hfill \includegraphics[width=0.33\textwidth]{figs/gone.pdf}\hspace{2em} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\textwidth]{figs/intersections_1.pdf} \onslide<2>\includegraphics[width=\textwidth]{figs/intersections_2.pdf} \onslide<3>\includegraphics[width=\textwidth]{figs/intersections_3.pdf} \onslide<4>\includegraphics[width=\textwidth]{figs/intersections_4.pdf} \onslide<5>\includegraphics[width=\textwidth]{figs/phases_1.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Results} \begin{columns} \begin{column}{0.5\textwidth} $M$ data points, $N$ parameters, $\alpha=M/N$ \[ V_0=\hat f(J\mid \pmb a) =\sum_{i_1=1}^N\cdots\sum_{i_p=1}^NJ_{i_1,\ldots,i_p}a_{i_1}\cdots a_{i_p} \] For $p\geq2$, different phases with $|\chi(\Omega)|\gg1$ with varying sign \medskip \includegraphics[width=0.23\textwidth]{figs/middle.pdf} \includegraphics[width=0.23\textwidth]{figs/complex.pdf} \includegraphics[width=0.23\textwidth]{figs/shattered.pdf} \includegraphics[width=0.23\textwidth]{figs/gone.pdf} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\centering\includegraphics[width=0.8\textwidth]{figs/middle.pdf}\\$\chi(\Omega)\ll0$ \onslide<2>\centering\includegraphics[width=0.8\textwidth]{figs/complex.pdf}\\$\chi(\Omega)\ll0$ \onslide<3>\centering\includegraphics[width=0.8\textwidth]{figs/shattered.pdf}\\$\chi(\Omega)\gg0$ \onslide<4>\includegraphics[width=\textwidth]{figs/phases_2.pdf} \onslide<5>\includegraphics[width=\textwidth]{figs/phases_3.pdf} \onslide<6>\includegraphics[width=\textwidth]{figs/phases_4.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Results} \begin{columns} \begin{column}{1.1\textwidth} \hspace{1em}Phases for inhomogeneous models: $1-\lambda$ parts linear ($p=1$) plus $\lambda$ quadratic ($p=2$) \medskip \includegraphics[width=0.21\columnwidth]{figs/phases_12_0.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_1.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_2.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_3.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_4.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_5.pdf} \medskip \hspace{5em} \includegraphics[width=0.13\textwidth]{figs/connected.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/middle.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/complex.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/shattered.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/gone.pdf} \hspace{5em} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Conclusions and future directions} \begin{columns} \begin{column}{0.5\textwidth} Many outstanding questions: \bigskip \textbf{How do these structures interact with dynamics?} \bigskip \textbf{What structures exist in a 
problem with a non-random ground truth?} \bigskip \textbf{How do topological properties of solutions correlate with their quality?} \end{column} \begin{column}{0.5\textwidth} \begin{tikzpicture}[scale=0.6, every node/.style={scale=0.6}] \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}}; \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}}; \draw (-4,2) node[align=center] {Neural networks\\are good models\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}}; \draw (-4,-2) node[align=center] {Data is sparse, low-noise,\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}}; \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}}; \end{tikzpicture} \end{column} \end{columns} \end{frame} \end{document}