\documentclass[aspectratio=169,usenames,dvipsnames]{beamer} \setbeamerfont{title}{family=\bf} \setbeamerfont{frametitle}{family=\bf} \setbeamerfont{normal text}{family=\rm} \setbeamertemplate{navigation symbols}{} \setbeamercolor{titlelike}{parent=structure,fg=cyan} \usepackage{enumitem} \usepackage[utf8]{inputenc} \usepackage[T1]{fontenc} \usepackage{pifont} \usepackage{graphicx} \usepackage{xcolor} \usepackage{tikz} \definecolor{ictpblue}{HTML}{0471b9} \definecolor{ictpgreen}{HTML}{0c8636} \definecolor{mb}{HTML}{5e81b5} \definecolor{my}{HTML}{e19c24} \definecolor{mg}{HTML}{8fb032} \definecolor{mr}{HTML}{eb6235} \setbeamercolor{titlelike}{parent=structure,fg=ictpblue} \setbeamercolor{itemize item}{fg=ictpblue} \usepackage[ style=phys, eprint=true, maxnames = 100, terseinits=true ]{biblatex} \addbibresource{ictp-saifr_colloquium.bib} \title{ Fitting with more parameters than data points\\\normalsize Topology of the solutions to overparameterized problems } \author{\textbf{Jaron Kent-Dobias}\\Simons--FAPESP Young Investigator} \date{19 February 2025} \begin{document} \begin{frame} \maketitle \vspace{-6pc} \begin{minipage}[c]{10pc} \centering \includegraphics[height=6pc]{figs/ift-unesp.png} \end{minipage} \hfill\begin{minipage}[c]{10pc} \centering \includegraphics[height=6pc]{figs/logo-ictp-saifr.jpg} \end{minipage} \vspace{2pc} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.4\textwidth} You have $M$ data points $(x_1,y_1),\ldots,(x_M,y_M)$ \bigskip Perhaps a noisy sample of a ground truth function $y_i=f(x_i)+\xi$ \end{column} \begin{column}{0.6\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_data.pdf} \onslide<2>\includegraphics[width=\columnwidth]{figs/fit_data_truth.pdf} \onslide<3>\includegraphics[width=\columnwidth]{figs/fit_data.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.4\textwidth} Pick a basis of $N$ functions \[b_1(x), \ldots, b_N(x)\] \smallskip Approximate the ground truth \[ \hat f(x\mid a_1,\ldots, a_N)=\sum_{j=1}^Na_jb_j(x) \] Find $\pmb a=[a_1, \ldots, a_N]$ minimizing \[ \chi^2(\pmb a\mid\pmb x,\pmb y) =\sum_{i=1}^M\left(y_i-\hat f(x_i\mid\pmb a)\right)^2 \] \end{column} \begin{column}{0.6\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_basis_poly.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.333\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf} \bigskip Underfit \smallskip Too few parameters \smallskip $\chi^2$ is large \smallskip Best fit is \emph{biased} \end{column} \begin{column}{0.333\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf} \bigskip Good fit! 
\smallskip Right number of parameters \smallskip $\chi^2$ is moderate \smallskip \vphantom{Best fit} \end{column} \begin{column}{0.333\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf} \bigskip Overfit \smallskip Too many parameters \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{high variance} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, and the ugly} \begin{columns} \begin{column}{0.5\textwidth} Knowing the ground truth, fit error is \[ \text{MSE}=\int dx\left(f(x)-\hat f(x\mid\pmb a)\right)^2 \] \smallskip Trade-off between \emph{bias} and \emph{variance}: \begin{itemize} \item \textbf{Bias} reflects missing qualitative features of the data \item \textbf{Variance} reflects strong dependence on the noise \end{itemize} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_bias-variance_poly.pdf} \medskip \includegraphics[width=0.32\columnwidth]{figs/fit_underfit_poly.pdf} \hfill \includegraphics[width=0.32\columnwidth]{figs/fit_goodfit_poly.pdf} \hfill \includegraphics[width=0.32\columnwidth]{figs/fit_overfit_poly.pdf} \smallskip\small \hspace{2em}$N=2$ \hfill $N=7$ \hfill $N=40$ \hspace{1.2em} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Machine learning is just \emph{non}linear least squares} \begin{columns} \begin{column}{0.9\textwidth} Number of data points $M$ is big: all images on the internet \medskip Ground truth function is unknown: Probability the image contains a cat \medskip Fit function is a neural network: \[ \hat f(\pmb x\mid B_1,\ldots B_L)=\sigma\left(B_L \sigma\left( B_{L-1}\cdots\sigma\left(B_2\sigma (B_1\pmb x)\right)\cdots\right)\right) \] \medskip $\chi^2(\pmb a\mid\text{data})$ is called the \emph{cost} or \emph{loss function} \medskip $\chi^2(\pmb a^*\mid\text{data})$ is called the \emph{training error} \medskip MSE is called the \emph{test} or \emph{generalization error} \bigskip \textbf{BUT:} machine learning uses many more parameters $N$ than data points $M$ \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{The bad, the good, the ugly, and the weird} \begin{columns} \begin{column}{0.25\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf} \bigskip Overfit \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{high variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=80$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf} \bigskip Good fit? 
\smallskip
$\chi^2$ is zero
\smallskip
Best fit has \emph{low variance} and \emph{low bias}
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Linear least squares}
\framesubtitle{The bad, the good, the ugly, and the weird}
\begin{columns}
\begin{column}{0.5\textwidth}
\centering
$M=40$, $N=80$
\bigskip
\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
\end{column}
\begin{column}{0.5\textwidth}
\centering
\includegraphics[width=\textwidth]{figs/fit_bias-variance2_poly.pdf}
\bigskip
Bias--variance trade-off is blown up!
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Linear least squares}
\framesubtitle{Overparameterized solutions are not unique}
\begin{columns}
\begin{column}{0.5\textwidth}
Underparameterized fitting ($M>N$) has a unique minimizing solution
\medskip
Overparameterized fits are not unique: $\chi^2=0$ gives $M$ constraints
\[
0=y_i-\hat f(x_i\mid\pmb a)\qquad\text{for all $1\leq i\leq M$}
\]
on $N$ unknowns $\pmb a=[a_1,\ldots, a_N]$, leaving a manifold of $N-M$ dimensions
\medskip
What leads to the `good' solutions instead of `bad' ones?
\end{column}
\begin{column}{0.5\textwidth}
\centering
$M=40$, $N=80$
\bigskip
\begin{overprint}
\onslide<1>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf}
\onslide<2>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_1.pdf}
\onslide<3>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_2.pdf}
\onslide<4>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_3.pdf}
\onslide<5>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_6.pdf}
\onslide<6>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_7.pdf}
\onslide<7>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_8.pdf}
\onslide<8>\includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly_9.pdf}
\end{overprint}
\end{column}
\end{columns}
\end{frame}
\begin{frame}
\frametitle{Linear least squares}
\framesubtitle{Gradient descent and implicit regularization}
\begin{columns}
\begin{column}{0.5\textwidth}
Overparameterized fits are found with the gradient descent algorithm: take small steps in the direction $-\nabla\chi^2$ until $\|\nabla\chi^2\|<\epsilon$
\medskip
Result of descent depends on initial condition: what $\pmb a$ to start with?
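\medskip
For concreteness, a minimal sketch of one descent step, with iteration index $t$ and an assumed small step size $\eta$ (neither is specified here):
\[
\pmb a^{(t+1)}=\pmb a^{(t)}-\eta\,\nabla\chi^2\big(\pmb a^{(t)}\mid\text{data}\big)
\]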
\medskip \textbf{Unexpected fact:} gradient descent with small initialization equivalent to unique optimum in \emph{regularized} problem: \[ \chi^2_\text{eff}(\pmb a\mid\text{data})=\chi^2(\pmb a\mid\text{data})+\lambda\|\pmb a\|^2 \] \tiny\fullcite{Neyshabur_2017_Implicit} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/fit_gradient_1.pdf} \onslide<2>\includegraphics[width=\columnwidth]{figs/fit_gradient_2.pdf} \onslide<3>\includegraphics[width=\columnwidth]{figs/fit_gradient_3.pdf} \onslide<4>\includegraphics[width=\columnwidth]{figs/fit_gradient_4.pdf} \onslide<5>\includegraphics[width=\columnwidth]{figs/fit_gradient_5.pdf} \onslide<6>\includegraphics[width=\columnwidth]{figs/fit_gradient_6.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \begin{columns} \begin{column}{0.5\textwidth} \centering \textbf{Polynomial basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_basis_poly.pdf} \end{column} \begin{column}{0.5\textwidth} \centering \textbf{Absolute value basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_basis_abs.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \centering \Large\textbf{Polynomial basis}\normalsize \bigskip \begin{columns} \begin{column}{0.25\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_poly.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_poly.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_poly.pdf} \bigskip Overfit \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{high variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=80$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_poly.pdf} \bigskip Good fit? \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \centering \Large\textbf{Absolute value basis\vphantom{y}}\normalsize \bigskip \begin{columns} \begin{column}{0.25\textwidth} \centering $M=40$, $N=2$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_abs.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=7$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_abs.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=40$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_abs.pdf} \bigskip Good fit? \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=40$, $N=80$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_abs.pdf} \bigskip Good fit? 
\smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Choice of basis} \begin{columns} \begin{column}{0.5\textwidth} \centering \textbf{Polynomial basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_bias-variance2_poly.pdf} \end{column} \begin{column}{0.5\textwidth} \centering \textbf{Absolute value basis} \bigskip \includegraphics[width=\columnwidth]{figs/fit_bias-variance_abs.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Sparseness and level of noise} \begin{columns} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_data.pdf} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_data_abs2.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Sparseness and level of noise} \begin{columns} \begin{column}{0.25\textwidth} \centering $M=10$, $N=4$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_underfit_abs2.pdf} \bigskip Underfit \smallskip $\chi^2$ is large \smallskip Best fit has \emph{high bias} \phantom{variance} \end{column} \begin{column}{0.25\textwidth} \centering $M=10$, $N=6$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_goodfit_abs2.pdf} \bigskip Good fit! \smallskip $\chi^2$ is moderate \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=10$, $N=10$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overfit_abs2.pdf} \bigskip Good fit? \smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \begin{column}{0.25\textwidth} \centering $M=10$, $N=15$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_abs2.pdf} \bigskip Better fit?! 
\smallskip $\chi^2$ is zero \smallskip Best fit has \emph{low variance} and \emph{low bias} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Linear least squares} \framesubtitle{Sparseness and level of noise} \begin{columns} \begin{column}{0.5\textwidth} \centering $M=10$, $N=15$ \bigskip \includegraphics[width=\columnwidth]{figs/fit_overparamfit_abs2.pdf} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\columnwidth]{figs/fit_bias-variance_abs2.pdf} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{Machine learning is just \emph{non}linear least squares} \begin{columns} \begin{column}{0.5\textwidth} Gradient descent produces poor solutions to many machine learning problems \bigskip \textbf{BUT:} no one uses gradient descent \bigskip \emph{Stochastic} gradient descent (SGD): follow approximated gradient of $\chi^2$ calculated using small subsets (batches) of the data \bigskip Approximated gradient takes \emph{fewer} steps to find \emph{better} solutions \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\textwidth]{figs/beneventano_2023.png} \medskip\tiny \fullcite{Beneventano_2023_On} \end{column} \end{columns} \end{frame} \begin{frame} \begin{columns} \begin{column}{0.9\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_1.png} \onslide<2>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_2.png} \onslide<3>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_3.png} \onslide<4>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_4.png} \onslide<5>\includegraphics[width=\columnwidth]{figs/gradient_vs_sgd_5.png} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \centering \begin{tikzpicture} \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}}; \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}}; \draw (-4,2) node[align=center] {Neural networks\\are good models\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}}; \draw (-4,-2) node[align=center] {Data is sparse, low-noise,\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}}; \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}}; \end{tikzpicture} \end{frame} \begin{frame} \frametitle{Machine learning is just \emph{non}linear least squares} \begin{columns} \begin{column}{0.5\textwidth} Structure and geometry of manifold of ``perfect'' solutions integral to understanding overparameterized fits \medskip \textbf{BUT:} little is known outside the linear case \medskip State of the art: sample two points, relax an elastic line between them \bigskip \textbf{Can we develop better ways to understand the solution space in nonlinear problems?} \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=0.52\textwidth]{figs/garipov_2018.png} \hfill \includegraphics[width=0.46\textwidth]{figs/draxler_2018.png} \tiny\medskip \fullcite{Garipov_2018_Loss} \medskip \fullcite{Draxler_2018_Essentially} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \begin{columns} \begin{column}{0.9\textwidth} Data: $x_i\mapsto J^i$, $N\times\cdots\times N$ $p$-tensors of Gaussian random numbers\\ \hphantom{Data: }$y_i\mapsto V_0$, a constant \\ \medskip Parameters: $\pmb a$ is an $N$-dimensional vector constrained to live on 
the sphere \\ \medskip Model: $\hat f(J\mid\pmb a)$ is the homogeneous degree-$p$ polynomial \[ \hat f(J\mid \pmb a) =\sum_{i_1=1}^N\cdots\sum_{i_p=1}^NJ_{i_1,\ldots,i_p}a_{i_1}\cdots a_{i_p} \] Cost function: the usual \[ \chi^2(\pmb a\mid J^1,\ldots,J^M) =\sum_{i=1}^M\left[ V_0-\hat f(J^i\mid\pmb a) \right]^2 \] \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \begin{columns} \begin{column}{0.33\textwidth} \textbf{When does \boldmath{$\chi^2$} have any solutions?} \medskip \tiny \fullcite{Fyodorov_2019_A} \smallskip \fullcite{Fyodorov_2020_Counting} \smallskip \fullcite{Fyodorov_2022_Optimization} \smallskip \fullcite{Montanari_2024_On} \end{column} \begin{column}{0.33\textwidth} \textbf{When does \boldmath{$\chi^2$} have suboptimal minima that hinder optimization?} \medskip \tiny\fullcite{Kent-Dobias_2024_Conditioning} \medskip \normalsize \textbf{How does gradient descent behave?} \medskip \tiny \fullcite{Urbani_2023_A} \smallskip \fullcite{Kamali_2023_Dynamical} \smallskip \fullcite{Montanari_2023_Solving} \normalsize \end{column} \begin{column}{0.33\textwidth} \textbf{How does stochastic gradient descent behave?} \medskip \tiny \fullcite{Kamali_2023_Stochastic} \bigskip \normalsize \textbf{What does the manifold of perfect solutions look like?} \medskip This presentation! \medskip \tiny \fullcite{Kent-Dobias_2024_On} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \begin{columns} \begin{column}{0.85\textwidth} Solutions form $D=N-M-1$ dimensional manifold: \[ \Omega=\left\{ \pmb a\in\mathbb R^N\mid\|\pmb a\|^2=N, V_0=\hat f(J^i\mid\pmb a)\;\forall\;1\leq i\leq M \right\} \] How to count number of components? We don't know \medskip Very good at counting isolated minima: \[ \#_{\substack{\text{critical}\\\text{points}}}=\int d\pmb x\, \delta\big(\nabla h(\pmb x)\big)\, \big|\det\operatorname{Hess}h(\pmb x)\big| \] \smallskip \tiny\fullcite{Kent-Dobias_2023_How} \normalsize \medskip \textbf{Today:} how to compute the \emph{Euler characteristic} of the solution manifold \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \begin{columns} \begin{column}{0.5\textwidth} The Euler characteristic $\chi(\Omega)$ is a topological invariant of a manifold $\Omega$ \medskip Defined by tiling the manifold, then taking the alternating sum \begin{align*} \chi(\Omega_{\text{cow}}) &= {\only<2,5->{\color{Red}}\#_\text{vertices}} &&\hspace{-1em}- {\only<3,5->{\color{ictpgreen}}\#_\text{edges}} &&\hspace{-1em}+ {\only<4,5->{\color{ictpblue}}\#_\text{faces}} \\ &\color{White}\only<2->{\color{Black}}= {\only<2,5->{\color{Red}}2904} &&\hspace{-1em}\color{White}\only<3->{\color{Black}}- {\only<3,5->{\color{ictpgreen}}8706} &&\hspace{-1em}\color{White}\only<4->{\color{Black}}+ {\only<4,5->{\color{ictpblue}}5804} \\ &\color{White}\only<5->{\color{Black}}=2 \end{align*} \[ \color{White}\only<6->{\color{Black}}\chi(\Omega_\text{football}) ={\only<6->{\color{Red}}60}-{\only<6->{\color{ictpgreen}}90}+{\only<6->{\color{ictpblue}}32}=2 \] \color{White}\only<7>{\color{Black}}Cow is homeomorphic to a sphere \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1,5>\includegraphics[width=\textwidth]{figs/cow.png} \onslide<2>\includegraphics[width=\textwidth]{figs/cow_vert.png} \onslide<3>\includegraphics[width=\textwidth]{figs/cow_edge.png} \onslide<4>\includegraphics[width=\textwidth]{figs/cow_face.png} 
\onslide<6->\hspace{2em}\includegraphics{figs/Football_Pallo_valmiina-cropped.jpg} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \framesubtitle{Characteristics of the characteristic} \begin{columns} \begin{column}{0.5\textwidth} For closed, connected 2-dimensional manifolds, related to genus $g$ by $\chi=2-2g$ \medskip General properties: \vspace{-0.5em} \[ \chi(\Omega)=0 \text{ for odd-dimensional $\Omega$} \] \vspace{-1.6em} \[ \chi(S^D)=2\text{ for even }D \] \[ \chi(\Omega_1\sqcup\Omega_2)=\chi(\Omega_1)+\chi(\Omega_2) \] \[ \chi(\Omega_1\times\Omega_2)=\chi(\Omega_1)\times\chi(\Omega_2) \] \smallskip Examples: \vspace{-0.5em} \[\chi(M\text{ even-$D$ spheres})=2M\] \vspace{-1.6em} \[\chi(S^1\times\text{anything})=0\] \end{column} \begin{column}{0.5\textwidth} \includegraphics[width=\textwidth]{figs/genus.png} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \framesubtitle{Computing the Euler characteristic} \begin{columns} \begin{column}{0.5\textwidth} Morse theory: gradient flow on an arbitrary ``height'' function $h$ makes a complex \begin{align*} \chi(\Omega) &= {\only<2,5>{\color{Red}}\#_\text{vertices}} - {\only<3,5>{\color{ictpgreen}}\#_\text{edges}} + {\only<4,5>{\color{ictpblue}}\#_\text{faces}} +\cdots \\ &= {\only<6>{\color{ictpblue}}\#_\text{index 0}} - {\only<6>{\color{ictpgreen}}\#_\text{index 1}} + {\only<6>{\color{Red}}\#_\text{index 2}} +\cdots \\ &=\sum_{i=0}^D(-1)^i\#_\text{index i} \end{align*} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\textwidth]{figs/other_sphere.png} \onslide<2>\includegraphics[width=\textwidth]{figs/other_sphere_vert.png} \onslide<3>\includegraphics[width=\textwidth]{figs/other_sphere_edge.png} \onslide<4>\includegraphics[width=\textwidth]{figs/other_sphere_face.png} \onslide<5>\includegraphics[width=\textwidth]{figs/other_sphere_all.png} \onslide<6>\includegraphics[width=\textwidth]{figs/other_sphere_crit.png} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{The Euler characteristic \boldmath{$\chi$}} \framesubtitle{Computing the Euler characteristic} \begin{columns} \begin{column}{0.6\textwidth} \[ \Omega=\left\{ \pmb a\in\mathbb R^N\mid\|\pmb a\|^2=N, V_0=\hat f(J^i\mid\pmb a)\;\forall\;1\leq i\leq M \right\} \] Pick whatever height function $h:\Omega\to\mathbb R$ you like: $h(\pmb a)=\frac1N\pmb a_0\cdot\pmb a$ for arbitrary $\pmb a_0$ \[ \#_{\substack{\text{critical}\\\text{points}}} =\int_\Omega d\pmb x\, \delta\big(\nabla h(\pmb x)\big)\, \big|\det\operatorname{Hess}h(\pmb x)\big| \] \[ \hspace{-1em}\operatorname{sgn}\big(\det\operatorname{Hess}(\pmb x)\big) = \operatorname{sgn}\left(\prod_{i=1}^D\lambda_i\right) =(-1)^{\text{index}} \] \[ \chi(\Omega) =\sum_{i=0}^D(-1)^i\#_\text{index i} =\int_\Omega d\pmb x\,\delta\big(\nabla h(\pmb x)\big)\,\det\operatorname{Hess}h(\pmb x) \] \end{column} \begin{column}{0.4\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\textwidth]{figs/function_1.png} \onslide<2>\includegraphics[width=\textwidth]{figs/function_2.png} \onslide<3>\includegraphics[width=\textwidth]{figs/function_3.png} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Results} \begin{columns} \begin{column}{0.5\textwidth} $M$ data points, $N$ parameters, $\alpha=M/N$ \[ V_0=\hat f(J\mid \pmb a) 
=\sum_{i_1=1}^N\cdots\sum_{i_p=1}^NJ_{i_1,\ldots,i_p}a_{i_1}\cdots a_{i_p} \] Simplest case: $p=1$, $\Omega$ is intersection of random hyperplanes with the sphere \medskip Results in $\chi(\Omega)=2$ or $\chi(\Omega)=0$ depending on whether solutions exist \medskip \hspace{2em}\includegraphics[width=0.33\textwidth]{figs/connected.pdf} \hfill \includegraphics[width=0.33\textwidth]{figs/gone.pdf}\hspace{2em} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\includegraphics[width=\textwidth]{figs/intersections_1.pdf} \onslide<2>\includegraphics[width=\textwidth]{figs/intersections_2.pdf} \onslide<3>\includegraphics[width=\textwidth]{figs/intersections_3.pdf} \onslide<4>\includegraphics[width=\textwidth]{figs/intersections_4.pdf} \onslide<5>\includegraphics[width=\textwidth]{figs/phases_1.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Results} \begin{columns} \begin{column}{0.5\textwidth} $M$ data points, $N$ parameters, $\alpha=M/N$ \[ V_0=\hat f(J\mid \pmb a) =\sum_{i_1=1}^N\cdots\sum_{i_p=1}^NJ_{i_1,\ldots,i_p}a_{i_1}\cdots a_{i_p} \] For $p\geq2$, different phases with $|\chi(\Omega)|\gg1$ with varying sign \medskip \includegraphics[width=0.23\textwidth]{figs/middle.pdf} \includegraphics[width=0.23\textwidth]{figs/complex.pdf} \includegraphics[width=0.23\textwidth]{figs/shattered.pdf} \includegraphics[width=0.23\textwidth]{figs/gone.pdf} \end{column} \begin{column}{0.5\textwidth} \begin{overprint} \onslide<1>\centering\includegraphics[width=0.8\textwidth]{figs/middle.pdf}\\$\chi(\Omega)\ll0$ \onslide<2>\centering\includegraphics[width=0.8\textwidth]{figs/complex.pdf}\\$\chi(\Omega)\ll0$ \onslide<3>\centering\includegraphics[width=0.8\textwidth]{figs/shattered.pdf}\\$\chi(\Omega)\gg0$ \onslide<4>\includegraphics[width=\textwidth]{figs/phases_2.pdf} \onslide<5>\includegraphics[width=\textwidth]{figs/phases_3.pdf} \onslide<6>\includegraphics[width=\textwidth]{figs/phases_4.pdf} \end{overprint} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Results} \begin{columns} \begin{column}{1.1\textwidth} \hspace{1em}Phases for inhomogeneous models: $1-\lambda$ parts linear ($p=1$) plus $\lambda$ quadratic ($p=2$) \medskip \includegraphics[width=0.21\columnwidth]{figs/phases_12_0.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_1.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_2.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_3.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_4.pdf} \nolinebreak\hspace{-2.5em} \includegraphics[width=0.21\columnwidth]{figs/phases_12_5.pdf} \medskip \hspace{5em} \includegraphics[width=0.13\textwidth]{figs/connected.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/middle.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/complex.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/shattered.pdf} \hfill \includegraphics[width=0.13\textwidth]{figs/gone.pdf} \hspace{5em} \end{column} \end{columns} \end{frame} \begin{frame} \frametitle{A simple model of nonlinear least squares} \framesubtitle{Conclusions and future directions} \begin{columns} \begin{column}{0.5\textwidth} Many outstanding questions: \bigskip \textbf{How do these structures interact with dynamics?} \bigskip \textbf{What structures exist in a 
problem with a non-random ground truth?} \bigskip \textbf{How do topological properties of solutions correlate with their quality?} \end{column} \begin{column}{0.5\textwidth} \begin{tikzpicture}[scale=0.6, every node/.style={scale=0.6}] \draw (0,0) node[align=center] {Overparameterization works\\\includegraphics[width=3cm]{figs/fit_overparamfit_abs2.pdf}}; \draw (4,2) node[align=center] {Gradient descent\\implicitly regularizes\\\includegraphics[height=2cm]{figs/fit_gradient_5.pdf}}; \draw (-4,2) node[align=center] {Neural networks\\are good models\\\includegraphics[height=2cm]{figs/fit_basis_abs.pdf}}; \draw (-4,-2) node[align=center] {Data is sparse, low-noise,\\and high-dimensional\\\includegraphics[height=2cm]{figs/fit_data_abs2.pdf}}; \draw (4,-2) node[align=center] {SGD finds\\high-entropy solutions\\\includegraphics[height=2cm]{figs/gradient_vs_sgd_4.png}}; \end{tikzpicture} \end{column} \end{columns} \end{frame} \end{document}