From e160fe6e482184afc25ecbcebdc8c42611e3f59c Mon Sep 17 00:00:00 2001 From: Jaron Kent-Dobias Date: Sun, 3 Dec 2023 20:14:47 +0100 Subject: Lots more correction and elaboration. --- 2-point.bib | 30 ++++++ 2-point.tex | 315 +++++++++++++++++++++++++++++++++++++++++++----------------- 2 files changed, 256 insertions(+), 89 deletions(-) diff --git a/2-point.bib b/2-point.bib index 63de3de..a2c43bb 100644 --- a/2-point.bib +++ b/2-point.bib @@ -512,3 +512,33 @@ Spherical Spin Glasses}, translator = {Erné, Reinie} } +@article{Baldassi_2021_Unveiling, + author = {Baldassi, Carlo and Lauditi, Clarissa and Malatesta, Enrico M. and Perugini, Gabriele and Zecchina, Riccardo}, + title = {Unveiling the Structure of Wide Flat Minima in Neural Networks}, + journal = {Physical Review Letters}, + publisher = {American Physical Society (APS)}, + year = {2021}, + month = {12}, + number = {27}, + volume = {127}, + pages = {278301}, + url = {http://dx.doi.org/10.1103/PhysRevLett.127.278301}, + doi = {10.1103/physrevlett.127.278301}, + issn = {1079-7114} +} + +@article{Baldassi_2016_Unreasonable, + author = {Baldassi, Carlo and Borgs, Christian and Chayes, Jennifer T. and Ingrosso, Alessandro and Lucibello, Carlo and Saglietti, Luca and Zecchina, Riccardo}, + title = {Unreasonable effectiveness of learning neural networks: From accessible states and robust ensembles to basic algorithmic schemes}, + journal = {Proceedings of the National Academy of Sciences}, + publisher = {Proceedings of the National Academy of Sciences}, + year = {2016}, + month = {11}, + number = {48}, + volume = {113}, + pages = {E7655--E7662}, + url = {http://dx.doi.org/10.1073/pnas.1608103113}, + doi = {10.1073/pnas.1608103113}, + issn = {1091-6490} +} + diff --git a/2-point.tex b/2-point.tex index 267229e..f1c0832 100644 --- a/2-point.tex +++ b/2-point.tex @@ -170,7 +170,7 @@ at the threshold energy, while at other energies typical marginal minima are far and separated by extensive barriers. 
Therefore, with respect to the problem of dynamics this paper merely deepens the outstanding issues. -\section{Model} +\section{The model} \label{sec:model} The mixed spherical models are defined by the Hamiltonian @@ -428,7 +428,7 @@ adjacent saddle points defines a complex with implications for the landscape topology \cite{Audin_2014_Morse}. Other stationary points are found at arbitrarily small distances from a -reference extensive saddle point. The energy and stability of these near +reference extensive saddle point, with a linear pseudogap in their complexity. The energy and stability of these near neighbors approach that of the reference point as the difference in overlap $\Delta q$ is brought to zero. However, the approach of the energy and stability are at different rates: the energy difference between the reference @@ -671,11 +671,14 @@ $\pmb\sigma$ in \eqref{eq:complexity.definition}, and is the only of the $\pmb\s This expression can now be averaged over the disordered couplings, and its integration evaluated using the saddle point method. We must assume the form of order among the replicas $\mathbf s$ and $\pmb\sigma$, and we take them to be -replica symmetric. For the $3+4$ model that is our immediate interest and other -models like it this choice is well-motivated. Details of this calculation can be found in Appendix~\ref{sec:complexity-details}. +replica symmetric. Replica symmetry means that at the saddle point, all +distinct pairs of replicas have the same overlap. This choice is well-motivated +for the $3+4$ model that is our immediate interest and other models like it. +Details of the calculation can be found in +Appendix~\ref{sec:complexity-details}. 
The resulting expression for the complexity, which must -still be extremized over the parameters $\hat\beta_1$, $r^{01}$, +still be extremized over the order parameters $\hat\beta_1$, $r^{01}$, $r^{11}_\mathrm d$, $r^{11}_0$, and $q^{11}_0$, is \begin{equation} \begin{aligned} @@ -701,23 +704,24 @@ $r^{11}_\mathrm d$, $r^{11}_0$, and $q^{11}_0$, is \Bigg]\Bigg\} \end{aligned} \end{equation} +where the function $\mathcal D$ is defined in \eqref{eq:hessian.func} of Appendix~\ref{sec:complexity-details}. It is possible to further extremize this expression over all the other variables but $q_0^{11}$, for which the saddle point conditions have a unique solution. However, the resulting expression is quite complicated and provides -no insight. In fact, the numeric root-finding problem is more stable preserving these parameters, rather than analytically eliminating them. In practice, the complexity can be calculated in two ways. First, +no insight. In fact, the numerical root-finding problem is more stable when these parameters are retained rather than analytically eliminated. + +In practice, the complexity can be calculated in two ways. First, the extremal problem can be done numerically, initializing from $q=0$ where the problem reduces to that of the single-point complexity of points with energy -$E_1$ and stability $\mu_1$, and then taking small steps in $q$ or other -parameters to trace out the solution. This is how the data in all the plots of +$E_1$ and stability $\mu_1$, which has an analytical solution. Then small steps in $q$ or other +parameters are taken to trace out the solution. This is how the data in all the plots of this paper was produced. Second, the complexity can be calculated in the near -neighborhood of a reference point by expanding in small $1-q$. This is what we -describe in the next subsection. +neighborhood of a reference point by expanding in powers of small $1-q$. 
This expansion indicates when nearby points can be found at arbitrarily small distance, and in that case gives the form of the pseudogap in their complexity. If there is no overlap gap between the reference point and its nearest neighbors, their complexity can be calculated by an expansion in $1-q$. First, we'll use this method to describe the most common type of stationary point in -the close vicinity of a reference point. The most common neighbors of a -reference point are given by further maximizing the two-point complexity over +the close vicinity of a reference point. These are given by further maximizing the two-point complexity over the energy $E_1$ and stability $\mu_1$ of the nearby points. This gives the conditions \begin{align} @@ -733,7 +737,9 @@ saddle point in the remaining parameters is taken, the result is =\frac{f'''(1)}{8f''(1)^2}(\mu_\mathrm m^2-\mu_0^2)\left(\sqrt{2+\frac{2f''(1)\big(f''(1)-f'(1)\big)}{f'''(1)f'(1)}}-1\right)(1-q) +O\big((1-q)^2\big) \end{equation} -independent of $E_0$. To describe the properties of these most common +independent of $E_0$. Notice that the slope of the complexity is positive for $\mu_0<\mu_\mathrm m$ and vanishes when the stability of the reference point approaches the marginal stability. This implies that extensive saddle points have arbitrarily close neighbors with a linear pseudogap, while stable minima have an overlap gap with their nearest neighbors. For marginal minima, the existence of arbitrarily close neighbors must be decided at quadratic order and higher. + +To describe the properties of these most common neighbors, it is convenient to first make a definition. 
The population of stationary points that are most common at each energy (the blue line in Fig.~\ref{fig:complexities}) have the relation @@ -748,11 +754,11 @@ $\Delta q$ are \label{eq:expansion.mu.1} \mu_1&=\mu_0-\frac{v_f}{u_f}\big(E_0-E_\mathrm{dom}(\mu_0)\big)(1-q)+O\big((1-q)^2\big) \end{align} +where $v_f$ and $u_f$ are positive functionals of $f$ defined in \eqref{eq:v.and.u} of Appendix~\ref{sec:complexity-details}. The most common neighboring saddles to a reference saddle are much nearer to the reference in energy ($\Delta q^2$) than in stability ($\Delta q$). In fact, -this scaling also holds for the entire range of neighbors to a reference -saddle, with the limits in energy scaling like $\Delta q^2$ and those of -stability scaling like $\Delta q$. +this scaling also holds for all neighbors to a reference +saddle, not just the most common. Because both expressions are proportional to $E_0-E_\mathrm{dom}(\mu_0)$, whether the energy and stability of nearby points increases or decreases from @@ -766,20 +772,17 @@ To examine better the population of marginal points, it is necessary to look at the next term in the series of the complexity with $\Delta q$, since the linear coefficient becomes zero at the marginal line. This tells us something intuitive: stable minima have an effective repulsion between points, and one -always finds a sufficiently small $\Delta q$ that no stationary points are +always finds a sufficiently small $\Delta q$ such that no stationary points are point any nearer. For the marginal minima, it is not clear that the same should be true. -When $\mu=\mu_\mathrm m$, the linear term above vanishes. Under these conditions, the quadratic term in the expansion is +For marginal points with $\mu=\mu_\mathrm m$, the linear term above vanishes. 
Under these conditions, the quadratic term in the expansion for the dominant population of near neighbors is \begin{equation} \Sigma_{12} =\frac12\frac{f'''(1)v_f}{f''(1)^{3/2}u_f} \left(\sqrt{\frac{2\big[f'(1)(f'''(1)-f''(1))+f''(1)^2\big]}{f'(1)f'''(1)}}-1\right)\big(E_0-E_\textrm{th}\big)(1-q)^2+O\big((1-q)^3\big) \end{equation} -Note that this expression is only true for $\mu=\mu_\mathrm m$. Therefore, -among marginal minima, when $E_0$ is greater than the threshold one finds -neighbors at arbitrarily close distance. When $E_0$ is less than the threshold, -the complexity of nearby points is negative, and there is a desert where none -are found. +Note that this expression is only valid for $\mu=\mu_\mathrm m$. This coefficient is positive when $E>E_\text{th}$ and negative when $E