From 4d6c19367c5eff3eaf29376d663369730d36cd74 Mon Sep 17 00:00:00 2001 From: Jaron Kent-Dobias Date: Sat, 7 Sep 2024 19:48:36 +0200 Subject: Lots of writing. --- topology.bib | 23 +++++ topology.tex | 326 +++++++++++++++++++++++++++++++++++++++++------------------ 2 files changed, 252 insertions(+), 97 deletions(-) diff --git a/topology.bib b/topology.bib index 6d09b4e..81d2df8 100644 --- a/topology.bib +++ b/topology.bib @@ -39,6 +39,20 @@ translator = {Erné, Reinie} } +@unpublished{Auffinger_2022_The, + author = {Auffinger, Antonio and Zhou, Yuxin}, + title = {The Spherical $p+s$ Spin Glass At Zero Temperature}, + year = {2022}, + month = {9}, + url = {http://arxiv.org/abs/2209.03866v1}, + doi = {10.48550/arXiv.2209.03866}, + note = {arXiv preprint}, + date = {2022-09-08T15:08:26Z}, + eprint = {2209.03866v1}, + eprintclass = {math.PR}, + eprinttype = {arxiv} +} + @article{Baldassi_2016_Unreasonable, author = {Baldassi, Carlo and Borgs, Christian and Chayes, Jennifer T. and Ingrosso, Alessandro and Lucibello, Carlo and Saglietti, Luca and Zecchina, Riccardo}, title = {Unreasonable effectiveness of learning neural networks: From accessible states and robust ensembles to basic algorithmic schemes}, @@ -409,6 +423,15 @@ doi = {10.2307/2371510} } +@phdthesis{Tublin_2022_A, + author = {Tublin, Rashel}, + title = {A Few Results in Random Matrix Theory and Random Optimization}, + year = {2022}, + month = {September}, + url = {https://kclpure.kcl.ac.uk/portal/en/studentTheses/a-few-results-in-random-matrix-theory-and-random-optimization}, + school = {King's College London} +} + @article{Urbani_2023_A, author = {Urbani, Pierfrancesco}, title = {A continuous constraint satisfaction problem for the rigidity transition in confluent tissues}, diff --git a/topology.tex b/topology.tex index 6252c96..4354db3 100644 --- a/topology.tex +++ b/topology.tex @@ -126,7 +126,7 @@ zero-gradient solutions to overparameterized smooth neural networks and in verte In such problems, there is great interest in characterizing structure in the set of solutions, which can be influential in how algorithms behave when trying to solve them \cite{Baldassi_2016_Unreasonable, Baldassi_2019_Properties, -Beneventano_2023_On}. Here, we show how \emph{topological} information about +Beneventano_2023_On}. Here, we show how topological information about the set of solutions can be calculated in a simple model of satisfying random nonlinear equalities. This allows us to reason about the connectivity of this solution set. The topological properties revealed by this calculation yield @@ -157,13 +157,13 @@ with the elements of the tensors $J^{(k,p)}$ as independently distributed unit normal random variables satisfies \eqref{eq:covariance}. The size of the series coefficients of $f$ therefore control the variances in the coefficients of random polynomial constraints. When $M=1$, this problem corresponds to the -level set of a spherical spin glass with energy density $E=\sqrt{N}V_0$. +level set of a spherical spin glass with energy density $E=V_0/\sqrt{N}$. This problem or small variations thereof have attracted attention recently for their resemblance to encryption, optimization, and vertex models of confluent tissues \cite{Fyodorov_2019_A, Fyodorov_2020_Counting, - Fyodorov_2022_Optimization, Urbani_2023_A, Kamali_2023_Dynamical, + Fyodorov_2022_Optimization, Tublin_2022_A, Vivo_2024_Random, Urbani_2023_A, Kamali_2023_Dynamical, Kamali_2023_Stochastic, Urbani_2024_Statistical, Montanari_2023_Solving, Montanari_2024_On, Kent-Dobias_2024_Conditioning, Kent-Dobias_2024_Algorithm-independent}. In each of these cases, the authors studied properties of the cost function @@ -277,12 +277,12 @@ The result is the reduction of the average Euler characteristic to an expression form \begin{equation} \label{eq:pre-saddle.characteristic} \overline{\chi(\Omega)} - =\left(\frac N{2\pi}\right)^2\int dR\,dD\,dm\,d\hat m\,g(R,D,m,\hat m)\,e^{N\mathcal S_\Omega(R,D,m,\hat m)} + =\left(\frac N{2\pi}\right)^2\int dR\,dD\,dm\,d\hat m\,g(R,D,m,\hat m)\,e^{N\mathcal S_\chi(R,D,m,\hat m)} \end{equation} -where $g$ is a prefactor of $o(N^0)$, and $\mathcal S_\Omega$ is an effective action defined by +where $g$ is a prefactor of $o(N^0)$, and $\mathcal S_\chi$ is an effective action defined by \begin{equation} \label{eq:euler.action} \begin{aligned} - \mathcal S_\Omega(R,D,m,\hat m\mid\alpha,V_0) + \mathcal S_\chi(R,D,m,\hat m\mid\alpha,V_0) &=\hat m-\frac\alpha2\left[ \log\left(1+\frac{f(1)D}{f'(1)R^2}\right) +\frac{V_0^2}{f(1)}\left(1+\frac{f'(1)R^2}{f(1)D}\right)^{-1} @@ -311,30 +311,32 @@ new effective action of $m$ alone. This can be solved to give \end{equation} \begin{equation} \begin{aligned} - R^*(m) - &=\frac{-m(1-m^2)}{2[f(1)-(1-m^2)f'(1)]^2} + R^* + =\frac{-m(1-m^2)}{2[f(1)-(1-m^2)f'(1)]^2} \Bigg[ \alpha V_0^2f'(1) - +(2-\alpha)f(1)\left(\frac{f(1)}{1-m^2}-f'(1)\right) \\ - &\quad+\alpha\sqrt{ + +(2-\alpha)f(1)\left(\frac{f(1)}{1-m^2}-f'(1)\right) \quad \\ + \quad+\alpha\sqrt{ \tfrac{4V_0^2}\alpha f(1)f'(1)\left[\tfrac{f(1)}{1-m^2}-f'(1)\right] +\left[\tfrac{f(1)^2}{1-m^2}-\big(V_0^2+f(1)\big)f'(1)\right]^2 } \Bigg] \end{aligned} \end{equation} +with the resulting effective action as a function of $m$ alone given by \begin{equation} - \mathcal S_\Omega(m) + \mathcal S_\chi(m) =-\frac\alpha2\bigg[ \log\left( - 1-\frac{f(1)}{f'(1)}\frac{1+m/R^*}{1-m^2} + 1-\frac{f(1)}{f'(1)}\frac{1+\frac m{R^*}}{1-m^2} \right) +\frac{V_0^2}{f(1)}\left( - 1-\frac{f'(1)}{f(1)}\frac{1-m^2}{1+m/R^*} + 1-\frac{f'(1)}{f(1)}\frac{1-m^2}{1+\frac m{R^*}} \right)^{-1} \bigg] +\frac12\log\left(-\frac m{R^*}\right) \end{equation} +This function is plotted as a function of $m$ in Fig.~\ref{fig:action} for a variety of $V_0$ and $f$. To finish evaluating the integral, this expression should be maximized with respect to $m$. The order parameter $m$ is both physical and interpretable, as it gives the overlap of the configuration $\mathbf x$ with the height axis @@ -342,75 +344,132 @@ $\mathbf x_0$. Therefore, the value $m^*$ that maximizes this action can be understood as the latitude on the sphere where most of the contribution to the Euler characteristic is made. -The action $\mathcal S_\Omega$ is extremized with respect to $m$ at $m=0$ or $m=\pm m^*=\mp R^*(m^*)$. +\begin{figure} + \includegraphics{figs/action_1.pdf} + \hspace{-3.5em} + \includegraphics{figs/action_3.pdf} + + \caption{ + \textbf{Effective action for the Euler characteristic.} + The effective action governing the average Euler characteristic as a function of the overlap + $m=\frac1N\mathbf x\cdot\mathbf x_0$ with the height direction for two + different homogeneous polynomial functions and a variety of target values $V_0$. In both + plots $\alpha=\frac12$. \textbf{Left:} With linear functions there are two + regimes. For small $V_0$, there are maxima at $m=\pm m^*$ where the action + is zero, while after the satisfiability transition at $V_0=V_\text{\textsc{sat}}=1$, $m^*$ + goes to zero and the action becomes negative. \textbf{Right:} With nonlinear + functions there are four regimes. For small $V_0$ the behavior is the same + as in the linear case, with zero action. After an onset transition at + $V_0=V_\text{on}\simeq1.099$ the maxima are at the edge of validity of the + action and the action is positive. At a shattering transition at + $V_0=V_\text{sh}\simeq1.394$, the maximum goes to zero and the action is positive. + Finally, at the satisfiability transition + $V_0=V_\text{\textsc{sat}}\simeq1.440$ the action becomes negative. + } \label{fig:action} +\end{figure} + +The action $\mathcal S_\chi$ is extremized with respect to $m$ at $m=0$ or $m=\pm m^*=\mp R^*(m^*)$. In the latter case, $m^*$ takes the value \begin{equation} m^*=\sqrt{1-\frac{\alpha}{f'(1)}\big(V_0^2+f(1)\big)} \end{equation} -and $\mathcal S_\Omega(m^*)=0$. If this solution was always well-defined, it would vanish when the argument of the square root vanishes for +and $\mathcal S_\chi(m^*)=0$. If this solution were always well-defined, it would vanish when the argument of the square root vanishes for \begin{equation} V_0^2>V_{\text{\textsc{sat}}\ast}^2\equiv\frac{f'(1)}\alpha-f(1) \end{equation} -However, when +This corresponds precisely to the satisfiability transition found in previous work by a replica symmetric analysis of the cost function \eqref{eq:cost} \cite{Fyodorov_2019_A, Fyodorov_2020_Counting, Fyodorov_2022_Optimization, Tublin_2022_A, Vivo_2024_Random}. +However, when the magnitude of $V_0$ is sufficiently large, with \begin{equation} V_0^2>V_\text{on}^2\equiv\frac{1-\alpha+\sqrt{1-\alpha}}\alpha f(1) \end{equation} -$R^*(m^*)$ becomes complex and the solution is no longer valid. Since +$R^*(m^*)$ becomes complex and this solution is no longer valid. Since \begin{equation} V_\text{on}^2-V_{\text{\textsc{sat}}\ast}^2 =\frac{f'(1)-f(1)}\alpha-\frac{\sqrt{1-\alpha}}\alpha f(1) \end{equation} -If $f(q)$ is linear, then $f'(1)=f(1)$ and $V_\text{on}^2>V_{\text{\textsc{sat}}\ast}^2$, so the +If $f(q)$ is purely linear, then $f'(1)=f(1)$ and +$V_\text{on}^2>V_{\text{\textsc{sat}}\ast}^2$, so the naive satisfiability +transition happens first. On the other hand, when $f(q)$ contains powers of $q$ +strictly greater than 1, then $f'(1)\geq 2f(1)$ and $V_\text{on}^2\leq +V_{\text{\textsc{sat}}\ast}^2$, so the onset happens first. In situations with +mixed constant, linear, and nonlinear terms in $f$, the order of the +transitions depends on the precise form of $f$. -Likewise, when +Likewise, the solution at $m=0$ is not always valid. For $V_0$ of magnitude sufficiently small, with \begin{equation} V_0^20$, and is positive up + is maximized with $\mathcal S_\chi(m_\text{min})>0$, and is positive up to $m_\text{max}$. No stationary points are found with overlap less than $m_\text{min}$. \textbf{Right: the shattered regime.} The action is - maximized with $\mathcal S_\Omega(0)>0$, and is positive up to + maximized with $\mathcal S_\chi(0)>0$, and is positive up to $m_\text{max}$. } \end{figure} +With these caveats in mind, we can combine the information from the previous +calculations to reason about what the topology and geometry of $\Omega$ should +be. We know what the average Euler characteristic is, and we know that it +corresponds to the average number of stationary points. We also know these two +averages correspond with each other when restricted to arbitrary latitudes $m$ +associated with an arbitrary height axis $\mathbf x_0$, and we know their value +at each latitude. From this information, we draw the following inferences about +the form of the solution manifold in our four possible regimes. + \paragraph{The connected regime: \boldmath{$V_0^2V_\text{\textsc{rsb}}^2 + \equiv\frac{[f(1)-f(0)]^2}{\alpha f''(0)} + -f(0)-\frac{f'(0)}{f''(0)} +\end{equation} +Some preliminary attempts to find an instability towards finite-\textsc{rsb} in +this region in the calculation of the Euler characteristic did not turn up anything. We conjecture that the \textsc{rsb} +instability found in \cite{Urbani_2023_A} is a trait of the cost function +\eqref{eq:cost}, and is not inherent to the structure of the solution manifold. +Perhaps the best evidence for this is to consider the limit of $M=1$, or +$\alpha\to0$ with $E=V_0\sqrt\alpha$ held fixed, where this problem reduces to +the level sets of the spherical spin glasses. The instability \eqref{eq:vrsb} +implies for the pure spherical 2-spin model with $f(q)=\frac12q^2$ that +$E_\textsc{rsb}=\frac12$, though nothing of note is known to occur in the level +sets of 2-spin model at such an energy. + +\section{Implications for the dynamics of spherical spin glasses} +\label{sec:ssg} As indicated earlier, for $M=1$ the solution manifold corresponds to the energy level set of a spherical spin glass with energy density $E=\sqrt NV_0$. All the results from the previous sections follow, and can be translated to the spin -glasses by taking the limit $\alpha\to0$ while scaling $V_0=\sqrt\alpha E$. With a little algebra this procedure yields -\begin{equation} +glasses by taking the limit $\alpha\to0$ while keeping $E=V_0\alpha^{-1/2}$ fixed. With a little algebra this procedure yields +\begin{align} E_\text{on}=\pm\sqrt{2f(1)} -\end{equation} -\begin{equation} + && E_\text{sh}=\pm\sqrt{4f(1)\left(1-\frac{f(1)}{f'(1)}\right)} -\end{equation} + \label{eq:ssg.energies} +\end{align} for the energies at which level sets of the spherical spin glasses have disconnected pieces appear, and that at which a large connected component vanishes. For the pure $p$-spin spherical spin glasses with $f(q)=\frac12q^p$, @@ -660,7 +729,8 @@ asymptotically reach an energy above the threshold energy belief that the threshold energy qualitatively coincides with a kind of shattering is the origin of the expectation that the dynamic limit should be the threshold energy. Given that our work shows the actual shattering energy is -different, here we compare it with data on the asymptotic limits of dynamics to see if they coincide. +different, here we compare it with data on the asymptotic limits of dynamics to +see if they coincide. \begin{figure} \includegraphics{figs/dynamics_2.pdf} @@ -680,11 +750,73 @@ different, here we compare it with data on the asymptotic limits of dynamics to } \label{fig:ssg} \end{figure} - -\section{Conclusions} - - -\cite{Franz_2016_The, Franz_2017_Universality, Franz_2019_Critical, Annesi_2023_Star-shaped, Baldassi_2023_Typical} +Measurements of the asymptotic limits of dynamics were recently taken in +\cite{Folena_2023_On} for two different classes of models with inhomogeneous +$f(q)$, with +\begin{equation} + f(q)=\frac12\big[\lambda q^p+(1-\lambda)q^s\big] +\end{equation} +They studied models with this covariance for $p=2$ and $p=3$ while varying $s$. +In both cases, the relative weight $\lambda$ varies with $s$ and was chosen to +maximize a heuristic to increase the chances of seeing nontrivial behavior. The +authors numerically integrated the dynamic mean field theory (\textsc{dmft}) +equations for gradient descent on these models from a random initial condition to large but finite time, then attempted to +extrapolate the infinite-time behavior by two different methods. The black +symbols in Fig.~\ref{fig:ssg} show the measurements taken from +\cite{Folena_2023_On}. The difference between the two extrapolations is not +critical here, see the original paper for details. We simply note that the +authors of \cite{Folena_2023_On} did not associate an uncertainty with them, +nor were they confident that they are unbiased estimates of the asymptotic value. + +Fig.~\ref{fig:ssg} also shows the shattering and {\oldstylenums1}\textsc{rsb} +threshold energies as a function of $s$. The solid lines come from using +\textsl{Mathematica}'s \texttt{Interpolation} function to create a smooth +function $\lambda(s)$ through the values used in \cite{Folena_2023_On}. For the +$2+s$ models for sufficiently large $s$, the ground state is described by a +{\oldstylenums1}\textsc{frsb} order and both the threshold energy and +shattering energy calculated using an annealed average are likely inaccurate +\cite{Auffinger_2022_The}. In Appendix~\ref{sec:1frsb} we calculate the +quenched ground state and shattering energies for these models consistent with +the {\oldstylenums1}\textsc{frsb} equilibrium order. In the left panel of Fig.~\ref{fig:ssg}, the solid line shows the quenched calculation, while the dashed line shows the annealed formula \eqref{eq:ssg.energies}. + +Is the shattering energy consistent with the dynamic threshold for gradient +descent from a random initial condition? The evidence in Fig.~\ref{fig:ssg} is +compelling but inconclusive. The difference between the shattering energy and +the extrapolated \textsc{dmft} data is about the same as the difference between +the values predicted by the two extrapolation methods. If both extrapolation +methods suffer from similar systematic biases, it is plausible they true value +corresponds with the shattering energy. However, better estimates of the +asymptotic values are needed to support or refute this conjecture. This +motivates working to integrate the \textsc{dmft} equations to longer times, or +else look for analytic asymptotic solutions that approach $E_\text{sh}$. + +\section{Conclusion} + +We have shown how to calculate the average Euler characteristic of the solution +manifold in a simple model of random constraint satisfaction. The results +constrain the topology and geometry of this manifold, revealing when it is +connected and trivial, when it is extensive but topologically nontrivial, and +when it is shattered into disconnected pieces. These inferences are supported +by a auxiliary calculation of the complexity of stationary points for a test +function on the same solution manifold. + +This calculation has novel implications for the geometry of the energy +landscape in the spherical spin glasses, where it reveals a previously unknown +landmark energy $E_\text{sh}$. This shattering energy is where the topological +calculation implies that the level set of the energy breaks into disconnected +pieces, and differs from the threshold energy $E_\text{th}$ in mixed models. +It's possible that $E_\text{sh}$ is the asymptotic limit of gradient descent +from a random initial condition in such models, but the quality of the +currently available data makes this conjecture inconclusive. + +This paper has focused on equality constraints, while most existing studies of +constraint satisfaction study inequality constraints \cite{Franz_2016_The, +Franz_2017_Universality, Franz_2019_Critical, Annesi_2023_Star-shaped, +Baldassi_2023_Typical}. To generalize the technique developed in this paper to +such cases is not a trivial extension. The set of solutions to such problems +are manifolds with boundary, and these boundaries are often not smooth. To +study such cases with these techniques will require using extensions of the +Morse theory for manifolds with boundary, and will be the subject of future work. \paragraph{Acknowledgements} The authors thank Pierfrancesco Urbani for helpful conversations on these @@ -977,7 +1109,7 @@ and accumulate $M$ factors of $\operatorname{sign}(-Gf'(C))=\operatorname{sign}( integral over Lagrange multipliers and $N$ factors of $\operatorname{sign}(-G)$ from the Hubbard--Stratonovich transformation. Since at all saddle points $G=-R$, we have \begin{equation} - \operatorname{sign}(R)^{N+M}e^{N\mathcal S_\Omega(\tilde{\mathbb Q},\mathbb Q,\mathbb M)} + \operatorname{sign}(R)^{N+M}e^{N\mathcal S_\chi(\tilde{\mathbb Q},\mathbb Q,\mathbb M)} \end{equation} \subsection{Contribution from integrating the Grassmann order parameters} @@ -1049,16 +1181,16 @@ function $g(R,D,m,\hat m)$ of \eqref{eq:pre-saddle.characteristic} is given by \end{aligned} \end{equation} In the connected regime, there are two saddle points of the integrand that -contribute to the asymptotic value of the integral, at $m=\pm m^*$ with $R=-m^*$, $D=0$, and $\hat m=0$. At this saddle point $\mathcal S_\Omega=0$. We can therefore write +contribute to the asymptotic value of the integral, at $m=\pm m^*$ with $R=-m^*$, $D=0$, and $\hat m=0$. At this saddle point $\mathcal S_\chi=0$. We can therefore write \begin{equation} \overline{\chi(\Omega)} =\sum_{m=\pm m^*} - g(-m,0,m,0)\big[\det\partial\partial\mathcal S_\Omega(-m,0,m,0)\big]^{-\frac12} + g(-m,0,m,0)\big[\det\partial\partial\mathcal S_\chi(-m,0,m,0)\big]^{-\frac12} \end{equation} where here $\partial=[\frac{\partial}{\partial R},\frac{\partial}{\partial D},\frac\partial{\partial m},\frac\partial{\partial \hat m}]$ is the vector of derivatives with respect to the remaining order parameters. For both of the two saddle points, the determinant of the Hessian of the effective action evaluates to \begin{equation} - \det\partial\partial\mathcal S_\Omega + \det\partial\partial\mathcal S_\chi =\left[\frac1{(m^*)^4}\left(1-\frac{\alpha[V_0^2+f(1)]}{f'(1)}\right)\right]^2 \end{equation} whereas @@ -1422,7 +1554,7 @@ $\chi(1)=0$, necessary for $P$ to be a well-defined probability distribution. Now the specific form of replica symmetry breaking we expect to see is important. We want to study the mixed $2+s$ models in the regime where they may -have 1-full \textsc{rsb} in equilibrium. For the Euler characteristic like the +have 1-full \textsc{rsb} in equilibrium \cite{Auffinger_2022_The}. For the Euler characteristic like the complexity, this will correspond to full \textsc{rsb}, in an analogous way to {\oldstylenums1}\textsc{rsb} equilibria give a \textsc{rs} complexity. Such order is characterized by a piecewise smooth $\chi$ of the form \begin{equation} -- cgit v1.2.3-70-g09d2