From e160fe6e482184afc25ecbcebdc8c42611e3f59c Mon Sep 17 00:00:00 2001
From: Jaron Kent-Dobias <jaron@kent-dobias.com>
Date: Sun, 3 Dec 2023 20:14:47 +0100
Subject: Lots more correction and elaboration.

---
 2-point.bib |  30 ++++++
 2-point.tex | 315 +++++++++++++++++++++++++++++++++++++++++++-----------------
 2 files changed, 256 insertions(+), 89 deletions(-)

diff --git a/2-point.bib b/2-point.bib
index 63de3de..a2c43bb 100644
--- a/2-point.bib
+++ b/2-point.bib
@@ -512,3 +512,33 @@ Spherical Spin Glasses},
  translator = {Erné, Reinie}
 }
 
+@article{Baldassi_2021_Unveiling,
+ author = {Baldassi, Carlo and Lauditi, Clarissa and Malatesta, Enrico M. and Perugini, Gabriele and Zecchina, Riccardo},
+ title = {Unveiling the Structure of Wide Flat Minima in Neural Networks},
+ journal = {Physical Review Letters},
+ publisher = {American Physical Society (APS)},
+ year = {2021},
+ month = {12},
+ number = {27},
+ volume = {127},
+ pages = {278301},
+ url = {http://dx.doi.org/10.1103/PhysRevLett.127.278301},
+ doi = {10.1103/physrevlett.127.278301},
+ issn = {1079-7114}
+}
+
+@article{Baldassi_2016_Unreasonable,
+ author = {Baldassi, Carlo and Borgs, Christian and Chayes, Jennifer T. and Ingrosso, Alessandro and Lucibello, Carlo and Saglietti, Luca and Zecchina, Riccardo},
+ title = {Unreasonable effectiveness of learning neural networks: From accessible states and robust ensembles to basic algorithmic schemes},
+ journal = {Proceedings of the National Academy of Sciences},
+ publisher = {Proceedings of the National Academy of Sciences},
+ year = {2016},
+ month = {11},
+ number = {48},
+ volume = {113},
+ pages = {E7655--E7662},
+ url = {http://dx.doi.org/10.1073/pnas.1608103113},
+ doi = {10.1073/pnas.1608103113},
+ issn = {1091-6490}
+}
+
diff --git a/2-point.tex b/2-point.tex
index 267229e..f1c0832 100644
--- a/2-point.tex
+++ b/2-point.tex
@@ -170,7 +170,7 @@ at the threshold energy, while at other energies typical marginal minima are far
 and separated by extensive barriers. Therefore, with respect to the problem of
 dynamics this paper merely deepens the outstanding issues.
 
-\section{Model}
+\section{The model}
 \label{sec:model}
 
 The mixed spherical models are defined by the Hamiltonian
@@ -428,7 +428,7 @@ adjacent saddle points defines a complex with implications for the landscape
 topology \cite{Audin_2014_Morse}.
 
 Other stationary points are found at arbitrarily small distances from a
-reference extensive saddle point. The energy and stability of these near
+reference extensive saddle point, with a linear pseudogap in their complexity. The energy and stability of these near
 neighbors approach that of the reference point as the difference in overlap
 $\Delta q$ is brought to zero. However, the approach of the energy and
 stability are at different rates: the energy difference between the reference
@@ -671,11 +671,14 @@ $\pmb\sigma$ in \eqref{eq:complexity.definition}, and is the only of the $\pmb\s
 This expression can now be averaged over the disordered couplings, and its
 integration evaluated using the saddle point method. We must assume the form of
 order among the replicas $\mathbf s$ and $\pmb\sigma$, and we take them to be
-replica symmetric. For the $3+4$ model that is our immediate interest and other
-models like it this choice is well-motivated. Details of this calculation can be found in Appendix~\ref{sec:complexity-details}.
+replica symmetric. Replica symmetry means that at the saddle point, all
+distinct pairs of replicas have the same overlap. This choice is well-motivated
+for the $3+4$ model that is our immediate interest and other models like it.
+Details of the calculation can be found in
+Appendix~\ref{sec:complexity-details}.
 
 The resulting expression for the complexity, which must
-still be extremized over the parameters $\hat\beta_1$, $r^{01}$,
+still be extremized over the order parameters $\hat\beta_1$, $r^{01}$,
 $r^{11}_\mathrm d$, $r^{11}_0$, and $q^{11}_0$, is
 \begin{equation}
   \begin{aligned}
@@ -701,23 +704,24 @@ $r^{11}_\mathrm d$, $r^{11}_0$, and $q^{11}_0$, is
   \Bigg]\Bigg\}
   \end{aligned}
 \end{equation}
+where the function $\mathcal D$ is defined in \eqref{eq:hessian.func} of Appendix~\ref{sec:complexity-details}.
 It is possible to further extremize this expression over all the other
 variables but $q_0^{11}$, for which the saddle point conditions have a unique
 solution. However, the resulting expression is quite complicated and provides
-no insight. In fact, the numeric root-finding problem is more stable preserving these parameters, rather than analytically eliminating them. In practice, the complexity can be calculated in two ways. First,
+no insight. In fact, the numeric root-finding problem is more stable preserving these parameters, rather than analytically eliminating them.
+
+In practice, the complexity can be calculated in two ways. First,
 the extremal problem can be done numerically, initializing from $q=0$ where the
 problem reduces to that of the single-point complexity of points with energy
-$E_1$ and stability $\mu_1$, and then taking small steps in $q$ or other
-parameters to trace out the solution. This is how the data in all the plots of
+$E_1$ and stability $\mu_1$, which has an analytical solution. Then small steps in $q$ or other
+parameters are taken to trace out the solution. This is how the data in all the plots of
 this paper was produced. Second, the complexity can be calculated in the near
-neighborhood of a reference point by expanding in small $1-q$. This is what we
-describe in the next subsection.
+neighborhood of a reference point by expanding in powers of small $1-q$. This expansion indicates when nearby points can be found at arbitrarily small distance, and in that case gives the form of the pseudogap in their complexity.
 
 If there is no overlap gap between the reference point and its nearest
 neighbors, their complexity can be calculated by an expansion in $1-q$. First,
 we'll use this method to describe the most common type of stationary point in
-the close vicinity of a reference point. The most common neighbors of a
-reference point are given by further maximizing the two-point complexity over
+the close vicinity of a reference point. These are given by further maximizing the two-point complexity over
 the energy $E_1$ and stability $\mu_1$ of the nearby points. This gives the
 conditions
 \begin{align}
@@ -733,7 +737,9 @@ saddle point in the remaining parameters is taken, the result is
   =\frac{f'''(1)}{8f''(1)^2}(\mu_\mathrm m^2-\mu_0^2)\left(\sqrt{2+\frac{2f''(1)\big(f''(1)-f'(1)\big)}{f'''(1)f'(1)}}-1\right)(1-q)
   +O\big((1-q)^2\big)
 \end{equation}
-independent of $E_0$. To describe the properties of these most common
+independent of $E_0$. Notice that the slope of the complexity is positive for $\mu_0<\mu_\text m$ and vanishes when the stability of the reference point approaches the marginal stability. This implies that extensive saddle points have arbitrarily close neighbors with a linear pseudogap, while stable minima have an overlap gap with their nearest neighbors. For marginal minima, the existence of arbitrarily close neighbors must be decided at quadratic order and higher.
+
+To describe the properties of these most common
 neighbors, it is convenient to first make a definition. The population of
 stationary points that are most common at each energy (the blue line in
 Fig.~\ref{fig:complexities}) have the relation
@@ -748,11 +754,11 @@ $\Delta q$ are
     \label{eq:expansion.mu.1}
   \mu_1&=\mu_0-\frac{v_f}{u_f}\big(E_0-E_\mathrm{dom}(\mu_0)\big)(1-q)+O\big((1-q)^2\big)
 \end{align}
+where $v_f$ and $u_f$ are positive functionals of $f$ defined in \eqref{eq:v.and.u} of Appendix~\ref{sec:complexity-details}.
 The most common neighboring saddles to a reference saddle are much nearer to
 the reference in energy ($\Delta q^2$) than in stability ($\Delta q$). In fact,
-this scaling also holds for the entire range of neighbors to a reference
-saddle, with the limits in energy scaling like $\Delta q^2$ and those of
-stability scaling like $\Delta q$.
+this scaling also holds for all neighbors to a reference
+saddle, not just the most common.
 
 Because both expressions are proportional to $E_0-E_\mathrm{dom}(\mu_0)$,
 whether the energy and stability of nearby points increases or decreases from
@@ -766,20 +772,17 @@ To examine better the population of marginal points, it is necessary to look at
 the next term in the series of the complexity with $\Delta q$, since the linear
 coefficient becomes zero at the marginal line. This tells us something
 intuitive: stable minima have an effective repulsion between points, and one
-always finds a sufficiently small $\Delta q$ that no stationary points are
+always finds a sufficiently small $\Delta q$ such that no stationary points are
 point any nearer. For the marginal minima, it is not clear that the same should be true.
 
-When $\mu=\mu_\mathrm m$, the linear term above vanishes. Under these conditions, the quadratic term in the expansion is
+For marginal points with $\mu=\mu_\mathrm m$, the linear term above vanishes. Under these conditions, the quadratic term in the expansion for the dominant population of near neighbors is
 \begin{equation}
   \Sigma_{12}
   =\frac12\frac{f'''(1)v_f}{f''(1)^{3/2}u_f}
   \left(\sqrt{\frac{2\big[f'(1)(f'''(1)-f''(1))+f''(1)^2\big]}{f'(1)f'''(1)}}-1\right)\big(E_0-E_\textrm{th}\big)(1-q)^2+O\big((1-q)^3\big)
 \end{equation}
-Note that this expression is only true for $\mu=\mu_\mathrm m$. Therefore,
-among marginal minima, when $E_0$ is greater than the threshold one finds
-neighbors at arbitrarily close distance. When $E_0$ is less than the threshold,
-the complexity of nearby points is negative, and there is a desert where none
-are found.
+Note that this expression is only valid for $\mu=\mu_\mathrm m$. This coefficient is positive when $E_0>E_\text{th}$ and negative when $E_0<E_\text{th}$. Therefore,
+marginal minima whose energy $E_0$ is greater than the threshold have neighbors at arbitrarily close distance with a quadratic pseudogap, while those whose energy is less than the threshold have an overlap gap. Exactly at the threshold the cubic term in the expansion is necessary; it is not insightful to share explicitly but is positive for the $3+4$ and similar models.
 
 \begin{figure}
   \centering
@@ -797,11 +800,19 @@ are found.
   } \label{fig:expansion}
 \end{figure}
 
-The properties of the nearby states above the threshold can be
-further quantified. The most common points are still given by
-\eqref{eq:expansion.E.1} and \eqref{eq:expansion.mu.1}, but the range of
-available points can also be computed, and one finds that the stability lies in
-the range
+The properties of the nearby states above the threshold can be further
+quantified. Though we know from \eqref{eq:expansion.E.1} and
+\eqref{eq:expansion.mu.1} that the most common nearby points at small distance
+are extensive saddle points with higher energy than the reference point, we do
+not know what other kinds of stationary points might also be found in close
+proximity. Could these marginal minima sit at very small distance from other
+marginal minima? The answer, in the end, is that the very near neighbors are
+exclusively extensive saddles of higher energy. Therefore, even the marginal
+minima with energies above the threshold energy have overlap gaps with one
+another.
+
+The limits of stability in which nearby points are found to marginal minima
+above the threshold are given by
 $\mu_1=\mu_\mathrm m+\delta\mu_1(1-q)\pm\delta\mu_2(1-q)^{3/2}+O\big((1-q)^2\big)$
 where $\delta\mu_1$ is given by the coefficient in \eqref{eq:expansion.mu.1}
 and
@@ -810,6 +821,7 @@ and
     \frac{E_0-E_\mathrm{th}}2\frac{2f''(1)\big(f''(1)-f'(1)\big)+f'(1)f'''(1)}{u_f}
   }
 \end{equation}
+Since the limits differ from the most common points at higher order in $\Delta q$, nearby points are of the same kind as the most common population.
 Similarly, one finds that the energy lies in the range $E_1=E_0+\delta
 E_1(1-q)^2\pm\delta E_2(1-q)^{5/2}+O\big((1-q)^3\big)$ for $\delta E_1$ given
 by the coefficient in \eqref{eq:expansion.E.1} and
@@ -839,18 +851,18 @@ saddle points for the complexity of marginal minima in
 Fig.~\ref{fig:expansion}, and the results agree well at small $1-q$.
 
 
-\section{Isolated eigenvalue}
+\section{Finding the isolated eigenvalue}
 \label{sec:eigenvalue}
 
-The two-point complexity depends on the spectrum at both stationary points
+The two-point complexity $\Sigma_{12}$ depends on the spectrum at both stationary points
 through the determinant of their Hessians, but only on the bulk of the
 distribution. As we saw, this bulk is unaffected by the conditions of energy
 and proximity. However, these conditions give rise to small-rank perturbations
 to the Hessian, which can lead a subextensive number of eigenvalues leaving the
 bulk. We study the possibility of \emph{one} stray eigenvalue.
 
-We use a technique recently developed to find the smallest eigenvalue of a
-random matrix \cite{Ikeda_2023_Bose-Einstein-like}. One defines a quadratic
+We use a technique recently developed to find the smallest eigenvalue of
+random matrices \cite{Ikeda_2023_Bose-Einstein-like}. One defines an artificial quadratic
 statistical mechanics model with configurations defined on the sphere, whose
 interaction tensor is given by the matrix of interest. By construction, the
 ground state is located in the direction of the eigenvector associated with the
@@ -924,7 +936,7 @@ model, conditioned on the relative position, energies, and stabilities
 discussed above. We must restrict the artificial spherical model to lie in the
 tangent plane of the `real' spherical configuration space at the point of
 interest, to avoid our eigenvector pointing in a direction that violates the
-spherical constraint. A sketch of the setup is shown in Fig.~\ref{fig:sphere}. The free energy of this model given a point $\mathbf s$
+spherical constraint. A sketch of the setup is shown in Fig.~\ref{fig:sphere}. The free energy of the artificial model given a point $\mathbf s$
 and a specific realization of the disordered Hamiltonian is
 \begin{equation}
   \begin{aligned}
@@ -951,7 +963,7 @@ giving
     &=\lim_{n\to0}\int\left[\prod_{a=1}^nd\nu_H(\mathbf s_a,\omega_a\mid E_1,\mu_1)\,\delta(Nq-\pmb\sigma\cdot\mathbf s_a)\right]F_H(\beta\mid\mathbf s_1,\omega_1)
   \end{aligned}
 \end{equation}
-again anticipating the use of replicas. Finally, the reference configuration $\pmb\sigma$ should itself be a stationary point of $H$ with its own energy density and stability. Averaging over these conditions gives
+again anticipating the use of replicas. Finally, the reference configuration $\pmb\sigma$ should itself be a stationary point of $H$ with its own energy density and stability, as in the previous section. Averaging over these conditions gives
 \begin{equation}
   \begin{aligned}
     F_H(\beta\mid E_1,\mu_1,E_2,\mu_2,q)
@@ -971,20 +983,60 @@ $\pmb\sigma$ replicas constrained to lie at fixed overlap with \emph{all} the
 $\mathbf s$ replicas, and the second is the only of the $\mathbf s$ replicas at
 which the Hessian is evaluated.
 
+The calculation of this minimum eigenvalue is very similar to that of the
+complexity. The details of this calculation can be found in
+Appendix~\ref{sec:eigenvalue-details}. The result for the minimum eigenvalue is given by
+\begin{equation} \label{eq:minimum.eigenvalue.text}
+  \lambda_\mathrm{min}
+  =\mu_1-\left(y+\frac1yf''(1)\right)
+\end{equation}
+where $y$ is an order parameter whose value is set by the saddle-point conditions
+\begin{align} \label{eq:eigen.conditions.main}
+  0=-f''(1)+y^2(1-\mathcal X^TC\mathcal X)
+  &&
+  0=(B-yC)\mathcal X
+\end{align}
+for $\mathcal X\in\mathbb R^5$ a vector of order parameters, and $B$ and $C$
+are $5\times 5$ matrices whose elements are explicit functions of the order
+parameters from the two-point complexity problem and of $f$ and its
+derivatives. The matrices $B$ and $C$ are given in \eqref{eq:matrix.b} and \eqref{eq:matrix.c} of Appendix~\ref{sec:eigenvalue-details}.
 
-In this solution, we simultaneously find the smallest eigenvalue and information
-about the orientation of its associated eigenvector: namely, its overlap with
-the tangent vector that points directly toward the reference spin. This is
-directly related to $x_0$. This tangent vector is $\mathbf x_{0\leftarrow
-1}=\frac1{1-q}\big(\pmb\sigma_0-q\mathbf s_a\big)$, which is normalized and
-lies strictly in the tangent plane of $\mathbf s_a$. Then
+There is a trivial solution for $\mathcal X=0$ and $y^2=f''(1)$. This results
+in a minimum eigenvalue
 \begin{equation}
-  q_\textrm{min}=\frac{\mathbf x_{0\leftarrow 1}\cdot\mathbf x_\mathrm{min}}N
-  =\frac{x_0}{1-q}
+  \lambda_\mathrm{min}=\mu_1-\sqrt{4f''(1)}=\mu_1-\mu_\mathrm m
 \end{equation}
-The emergence of an isolated eigenvalue and its associated eigenvector are
-shown in Fig.~\ref{fig:isolated.eigenvalue}, for the same reference point
-properties as in Fig.~\ref{fig:min.neighborhood}.
+that corresponds with the bottom edge of the semicircle distribution. This is
+the correct solution in the absence of an isolated eigenvalue. Any nontrivial
+solution, corresponding to an isolated eigenvalue, must have nonzero $\mathcal
+X$. The only way to satisfy this with the second of the saddle conditions
+\eqref{eq:eigen.conditions.main} is for $y$ such that one of the eigenvalues of
+$B-yC$ is zero. Under these circumstances, if the normalized eigenvector
+associated with the zero eigenvalue is $\hat{\mathcal X}_0$, then $\mathcal
+X=\|\mathcal X_0\|\hat{\mathcal X}_0$ is a solution. The magnitude $\|\mathcal
+X_0\|$ of the solution is set by the first saddle point condition, namely
+\begin{equation}
+  \|\mathcal X_0\|^2=\frac1{\hat{\mathcal X}_0^TC\hat{\mathcal X}_0}\left(1-\frac{f''(1)}{y^2}\right)
+\end{equation}
+In practice, we find that $\hat{\mathcal X}_0^TC\hat{\mathcal X}_0$ is positive
+at the saddle point. Therefore, for the solution to make sense we must have
+$y^2\geq f''(1)$. In practice, there is at most \emph{one} $y$ which produces a
+zero eigenvalue of $B-yC$ and satisfies this inequality, so the solution seems
+to be unique.
+
+With this solution, we simultaneously find the smallest eigenvalue and
+information about the orientation of its associated eigenvector: namely, its
+overlap $q_\mathrm{min}$ with the tangent vector that points directly toward
+the reference spin. This information is encoded in the order parameter vector
+$\mathcal X$, and the detail of how it is computed can be found at the end of
+Appendix~\ref{sec:eigenvalue-details}. The emergence of an isolated eigenvalue
+and its associated eigenvector are shown in Fig.~\ref{fig:isolated.eigenvalue},
+for the same reference point properties that were used in
+Fig.~\ref{fig:min.neighborhood}. For small overlaps, the minimum eigenvalue
+corresponds with the bottom of the semicircle distribution, or the trivial
+solution. As the overlap is increased, one eigenvalue continuously leaves the
+spectrum, with an eigenvector whose overlap with the vector between stationary
+points also grows continuously from zero.
 
 \begin{figure}
   \includegraphics{figs/isolated_eigenvalue.pdf}
@@ -1005,6 +1057,19 @@ properties as in Fig.~\ref{fig:min.neighborhood}.
   } \label{fig:isolated.eigenvalue}
 \end{figure}
 
+Though the two-point complexity $\Sigma_{12}$ fails to distinguish the marginal
+minima at the limits of aging dynamics, one might imagine that something
+related to the isolated eigenvalue might succeed in distinguishing them. This
+does not appear to be the case. Above and below the threshold energy, the
+nature of the isolated eigenvalue of nearest neighbors does not change: it is
+always present and varies continuously. There is an energy both above and below
+the threshold where the nearest marginal states transition from having an
+isolated eigenvalue to not having one; see for instance in the right panel of
+Fig.~\ref{fig:marginal.prop.above} that the grey region vanishes. One might
+reason that this could change the connectivity of nearby marginal-like states and
+thereby the aging dynamics. However, these energies are not close to the limits
+of aging dynamics measured by \cite{Folena_2020_Rethinking}, so that reasoning is wrong.
+
 \section{Conclusion}
 \label{sec:conclusion}
 
@@ -1023,8 +1088,8 @@ relaxes towards marginal states with energies below the threshold energy
 \cite{Folena_2020_Rethinking, Folena_2021_Gradient}. We found (see especially
 Figs.~\ref{fig:marginal.prop.below} and \ref{fig:marginal.prop.above}) that the
 neighborhoods of marginal states above and below the threshold are quite
-different, and yet the emergent aging behaviors relaxing toward states above and
-below the threshold seem to be the same. Therefore, this kind of dynamics
+different, and yet the emergent aging behavior relaxing toward states above and
+below the threshold seems to be the same. Therefore, aging dynamics
 appears to be insensitive to the neighborhood of the marginal state being
 approached. To understand something better about why certain states attract the
 dynamics in certain situations, nonlocal information, like the
@@ -1043,7 +1108,7 @@ removed by a more detailed saddle point ansatz.
 
 Our calculation studied the neighborhood of typical reference points with the
 given energy and stability. However, it is possible that marginal minima with
-atypical neighborhoods actually attract the dynamics. To determine this, a
+atypical neighborhoods actually attract the dynamics, as has been argued in certain neural networks \cite{Baldassi_2016_Unreasonable, Baldassi_2021_Unveiling}. To determine this, a
 different type of calculation is needed. As our calculation is akin to the
 quenched Franz--Parisi potential, study of atypical neighborhoods would entail
 something like the annealed Franz--Parisi approach, i.e.,
@@ -1077,11 +1142,33 @@ INFN.
 
 \appendix
 
-\section{Details of calculation for the two-point complexity}
+\section{Details of the calculation for the two-point complexity}
 \label{sec:complexity-details}
 
+The two-point complexity defined in \eqref{eq:complexity.definition} consists
+of the average over integrals consisting of products of Dirac
+$\delta$-functions and determinants of Hessians. To compute it, we first split
+the factors into two groups: one group that contains any dependence on the
+Hessian (the determinants and the $\delta$-functions fixing the stabilities)
+and a second group containing all other $\delta$-functions. The average over
+disorder for the two groups of factors can be made independently, which is
+described in subsections \ref{subsec:hessian} and \ref{subsec:other.factors}
+for the Hessian and other factors, respectively.
+
+Once the average is made over disorder, the result is an exponential integral
+that depends only on scalar products between the replicated configurations
+$\mathbf s$ and $\pmb\sigma$ and their conjugate fields. The explicit
+dependence on these microscopic configurations is removed using a
+Hubbard--Stratonovich transformation, which replaces the scalar products with
+overlap order parameters. This is described in subsection
+\ref{subsec:hubbard.strat}. Finally, the complexity is an exponential integral
+over several order parameter fields, and is amenable to evaluation by a saddle
+point method, detailed in subsection \ref{subsec:saddle}.
+
 \subsection{The Hessian factors}
+\label{subsec:hessian}
 
+The factors dependent on the Hessian can be averaged over disorder using results from random matrix theory.
 The double partial derivatives of the energy are Gaussian with the variance
 \begin{equation}
   \overline{(\partial_i\partial_jH(\mathbf s))^2}=\frac1Nf''(1)
@@ -1104,13 +1191,13 @@ fields the Hessian is independent of these \cite{Bray_2007_Statistics}. In
 principle the fact that we have conditioned the Hessian to belong to stationary
 points of certain energy, stability, and proximity to another stationary point
 will modify its statistics, but these changes will only appear at subleading
-order in $N$ \cite{Ros_2019_Complexity}. At leading order, the various expectations factorize, each yielding
+order in $N$ \cite{Ros_2019_Complexity}. This is because the conditioning amounts to a rank-one perturbation to the Hessian matrix. At leading order, the expectations related to different replicas factorize, each yielding
 \begin{equation}
   \overline{\big|\det\operatorname{Hess}H(\mathbf s,\omega)\big|\,\delta\big(N\mu-\operatorname{Tr}\operatorname{Hess}H(\mathbf s,\omega)\big)}
   =e^{N\int d\lambda\,\rho(\lambda+\mu)\log|\lambda|}\delta(N\mu-N\omega)
 \end{equation}
-Therefore, all of the Lagrange multipliers are fixed to the stabilities $\mu$. We define the function
-\begin{equation}
+Therefore, each of the Lagrange multipliers is fixed to one of the stabilities $\mu$. We define the function
+\begin{equation} \label{eq:hessian.func}
   \begin{aligned}
     \mathcal D(\mu)
     &=\int d\lambda\,\rho(\lambda+\mu)\log|\lambda| \\
@@ -1123,17 +1210,20 @@ Therefore, all of the Lagrange multipliers are fixed to the stabilities $\mu$. W
     \end{cases}
   \end{aligned}
 \end{equation}
-and the full factor due to the Hessians is
+and the full factor due to the Hessians can be written
 \begin{equation}
   e^{Nm\mathcal D(\mu_0)+Nn\mathcal D(\mu_1)}\left[\prod_a^m\delta(N\mu_0-N\varsigma_a)\right]\left[\prod_a^n\delta(N\mu_1-N\omega_a)\right]
 \end{equation}
 
 \subsection{The other factors}
+\label{subsec:other.factors}
 
-Having integrated over the Lagrange multipliers using the $\delta$-functions
-resulting from the average of the Hessians, any $\delta$-functions in the
-remaining integrand we Fourier transform into their integral representation
-over auxiliary fields. The resulting integrand has the form
+The other factors consist of $\delta$-functions of the gradient and $\delta$-functions containing the energy and spherical constraints. We take advantage of the Fourier representation of the $\delta$-function to express each of them as an exponential integral over an auxiliary field. For instance,
+\begin{equation}
+  \delta\big(\nabla H(\mathbf s,\mu_1)\big)
+  =\int\frac{d\hat{\mathbf s}}{(2\pi)^N}e^{i\hat{\mathbf s}\cdot\nabla H(\mathbf s,\mu_1)}
+\end{equation}
+replaces the $\delta$-function over the gradient by introducing the auxiliary field $\hat{\mathbf s}$. Carrying out such a transformation to each of the remaining factors gives an exponential integrand of the form
 \begin{equation}
   e^{
     Nm\hat\beta_0E_0+Nn\hat\beta_1E_1
@@ -1158,6 +1248,7 @@ where we have introduced the linear operator
     i\hat{\mathbf s}_a\cdot\partial_{\mathbf t}-\hat\beta_1
   \right)
 \end{equation}
+consolidating all of the $H$-dependent terms.
 Here the $\hat\beta$s are the fields auxiliary to the energy constraints, the
 $\hat\mu$s are auxiliary to the spherical and overlap constraints, and the
 $\hat{\pmb\sigma}$s and $\hat{\mathbf s}$s are auxiliary to the constraint that
@@ -1168,11 +1259,12 @@ We have written the $H$-dependent terms in this strange form for the ease of tak
   =e^{\frac12\int d\mathbf t\,d\mathbf t'\,\mathcal O(\mathbf t)\mathcal O(\mathbf t')\overline{H(\mathbf t)H(\mathbf t')}}
   =e^{N\frac12\int d\mathbf t\,d\mathbf t'\,\mathcal O(\mathbf t)\mathcal O(\mathbf t')f\big(\frac{\mathbf t\cdot\mathbf t'}N\big)}
 \end{equation}
-It remains only to apply the doubled operators to $f$ and then evaluate the simple integrals over the $\delta$ measures. We do not include these details, which are standard.
+It remains only to apply the doubled operators to $f$ and then evaluate the simple integrals over the $\delta$ measures. We do not include these details, which were carried out with computer algebra software.
 
 \subsection{Hubbard--Stratonovich}
+\label{subsec:hubbard.strat}
 
-Having expanded this expression, we are left with an argument in the exponential which is a function of scalar products between the fields $\mathbf s$, $\hat{\mathbf s}$, $\pmb\sigma$, and $\hat{\pmb\sigma}$. We will change integration coordinates from these fields to matrix fields given by their scalar products, defined as
+Having expanded the resulting expression, we are left with an argument in the exponential which is a function of scalar products between the fields $\mathbf s$, $\hat{\mathbf s}$, $\pmb\sigma$, and $\hat{\pmb\sigma}$. We will change integration coordinates from these fields to matrix fields given by their scalar products, defined as
 \begin{equation} \label{eq:fields}
   \begin{aligned}
     C^{00}_{ab}=\frac1N\pmb\sigma_a\cdot\pmb\sigma_b &&
@@ -1189,18 +1281,57 @@ Having expanded this expression, we are left with an argument in the exponential
 \end{equation}
 We insert into the integral the product of $\delta$-functions enforcing these
 definitions, integrated over the new matrix fields, which is equivalent to
-multiplying by one. Once this is done, the many scalar products appearing
-throughout can be replaced by the matrix fields, and the original vector fields
-can be integrated over. Conjugate matrix field integrals created when the
-$\delta$-functions are promoted to exponentials can be evaluated by saddle
-point in the standard way, yielding an effective action depending on the above
-matrix fields alone.
-
-\subsection{Saddle point}
+multiplying by one. For example, one such factor of one is given by
+\begin{equation}
+  1=\int dC^{00}\,\prod_{ab}^m\delta(NC^{00}_{ab}-\pmb\sigma_a\cdot\pmb\sigma_b)
+\end{equation}
+Once this is done, the many scalar products appearing throughout can be
+replaced by the matrix fields. The only dependence on the original vector
+fields is through these new $\delta$-functions. These are treated schematically in
+the following way: let $\{\mathbf a_a\}=\{\mathbf s_a,\pmb\sigma_a,\hat{\mathbf
+s}_a,\hat{\pmb\sigma}_a\}$ index all of the original vector fields, and let
+$Q_{ab}=\frac1N\mathbf a_a\cdot\mathbf a_b$ likewise concatenate all of the
+matrix fields. Then the $\delta$-functions described above can be promoted to an exponential integral of the form
+\begin{equation}
+  \int d\mathbf a\,d\hat Q\,e^{
+    N\frac12\operatorname{Tr}\hat QQ
+    -\frac12\mathbf a^T\hat Q\mathbf a
+  }
+\end{equation}
+The integral over the vector fields $\mathbf a$ is Gaussian and can be evaluated, giving
+\begin{equation}
+  \int d\hat Q\,e^{
+    N\frac12\operatorname{Tr}\hat QQ
+  }(\det\hat Q)^{-N/2}
+  =
+  \int d\hat Q\,e^{
+    \frac12 N(\operatorname{Tr}\hat QQ
+    -\log\det\hat Q)
+  }
+\end{equation}
+Finally, the integral over $\hat Q$ can be evaluated using the saddle point
+method, giving $\hat Q=Q^{-1}$. Therefore, the term contributed to the effective
+action in the matrix fields as a result of the transformation is
+$N(\frac12+\frac12\log\det Q)$.
+
+\subsection{Replica ansatz and saddle point}
+\label{subsec:saddle}
+
+After the transformation of the previous section, the complexity has been
+brought to the form of an exponential integral over the matrix order parameters
+\eqref{eq:fields}, proportional to $N$. We are therefore in the position to
+evaluate this integral using a saddle point method.
+In this paper, we will focus on models with a replica symmetric complexity, but
+many of the intermediate formulae are valid for arbitrary replica symmetry
+breakings. At most {\oldstylenums1}\textsc{rsb} in the equilibrium is guaranteed if the function
+$\chi(q)=f''(q)^{-1/2}$ is convex \cite{Crisanti_1992_The}. The complexity at the ground state must
+reflect the structure of equilibrium, and therefore be replica symmetric.
+Recent work has found that the complexity of saddle points can produce
+other \textsc{rsb} order even when the ground state is replica symmetric, but the $3+4$ model has a safely replica symmetric complexity everywhere \cite{Kent-Dobias_2023_When}.
 
 We will always assume that the square matrices $C^{00}$, $R^{00}$, $D^{00}$,
-$C^{11}$, $R^{11}$, and $D^{11}$ are hierarchical matrices, with each set of
-three sharing the same hierarchical structure. In particular, we immediately
+$C^{11}$, $R^{11}$, and $D^{11}$ are hierarchical matrices, i.e., of the Parisi form, with each set of
+three sharing the same structure. In particular, we immediately
 define $c_\mathrm d^{00}$, $r_\mathrm d^{00}$, $d_\mathrm d^{00}$, $c_\mathrm d^{11}$, $r_\mathrm d^{11}$, and
 $d_\mathrm d^{11}$ as the value of the diagonal elements of these matrices,
 respectively. Note that $c_\mathrm d^{00}=c_\mathrm d^{11}=1$ due to the spherical constraint.
@@ -1210,7 +1341,7 @@ R^{00}, D^{00})$, $\mathcal Q_{11}=(\hat\beta_1, \hat\mu_1, C^{11}, R^{11},
 D^{11})$, and $\mathcal Q_{01}=(\hat\mu_{01},C^{01},R^{01},R^{10},D^{01})$
 the resulting complexity is
 \begin{equation}
-  \Sigma_{01}
+  \Sigma_{12}
   =\frac1N\lim_{n\to0}\lim_{m\to0}\frac\partial{\partial n}\int d\mathcal Q_{00}\,d\mathcal Q_{11}\,d\mathcal Q_{01}\,e^{Nm\mathcal S_0(\mathcal Q_{00})+Nn\mathcal S_1(\mathcal Q_{11},\mathcal Q_{01}\mid\mathcal Q_{00})}
 \end{equation}
 where
@@ -1272,33 +1403,26 @@ symmetric complexity of the reference point, this results in
   \right)^2
 \end{align}
 where we define for brevity (here and elsewhere) the constants
-\begin{align}
+\begin{align} \label{eq:v.and.u}
   u_f=f(1)\big(f'(1)+f''(1)\big)-f'(1)^2
   &&
   v_f=f'(1)\big(f''(1)+f'''(1)\big)-f''(1)^2
 \end{align}
 Note that because the coefficients of $f$ must be nonnegative for $f$ to
-be a sensible covariance, both $u_f$ and $v_f$ are strictly positive. Note also
+be a sensible covariance, both $u_f$ and $v_f$ are strictly positive for mixed models.\footnote{
+  Note also
 that $u_f=v_f=0$ if $f$ is a homogeneous polynomial as in the pure models.
 These expressions are invalid for the pure models because $\mu_0$ and $E_0$
 cannot be fixed independently; we would have done the equivalent of inserting
 two identical $\delta$-functions. For the pure models, the terms $\hat\beta_0$ and
 $\hat\beta_1$ must be set to zero in our prior formulae (as if the energy was
 not constrained) and then the saddle point taken.
+}
 
 
 In general, we expect the $m\times n$ matrices $C^{01}$, $R^{01}$, $R^{10}$,
 and $D^{01}$ to have constant \emph{rows} of length $n$, with blocks of rows
 corresponding to the \textsc{rsb} structure of the single-point complexity.
-
-In this paper, we will focus on models with a replica symmetric complexity, but
-many of the intermediate formulae are valid for arbitrary replica symmetry
-breakings. At most {\oldstylenums1}\textsc{rsb} in the equilibrium is guaranteed if the function
-$\chi(q)=f''(q)^{-1/2}$ is convex \cite{Crisanti_1992_The}. The complexity at the ground state must
-reflect the structure of equilibrium, and therefore be replica symmetric.
-Recent work has found that the complexity of saddle points can produce
-other \textsc{rsb} order even when the ground state is replica symmetric, but the $3+4$ model has a safely replica symmetric complexity everywhere \cite{Kent-Dobias_2023_When}.
-
 For
 the scope of this paper, where we restrict ourselves to replica symmetric
 complexities, they have the following form at the saddle point:
@@ -1346,7 +1470,10 @@ where only the first row is nonzero. The other entries, which correspond to the
 completely uncorrelated replicas in an \textsc{rsb} picture, are all zero
 because uncorrelated vectors on the sphere are orthogonal.
 
-The inverse of block hierarchical matrix is still a block hierarchical matrix, since
+The most challenging part of inserting our replica symmetry ansatz is the
+volume element in the $\log\det$, which involves the product and inverse of
+block replica matrices. The inverse of a block hierarchical matrix is still a
+block hierarchical matrix, since
 \begin{equation}
   \begin{bmatrix}
     C^{00}&iR^{00}\\iR^{00}&D^{00}
@@ -1446,9 +1573,9 @@ given by
 \begin{equation} \label{eq:action.eigenvalue}
   \begin{aligned}
     \ell\mathcal S_x(\mathcal Q_x\mid\mathcal Q)
-    =-\frac12\ell\beta\mu+
+    =-&\frac12\ell\beta\mu_1+
     \frac12\beta\sum_b^\ell\bigg\{
-      \frac12\beta&f''(1)\sum_a^lA_{ab}^2\\
+      \frac12\beta f''(1)\sum_a^\ell A_{ab}^2\\
     &+\sum_a^m\left[
         \big(\hat\beta_0f''(C^{01}_{a1})+R^{10}_{a1}f'''(C^{01}_{a1})\big)(X^0_{ab})^2
         +2f''(C^{01}_{a1})X^0_{ab}\hat X^0_{ab}
@@ -1651,11 +1778,11 @@ where $a_{k+1}=1$ and $x_{k+1}=1$.
 The basic form of the action is (for replica symmetric $A$)
 \begin{equation}
   2\mathcal S_x(\mathcal Q_x\mid\mathcal Q)
-  =-\beta\mu+\frac12\beta^2f''(1)(1-a_0^2)+\log(1-a_0)+\frac{a_0}{1-a_0}+\mathcal X^T\left(\beta B-\frac1{1-a_0}C\right)\mathcal X
+  =-\beta\mu_1+\frac12\beta^2f''(1)(1-a_0^2)+\log(1-a_0)+\frac{a_0}{1-a_0}+\mathcal X^T\left(\beta B-\frac1{1-a_0}C\right)\mathcal X
 \end{equation}
 where the matrix $B$ comes from the $\mathcal X$-dependent parts of the first
 lines of \eqref{eq:action.eigenvalue} and is given by
-\begin{equation}
+\begin{equation} \label{eq:matrix.b}
   B=\begin{bmatrix}
     \hat\beta_0f''(q)+r_{10}f'''(q)&f''(q)&0&0&0\\
     f''(q)&0&0&0&0\\
@@ -1693,7 +1820,7 @@ and where the matrix $C$ encodes the coefficients of the quadratic form
   \qquad
   C_{24}=f'(q)
   \qquad
-  C_{25}=-C_{24}
+  C_{25}=-C_{24} \label{eq:matrix.c}
   \\
   \notag
   &
@@ -1742,7 +1869,7 @@ $a_0=1-(y\beta)^{-1}$. Upon inserting this scaling and taking the limit, we
 finally find
 \begin{equation}
   \lambda_\mathrm{min}=-2\lim_{\beta\to\infty}\frac1\beta\mathcal S_x
-  =\mu-\left(y+\frac1yf''(1)\right)
+  =\mu_1-\left(y+\frac1yf''(1)\right)
 \end{equation}
 with associated saddle point conditions
 \begin{align}
@@ -1754,7 +1881,7 @@ The trivial solution, which gives the bottom of the semicircle, is for
 $\mathcal X=0$. When this is satisfied, the first equation gives $y^2=f''(1)$,
 and
 \begin{equation}
-  \lambda_\mathrm{min}=\mu-\sqrt{4f''(1)}=\mu-\mu_\mathrm m
+  \lambda_\mathrm{min}=\mu_1-\sqrt{4f''(1)}=\mu_1-\mu_\mathrm m
 \end{equation}
 as expected. The nontrivial solutions have nonzero $\mathcal X$. The only way
 to satisfy this with the saddle conditions is for $y$ such that one of the
@@ -1771,6 +1898,16 @@ $y^2\geq f''(1)$. In practice, there is at most \emph{one} $y$ which produces a
 zero eigenvalue of $B-yC$ and satisfies this inequality, so the solution seems
 to be unique.
 
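+The overlap $q_\mathrm{min}$ between the eigenvector $\mathbf x_\mathrm{min}$ associated with $\lambda_\mathrm{min}$ and the tangent vector pointing from $\mathbf s_a$ toward $\pmb\sigma_0$ is
+directly related to $x_0$. This tangent vector is $\mathbf x_{0\leftarrow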
+1}=\frac1{1-q}\big(\pmb\sigma_0-q\mathbf s_a\big)$, which is normalized and
+lies strictly in the tangent plane of $\mathbf s_a$. Then
+\begin{equation}
+  q_\mathrm{min}=\frac{\mathbf x_{0\leftarrow 1}\cdot\mathbf x_\mathrm{min}}N
+  =\frac{x_0}{1-q}
+\end{equation}
+
+
 \section{Franz--Parisi potential}
 \label{sec:franz-parisi}
 
-- 
cgit v1.2.3-70-g09d2