summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--marginal.tex147
1 files changed, 98 insertions, 49 deletions
diff --git a/marginal.tex b/marginal.tex
index ccfe579..5c3e033 100644
--- a/marginal.tex
+++ b/marginal.tex
@@ -17,14 +17,13 @@
\affiliation{Istituto Nazionale di Fisica Nucleare, Sezione di Roma I, Rome, Italy 00184}
\begin{abstract}
- Marginal optima are minima or maxima of a function with many asymptotically
+ Marginal optima are minima or maxima of a function with many nearly
flat directions. In settings with many competing optima, marginal ones tend
to attract algorithms and physical dynamics. Often, the important family of
marginal attractors is a vanishing minority compared with nonmarginal optima
and other unstable stationary points. We introduce a generic technique for
conditioning the statistics of stationary points on their marginality, and
- apply it in three isotropic settings with different typical forms for the
- Hessian at optima: in the spherical spin-glasses, where the Hessian is GOE;
+ apply it in three isotropic settings with qualitatively different structure: in the spherical spin glasses, where the Hessian is GOE;
in multispherical spin glasses, which are Gaussian but non-GOE; and in a
model of random nonlinear sum of squares, which is non-Gaussian. In these
problems we are able to fully characterize the distribution of marginal
@@ -137,6 +136,7 @@ $A$ (or indeed more complicated averages) in order to condition that the
minimum eigenvalue is zero.
\subsection{Simple example: shifted GOE}
+\label{sec:shifted.GOE}
We demonstrate the efficacy of the technique by rederiving a well-known result:
the large-deviation function for pulling an eigenvalue from the bulk of the
@@ -330,7 +330,7 @@ configuration. We can therefore choose $\mu=\mu_\textrm m$ such that
\begin{equation}
0=\frac\partial{\partial\lambda^*}G_{\lambda^*}(\mu_\mathrm m)\bigg|_{\lambda^*=0}
\end{equation}
-In the previous problem, this corresponds precisely to $\mu_\mathrm m=2\sigma$,
+In the example problem of section \ref{sec:shifted.GOE}, this corresponds precisely to $\mu_\mathrm m=2\sigma$,
the correct marginal shift. Note that when we treat the Dirac $\delta$ function
using its Fourier representation with auxiliary parameter $\hat\lambda$, as in
the previous subsection, this condition corresponds with choosing $\mu$ such
@@ -461,11 +461,14 @@ more nontrivial settings.
The procedure to treat the complexity of the spherical models has been described in
detail elsewhere \cite{Kent-Dobias_2023_How}. Here we will merely sketch the steps that are standard. We start by translating elements of the Kac--Rice measure into terms more familiar to physicists. This means writing
\begin{align}
+ \label{eq:delta.grad}
\delta\big(\nabla H(\mathbf x_a,\pmb\omega_a)\big)
&=\int\frac{d\hat{\mathbf x}_a}{(2\pi)^N}e^{i\hat{\mathbf x}_a^T\nabla H(\mathbf x_a,\pmb\omega_a)} \\
+ \label{eq:delta.energy}
\delta\big(NE-H(\mathbf x_a)\big)
&=\int\frac{d\hat\beta_a}{2\pi}e^{\hat\beta_a(NE-H(\mathbf x_a))} \\
\delta\big(N\lambda^*-\mathbf s^T\operatorname{Hess}H(\mathbf x_a,\pmb\omega)\mathbf s\big)
+ \label{eq:delta.eigen}
&=\int\frac{d\hat\lambda_a}{2\pi}e^{\hat\lambda_a(N\lambda^*-\mathbf s^T\operatorname{Hess}H(\mathbf x_a,\pmb\omega)\mathbf s)}
\end{align}
for the Dirac $\delta$ functions. At this point we will also discuss an
@@ -657,6 +660,7 @@ $\Omega=S^{N-1}\times S^{N-1}$
\end{equation}
\subsection{Random nonlinear least squares}
+\label{sec:least.squares}
In this subsection we consider perhaps the simplest example of a non-Gaussian
landscape: the problem of random nonlinear least squares optimization. Though,
@@ -700,45 +704,17 @@ Applying the Lagrange multiplier method detailed above to enforce the spherical
\\
\operatorname{Hess}H(\mathbf x,\omega)=\partial V_k(\mathbf x)\partial V_k(\mathbf x)+V_k(\mathbf x)\partial\partial V_k(\mathbf x)+\omega I
\end{align}
-\begin{widetext}
-\begin{equation}
- \begin{aligned}
- &\mathcal S
- =-\frac1n\frac\alpha2\left\{\log\det\left[
- \hat\beta f(C)+\Big(
- f'(C)\odot D+(G\odot G-R\odot R)\odot f''(C)
- \Big)f(C)
- +(I+R\odot f'(C))^2
- \right]-\log\det(I+G\odot f'(C))^2\right\} \\
- &+\frac1n\frac12\Big(\log\det(CD+R^2)-\log\det G^2\Big)
- +\hat\beta E+(g_d-r_d)\mu
- \end{aligned}
-\end{equation}
-where $\odot$ gives the Hadamard or componentwise product between the matrices, while other products and powers are matrix products and powers.
-
-\begin{equation}
- \begin{aligned}
- &\hat\beta E+\mu(g_d-r_d)+\frac12\log\frac{d_d+r_d^2}{g_d^2} \\
- &-\frac\alpha2\log\left[
- 1+\hat\beta\big(f(1)-f(0)\big)
- \Big(d_d\big(f(1)-f(0)\big)+r_d\big(2+r_df'(1)\big)\Big)f'(1)
- +(g_d^2-r_d^2)\big(f(1)-f(0)\big)f''(1)
- \right] \\
- &-\alpha f(0)\left(
- \big(f(1)-f(0)\big)+\frac{1+r_d\big(2+r_df'(1)\big)f'(1)}{\hat\beta+d_df'(1)+(g_d^2-r_d^2)f''(1)}
- \right)^{-1}
- \end{aligned}
-\end{equation}
-
-In the case where $\mu$ is not specified, in which the model is supersymmetric, $D=\hat\beta R$ and the effective action becomes particularly simple:
-\begin{equation}
- \hat\beta e
- -\frac12\frac{\alpha f(0)}{1+\hat\beta\big(f(1)-f(0)\big)+r_df'(1)}
- -\frac\alpha2\log\left(1+\frac{\hat\beta\big(f(1)-f(0)\big)}{1+r_df'(1)}\right)
- +\frac12\log\frac{\hat\beta+r_d}{r_d}
-\end{equation}
-
-\cite{DeWitt_1992_Supermanifolds}
+As in the spherical and multispherical models, fixing the trace of the Hessian
+at largest order in $N$ is equivalent to constraining the value of the Lagrange
+multiplier $\omega=\mu$, since the trace of the random parts of the Hessian
+matrix contributes typical values at a lower order in $N$.
+
+The derivation of the marginal complexity for this model is complicated, but
+can be carried out schematically like the derivation of the equilibrium free
+energy by use of superspace coordinates \cite{DeWitt_1992_Supermanifolds}.
+The use of superspace coordinates in the geometry and dynamics of disordered
+systems is well-established. Here, we introduce a novel extension of the
+traditional approach to incorporate the marginality condition.
Consider supervectors in the $\mathbb R^{N|4}$ superspace of the form
\begin{equation}
\pmb\phi_{a\alpha}(1,2)
@@ -747,32 +723,53 @@ Consider supervectors in the $\mathbb R^{N|4}$ superspace of the form
+i\hat{\mathbf x}_a\bar\theta_1\theta_1
+\mathbf s_{a\alpha}(\bar\theta_1\theta_2+\bar\theta_2\theta_1)
\end{equation}
-The Kac--Rice measure with the eigenvalue-fixing term included is
+The traditional complexity problem, outlined in Appendix~\ref{sec:dominant.complexity},
+involves a supervector without the last term.
+\begin{widetext}
+ The replicated number of stationary points conditioned on energy $E$, trace $\mu$, and minimum eigenvalue $\lambda^*$ is then given by
\begin{equation}
\begin{aligned}
\mathcal N(E,\mu,\lambda^*)^n
- &=\int\prod_{a=1}^n\prod_{\alpha=1}^{m_a}d\pmb\phi_{a\alpha}
+ &=\int\prod_{a=1}^n\lim_{m_a\to0}\prod_{\alpha=1}^{m_a}d\pmb\phi_{a\alpha}
\exp\left\{
\delta_{\alpha1}N(\hat\beta_aE+\hat\lambda_a\lambda^*)
+\int d1\,d2\,B_{a\alpha}(1,2)\left[H(\pmb\phi_{a\alpha})+\frac12\mu(\|\pmb\phi_{a\alpha}\|^2-N)\right]
\right\}
\end{aligned}
\end{equation}
+where we use the compact notation $d1=d\theta_1\,d\bar\theta_1$ for the
+measures associated with the Grassmann directions. Here we have also defined
\begin{equation}
B_{a\alpha}(1,2)=\delta_{\alpha1}\bar\theta_2\theta_2
(1-\hat\beta_a\bar\theta_1\theta_1)
-\delta_{\alpha1}\hat\lambda_a-\beta
\end{equation}
+which encodes various aspects of the complexity problem, and the measure
\begin{align}
d\pmb\phi_{a\alpha}
=d\mathbf x_a\,\delta(\|\mathbf x_a\|^2-N)\,\frac{d\hat{\mathbf x}_a}{(2\pi)^N}\,d\pmb\eta_a\,d\bar{\pmb\eta}_a\,
d\mathbf s_{a\alpha}\,\delta(\|\mathbf s_{a\alpha}\|^2-N)\,
\delta(\mathbf x_a^T\mathbf s_{a\alpha})
\end{align}
-
-\begin{equation}
- i\int d1\,d2\,\hat v_{a\alpha}^k(1,2)(V^k(\pmb\phi_{a\alpha}(1,2))-v_{a\alpha}^k(1,2))
-\end{equation}
+encoding the measures of all the superfield's constituent variables. Expanding
+functions of the superfield in the coordinates $\theta$ and performing the
+integrals, this expression is equivalent to that of the replicated Kac--Rice
+integrand \eqref{eq:min.complexity.expanded} with the substitutions of the
+Dirac $\delta$ functions of \eqref{eq:delta.grad}, \eqref{eq:delta.energy}, and
+\eqref{eq:delta.eigen}.
+
+The first step to evaluate this expression is to linearize the dependence on the random functions $V$. This is accomplished by inserting into the integral a Dirac $\delta$ function fixing the value of each function $V^k$ for each replica, or
+\begin{equation}
+ \delta\big(
+ V^k(\pmb\phi_{a\alpha}(1,2))-v_{a\alpha}^k(1,2)
+ \big)
+ =
+ \int\prod_{a\alpha k}d\hat v_{a\alpha}^k\exp\left[
+ i\int d1\,d2\,\hat v_{a\alpha}^k(1,2)
+ \big(V^k(\pmb\phi_{a\alpha}(1,2))-v_{a\alpha}^k(1,2)\big)
+ \right]
+\end{equation}
+where we have introduced auxiliary fields $\hat v$.
\begin{equation}
-\sum_{ab}\sum_{\alpha\gamma}\sum_k\frac12\int d1\,d2\,d3\,d4\,
\hat v_{a\alpha}^kf\big(\pmb\phi_{a\alpha}(1,2)^T\pmb\phi_{b\gamma}(3,4)\big)\hat v_{b\gamma}^k
@@ -841,6 +838,58 @@ We further take a planted replica symmetric structure for the matrix $Q$,
identical to that in \eqref{eq:Q.structure}.
\end{widetext}
+\appendix
+
+\section{Complexity of dominant optima in the least-squares problem}
+\label{sec:dominant.complexity}
+
+Here we share an outline of the derivation of formulas for the complexity of
+dominant optima in the random nonlinear least squares problem of section
+\ref{sec:least.squares}. While in this paper we only treat problems with a
+replica symmetric structure, formulas for the effective action are generic to
+any structure and provide a starting point for analyzing the challenging
+full-RSB setting.
+
+\begin{widetext}
+\begin{equation}
+ \begin{aligned}
+ &\mathcal S
+ =-\frac1n\frac\alpha2\left\{\log\det\left[
+ \hat\beta f(C)+\Big(
+ f'(C)\odot D+(G\odot G-R\odot R)\odot f''(C)
+ \Big)f(C)
+ +(I+R\odot f'(C))^2
+ \right]-\log\det(I+G\odot f'(C))^2\right\} \\
+ &+\frac1n\frac12\Big(\log\det(CD+R^2)-\log\det G^2\Big)
+ +\hat\beta E+(g_d-r_d)\mu
+ \end{aligned}
+\end{equation}
+where $\odot$ gives the Hadamard or componentwise product between the matrices, while other products and powers are matrix products and powers.
+
+\begin{equation}
+ \begin{aligned}
+ &\hat\beta E+\mu(g_d-r_d)+\frac12\log\frac{d_d+r_d^2}{g_d^2} \\
+ &-\frac\alpha2\log\left[
+ 1+\hat\beta\big(f(1)-f(0)\big)
+ +\Big(d_d\big(f(1)-f(0)\big)+r_d\big(2+r_df'(1)\big)\Big)f'(1)
+ +(g_d^2-r_d^2)\big(f(1)-f(0)\big)f''(1)
+ \right] \\
+ &-\alpha f(0)\left(
+ \big(f(1)-f(0)\big)+\frac{1+r_d\big(2+r_df'(1)\big)f'(1)}{\hat\beta+d_df'(1)+(g_d^2-r_d^2)f''(1)}
+ \right)^{-1}
+ \end{aligned}
+\end{equation}
+
+In the case where $\mu$ is not specified, in which the model is supersymmetric, $D=\hat\beta R$ and the effective action becomes particularly simple:
+\begin{equation}
+ \hat\beta E
+ -\frac12\frac{\alpha f(0)}{1+\hat\beta\big(f(1)-f(0)\big)+r_df'(1)}
+ -\frac\alpha2\log\left(1+\frac{\hat\beta\big(f(1)-f(0)\big)}{1+r_df'(1)}\right)
+ +\frac12\log\frac{\hat\beta+r_d}{r_d}
+\end{equation}
+
+\end{widetext}
+
\bibliography{marginal}
\end{document}