From a8c59a6b7e7ac4a26e94b257590f6d6fbcc8dc93 Mon Sep 17 00:00:00 2001
From: Jaron Kent-Dobias <jaron@kent-dobias.com>
Date: Tue, 4 Jun 2024 11:25:17 -0700
Subject: Some writing.

---
 marginal.tex | 147 +++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 98 insertions(+), 49 deletions(-)

diff --git a/marginal.tex b/marginal.tex
index ccfe579..5c3e033 100644
--- a/marginal.tex
+++ b/marginal.tex
@@ -17,14 +17,13 @@
 \affiliation{Istituto Nazionale di Fisica Nucleare, Sezione di Roma I, Rome, Italy 00184}
 
 \begin{abstract}
-  Marginal optima are minima or maxima of a function with many asymptotically
+  Marginal optima are minima or maxima of a function with many nearly
   flat directions. In settings with many competing optima, marginal ones tend
   to attract algorithms and physical dynamics. Often, the important family of
   marginal attractors are a vanishing minority compared with nonmarginal optima
   and other unstable stationary points. We introduce a generic technique for
   conditioning the statistics of stationary points on their marginality, and
-  apply it in three isotropic settings with different typical forms for the
-  Hessian at optima: in the spherical spin-glasses, where the Hessian is GOE;
+  apply it in three isotropic settings with qualitatively different structure: in the spherical spin-glasses, where the Hessian is GOE;
   in a multispherical spin glasses, which are Gaussian but non-GOE; and in a
   model of random nonlinear sum of squares, which is non-Gaussian. In these
   problems we are able to fully characterize the distribution of marginal
@@ -137,6 +136,7 @@ $A$ (or indeed more complicated averages) in order to condition that the
 minimum eigenvalue is zero.
 
 \subsection{Simple example: shifted GOE}
+\label{sec:shifted.GOE}
 
 We demonstrate the efficacy of the technique by rederiving a well-known result:
 the large-deviation function for pulling an eigenvalue from the bulk of the
@@ -330,7 +330,7 @@ configuration. We can therefore choose $\mu=\mu_\textrm m$ such that
 \begin{equation}
   0=\frac\partial{\partial\lambda^*}G_{\lambda^*}(\mu_\mathrm m)\bigg|_{\lambda^*=0}
 \end{equation}
-In the previous problem, this corresponds precisely to $\mu_\mathrm m=2\sigma$,
+In the example problem of section~\ref{sec:shifted.GOE}, this corresponds precisely to $\mu_\mathrm m=2\sigma$,
 the correct marginal shift. Note that when we treat the Dirac $\delta$ function
 using its Fourier representation with auxiliary parameter $\hat\lambda$, as in
 the previous subsection, this condition corresponds with choosing $\mu$ such
@@ -461,11 +461,14 @@ more nontrivial settings.
 The procedure to treat the complexity of the spherical models has been made in
 detail elsewhere \cite{Kent-Dobias_2023_How}. Here we will merely sketch the steps that are standard. We start by translating elements of the Kac--Rice measure into terms more familiar to physicists. This means writing
 \begin{align}
+  \label{eq:delta.grad}
   \delta\big(\nabla H(\mathbf x_a,\pmb\omega_a)\big)
     &=\int\frac{d\hat{\mathbf x}_a}{(2\pi)^N}e^{i\hat{\mathbf x}_a^T\nabla H(\mathbf x_a,\pmb\omega_a)} \\
+    \label{eq:delta.energy}
   \delta\big(NE-H(\mathbf x_a)\big)
     &=\int\frac{d\hat\beta_a}{2\pi}e^{\hat\beta_a(NE-H(\mathbf x_a))} \\
   \delta\big(N\lambda^*-\mathbf s^T\operatorname{Hess}H(\mathbf x_a,\pmb\omega)\mathbf s\big)
+  \label{eq:delta.eigen}
     &=\int\frac{d\hat\lambda_a}{2\pi}e^{\hat\lambda_a(N\lambda^*-\mathbf s^T\operatorname{Hess}H(\mathbf x_a,\pmb\omega)\mathbf s)}
 \end{align}
 for the Dirac $\delta$ functions. At this point we will also discuss an
@@ -657,6 +660,7 @@ $\Omega=S^{N-1}\times S^{N-1}$
 \end{equation}
 
 \subsection{Random nonlinear least squares}
+\label{sec:least.squares}
 
 In this subsection we consider perhaps the simplest example of a non-Gaussian
 landscape: the problem of random nonlinear least squares optimization. Though,
@@ -700,45 +704,17 @@ Applying the Lagrange multiplier method detailed above to enforce the spherical
   \\
   \operatorname{Hess}H(\mathbf x,\omega)=\partial V_k(\mathbf x)\partial V_k(\mathbf x)+V_k(\mathbf x)\partial\partial V_k(\mathbf x)+\omega I
 \end{align}
-\begin{widetext}
-\begin{equation}
-  \begin{aligned}
-    &\mathcal S
-    =-\frac1n\frac\alpha2\left\{\log\det\left[
-      \hat\beta f(C)+\Big(
-        f'(C)\odot D+(G\odot G-R\odot R)\odot f''(C)
-      \Big)f(C)
-      +(I+R\odot f'(C))^2
-    \right]-\log\det(I+G\odot f'(C))^2\right\} \\
-    &+\frac1n\frac12\Big(\log\det(CD+R^2)-\log\det G^2\Big)
-    +\hat\beta E+(g_d-r_d)\mu
-  \end{aligned}
-\end{equation}
-where $\odot$ gives the Hadamard or componentwise product between the matrices, while other products and powers are matrix products and powers.
-
-\begin{equation}
-  \begin{aligned}
-    &\hat\beta E+\mu(g_d-r_d)+\frac12\log\frac{d_d+r_d^2}{g_d^2} \\
-    &-\frac\alpha2\log\left[
-      1+\hat\beta\big(f(1)-f(0)\big)
-      \Big(d_d\big(f(1)-f(0)\big)+r_d\big(2+r_df'(1)\big)\Big)f'(1)
-      +(g_d^2-r_d^2)\big(f(1)-f(0)\big)f''(1)
-    \right] \\
-    &-\alpha f(0)\left(
-      \big(f(1)-f(0)\big)+\frac{1+r_d\big(2+r_df'(1)\big)f'(1)}{\hat\beta+d_df'(1)+(g_d^2-r_d^2)f''(1)}
-    \right)^{-1}
-  \end{aligned}
-\end{equation}
-
-In the case where $\mu$ is not specified, in which the model is supersymmetric, $D=\hat\beta R$ and the effective action becomes particularly simple:
-\begin{equation}
-  \hat\beta e
-  -\frac12\frac{\alpha f(0)}{1+\hat\beta\big(f(1)-f(0)\big)+r_df'(1)}
-  -\frac\alpha2\log\left(1+\frac{\hat\beta\big(f(1)-f(0)\big)}{1+r_df'(1)}\right)
-  +\frac12\log\frac{\hat\beta+r_d}{r_d}
-\end{equation}
-
-\cite{DeWitt_1992_Supermanifolds}
+As in the spherical and multispherical models, fixing the trace of the Hessian
+at largest order in $N$ is equivalent to constraining the value of the Lagrange
+multiplier $\omega=\mu$, since the trace of the random parts of the Hessian
+matrix contributes typical values at a lower order in $N$.
+
+The derivation of the marginal complexity for this model is complicated, but
+can be made schematically like the derivation of the equilibrium free energy
+by use of superspace coordinates \cite{DeWitt_1992_Supermanifolds}.
+The use of superspace coordinates in the geometry and dynamics of disordered
+systems is well-established. Here, we introduce a novel extension of the
+traditional approach to incorporate the marginality condition.
 Consider supervectors in the $\mathbb R^{N|4}$ superspace of the form
 \begin{equation}
   \pmb\phi_{a\alpha}(1,2)
@@ -747,32 +723,53 @@ Consider supervectors in the $\mathbb R^{N|4}$ superspace of the form
   +i\hat{\mathbf x}_a\bar\theta_1\theta_1
   +\mathbf s_{a\alpha}(\bar\theta_1\theta_2+\bar\theta_2\theta_1)
 \end{equation}
-The Kac--Rice measure with the eigenvalue-fixing term included is
+The traditional complexity problem, outlined in
+Appendix~\ref{sec:dominant.complexity}, involves a supervector without the last term.
+\begin{widetext}
+  The replicated number of stationary points conditioned on energy $E$, trace $\mu$, and minimum eigenvalue $\lambda^*$ is then given by
 \begin{equation}
   \begin{aligned}
     \mathcal N(E,\mu,\lambda^*)^n
-    &=\int\prod_{a=1}^n\prod_{\alpha=1}^{m_a}d\pmb\phi_{a\alpha}
+    &=\int\prod_{a=1}^n\lim_{m_a\to0}\prod_{\alpha=1}^{m_a}d\pmb\phi_{a\alpha}
     \exp\left\{
       \delta_{\alpha1}N(\hat\beta_aE+\hat\lambda_a\lambda^*)
       +\int d1\,d2\,B_{a\alpha}(1,2)\left[H(\pmb\phi_{a\alpha})+\frac12\mu(\|\pmb\phi_{a\alpha}\|^2-N)\right]
     \right\}
   \end{aligned}
 \end{equation}
+where we use the compact notation $d1=d\theta_1\,d\bar\theta_1$ for the
+measures associated with the Grassmann directions. Here we have also defined
 \begin{equation}
   B_{a\alpha}(1,2)=\delta_{\alpha1}\bar\theta_2\theta_2
         (1-\hat\beta_a\bar\theta_1\theta_1)
         -\delta_{\alpha1}\hat\lambda_a-\beta
 \end{equation}
+which encodes various aspects of the complexity problem, and the measure
 \begin{align}
   d\pmb\phi_{a\alpha}
   =d\mathbf x_a\,\delta(\|\mathbf x_a\|^2-N)\,\frac{d\hat{\mathbf x}_a}{(2\pi)^N}\,d\pmb\eta_a\,d\bar{\pmb\eta}_a\,
   d\mathbf s_{a\alpha}\,\delta(\|\mathbf s_{a\alpha}\|^2-N)\,
   \delta(\mathbf x_a^T\mathbf s_{a\alpha})
 \end{align}
-
-\begin{equation}
-  i\int d1\,d2\,\hat v_{a\alpha}^k(1,2)(V^k(\pmb\phi_{a\alpha}(1,2))-v_{a\alpha}^k(1,2))
-\end{equation}
+encoding the measures of all the superfield's constituent variables. After
+expanding functions of the superfield in the coordinates $\theta$ and
+performing the integrals, this expression is equivalent to the replicated
+Kac--Rice integrand \eqref{eq:min.complexity.expanded} with the Dirac $\delta$
+functions replaced by the representations \eqref{eq:delta.grad},
+\eqref{eq:delta.energy}, and \eqref{eq:delta.eigen}.
+
+The first step to evaluate this expression is to linearize the dependence on the random functions $V$. This is accomplished by inserting into the integral a Dirac $\delta$ function fixing the value of each function $V^k$ for each replica, or
+\begin{equation}
+  \delta\big(
+    V^k(\pmb\phi_{a\alpha}(1,2))-v_{a\alpha}^k(1,2)
+  \big)
+  =
+  \int\prod_{a\alpha k}d\hat v_{a\alpha}^k\exp\left[
+    i\int d1\,d2\,\hat v_{a\alpha}^k(1,2)
+    \big(V^k(\pmb\phi_{a\alpha}(1,2))-v_{a\alpha}^k(1,2)\big)
+  \right]
+\end{equation}
+where we have introduced auxiliary fields $\hat v$.
 \begin{equation}
   -\sum_{ab}\sum_{\alpha\gamma}\sum_k\frac12\int d1\,d2\,d3\,d4\,
   \hat v_{a\alpha}^kf\big(\pmb\phi_{a\alpha}(1,2)^T\pmb\phi_{b\gamma}(3,4)\big)\hat v_{b\gamma}^k
@@ -841,6 +838,58 @@ We further take a planted replica symmetric structure for the matrix $Q$,
 identical to that in \eqref{eq:Q.structure}.
 \end{widetext}
 
+\appendix
+
+\section{Complexity of dominant optima in the least-squares problem}
+\label{sec:dominant.complexity}
+
+Here we share an outline of the derivation of formulas for the complexity of
+dominant optima in the random nonlinear least squares problem of
+section~\ref{sec:least.squares}. While in this paper we only treat problems
+with a replica symmetric structure, the formulas for the effective action are
+valid for any structure and provide a starting point for analyzing the
+challenging full-RSB setting.
+
+\begin{widetext}
+\begin{equation}
+  \begin{aligned}
+    &\mathcal S
+    =-\frac1n\frac\alpha2\left\{\log\det\left[
+      \hat\beta f(C)+\Big(
+        f'(C)\odot D+(G\odot G-R\odot R)\odot f''(C)
+      \Big)f(C)
+      +(I+R\odot f'(C))^2
+    \right]-\log\det(I+G\odot f'(C))^2\right\} \\
+    &+\frac1n\frac12\Big(\log\det(CD+R^2)-\log\det G^2\Big)
+    +\hat\beta E+(g_d-r_d)\mu
+  \end{aligned}
+\end{equation}
+where $\odot$ gives the Hadamard or componentwise product between the matrices, while other products and powers are matrix products and powers.
+
+\begin{equation}
+  \begin{aligned}
+    &\hat\beta E+\mu(g_d-r_d)+\frac12\log\frac{d_d+r_d^2}{g_d^2} \\
+    &-\frac\alpha2\log\left[
+      1+\hat\beta\big(f(1)-f(0)\big)
+      +\Big(d_d\big(f(1)-f(0)\big)+r_d\big(2+r_df'(1)\big)\Big)f'(1)
+      +(g_d^2-r_d^2)\big(f(1)-f(0)\big)f''(1)
+    \right] \\
+    &-\alpha f(0)\left(
+      \big(f(1)-f(0)\big)+\frac{1+r_d\big(2+r_df'(1)\big)f'(1)}{\hat\beta+d_df'(1)+(g_d^2-r_d^2)f''(1)}
+    \right)^{-1}
+  \end{aligned}
+\end{equation}
+
+In the case where $\mu$ is not specified, in which the model is supersymmetric, $D=\hat\beta R$ and the effective action becomes particularly simple:
+\begin{equation}
+  \hat\beta E
+  -\frac12\frac{\alpha f(0)}{1+\hat\beta\big(f(1)-f(0)\big)+r_df'(1)}
+  -\frac\alpha2\log\left(1+\frac{\hat\beta\big(f(1)-f(0)\big)}{1+r_df'(1)}\right)
+  +\frac12\log\frac{\hat\beta+r_d}{r_d}
+\end{equation}
+
+\end{widetext}
+
 \bibliography{marginal}
 
 \end{document}
-- 
cgit v1.2.3-70-g09d2